From 38cfd1ede86f147dd06b3fe765f8b2913f8406d4 Mon Sep 17 00:00:00 2001
From: Alberto Invernizzi
Date: Fri, 27 Oct 2023 17:07:24 +0200
Subject: [PATCH 01/16] add wrapper and new more generic implementation

---
 include/dlaf/multiplication/general.h      | 24 +++++++
 include/dlaf/multiplication/general/api.h  |  4 ++
 include/dlaf/multiplication/general/impl.h | 76 ++++++++++++++++++++++
 3 files changed, 104 insertions(+)

diff --git a/include/dlaf/multiplication/general.h b/include/dlaf/multiplication/general.h
index 6eaa3e1f22..40f88355ac 100644
--- a/include/dlaf/multiplication/general.h
+++ b/include/dlaf/multiplication/general.h
@@ -64,6 +64,30 @@ void generalMatrix(const blas::Op opA, const blas::Op opB, const T alpha, Matrix
   DLAF_UNIMPLEMENTED(opA, opB);
 }
 
+template
+void generalMatrix([[maybe_unused]] comm::CommunicatorGrid grid,
+                   common::Pipeline& row_task_chain,
+                   common::Pipeline& col_task_chain, const SizeType a,
+                   const SizeType b, const T alpha, MatrixRef& mat_a,
+                   MatrixRef& mat_b, const T beta, MatrixRef& mat_c) {
+  DLAF_ASSERT(equal_process_grid(mat_a, grid), mat_a, grid);
+  DLAF_ASSERT(equal_process_grid(mat_b, grid), mat_b, grid);
+  DLAF_ASSERT(equal_process_grid(mat_c, grid), mat_c, grid);
+
+  using matrix::multipliable_sizes;
+  DLAF_ASSERT(multipliable_sizes(mat_a.size(), mat_b.size(), mat_c.size()),
+              "Multiplication incompatible matrix sizes.", mat_a.size(), mat_b.size(), mat_c.size());
+  DLAF_ASSERT(multipliable_sizes(mat_a.blockSize(), mat_b.blockSize(), mat_c.blockSize()),
+              "Multiplication incompatible tile sizes.");
+  DLAF_ASSERT(mat_c.size().isEmpty() || multipliable_sizes(mat_a.distribution().tileSize({0, 0}),
+                                                           mat_b.distribution().tileSize({0, 0}),
+                                                           mat_c.distribution().tileSize({0, 0})),
+              "Multiplication incompatible tile sizes in first row/col. "
" + "(Are you using a matrix with offset not aligned with tile?)"); + + internal::General::callNN(row_task_chain, col_task_chain, alpha, mat_a, mat_b, beta, mat_c); +} + /// General sub-matrix multiplication implementation on local memory, computing /// C[a:b][a:b] = alpha * opA(A[a:b][a:b]) * opB(B[a:b][a:b]) + beta * C[a:b][a:b] /// where [a:b] is the range of tiles starting from tile index @p a to tile index @p b (excluded) diff --git a/include/dlaf/multiplication/general/api.h b/include/dlaf/multiplication/general/api.h index 3bb418cac4..058c1ba8de 100644 --- a/include/dlaf/multiplication/general/api.h +++ b/include/dlaf/multiplication/general/api.h @@ -25,6 +25,10 @@ template struct General { static void callNN(const T alpha, MatrixRef& mat_a, MatrixRef& mat_b, const T beta, MatrixRef& mat_c); + static void callNN(common::Pipeline& row_task_chain, + common::Pipeline& col_task_chain, const T alpha, + MatrixRef& mat_a, MatrixRef& mat_b, const T beta, + MatrixRef& mat_c); }; template diff --git a/include/dlaf/multiplication/general/impl.h b/include/dlaf/multiplication/general/impl.h index 34abe4adef..2c6178bdf2 100644 --- a/include/dlaf/multiplication/general/impl.h +++ b/include/dlaf/multiplication/general/impl.h @@ -60,6 +60,82 @@ void General::callNN(const T alpha, MatrixRef& mat_a, Matri } } +template +void General::callNN(common::Pipeline& row_task_chain, + common::Pipeline& col_task_chain, const T alpha, + MatrixRef& mat_a, MatrixRef& mat_b, const T beta, + MatrixRef& mat_c) { + namespace ex = pika::execution::experimental; + + if (mat_c.size().isEmpty()) + return; + + const matrix::Distribution& dist_a = mat_a.distribution(); + const matrix::Distribution& dist_b = mat_b.distribution(); + const matrix::Distribution& dist_c = mat_c.distribution(); + const auto rank = dist_c.rankIndex(); + + constexpr std::size_t n_workspaces = 2; + common::RoundRobin> panelsA(n_workspaces, dist_c); + common::RoundRobin> panelsB(n_workspaces, dist_c); + + DLAF_ASSERT_HEAVY(mat_a.nrTiles().cols() == mat_b.nrTiles().rows(), mat_a.nrTiles(), mat_b.nrTiles()); + + // This loops over the global indices for k, because every rank has to participate in communication + for (SizeType k = 0; k < mat_a.nrTiles().cols(); ++k) { + auto& panelA = panelsA.nextResource(); + auto& panelB = panelsB.nextResource(); + + if (k == 0 || k == mat_a.nrTiles().cols() - 1) { + DLAF_ASSERT_HEAVY(dist_a.tileSize(k) == dist_b.tileSize(k), + dist_a.tileSize(k), dist_b.tileSize(k)); + const SizeType kSize = dist_a.tileSize(k); + panelA.setWidth(kSize); + panelB.setHeight(kSize); + } + + // Setup the column workspace for the root ranks, i.e. the ones in the current col + const auto rank_k_col = dist_a.rankGlobalTile(k); + if (rank_k_col == rank.col()) { + const auto k_local = dist_a.template localTileFromGlobalTile(k); + for (SizeType i = 0; i < dist_c.localNrTiles().rows(); ++i) { + const LocalTileIndex ik(i, k_local); + panelA.setTile(ik, mat_a.read(ik)); + } + } + // Setup the row workspace for the root ranks, i.e. 
+    const auto rank_k_row = dist_b.rankGlobalTile(k);
+    if (rank_k_row == rank.row()) {
+      const auto k_local = dist_b.template localTileFromGlobalTile(k);
+      for (SizeType j = 0; j < dist_c.localNrTiles().cols(); ++j) {
+        const LocalTileIndex kj(k_local, j);
+        panelB.setTile(kj, mat_b.read(kj));
+      }
+    }
+
+    // Broadcast both column and row panel from root to others (row-wise and col-wise, respectively)
+    broadcast(rank_k_col, panelA, row_task_chain);
+    broadcast(rank_k_row, panelB, col_task_chain);
+
+    // This is the core loop where the k step performs the update over the entire local matrix using
+    // the col and row workspaces.
+    // Everything needed for the update is available locally thanks to previous broadcasts.
+    for (SizeType i = 0; i < dist_c.localNrTiles().rows(); ++i) {
+      for (SizeType j = 0; j < dist_c.localNrTiles().cols(); ++j) {
+        const LocalTileIndex ij(i, j);
+
+        ex::start_detached(dlaf::internal::whenAllLift(blas::Op::NoTrans, blas::Op::NoTrans, alpha,
+                                                       panelA.read(ij), panelB.read(ij),
+                                                       k == 0 ? beta : T(1), mat_c.readwrite(ij)) |
+                           tile::gemm(dlaf::internal::Policy()));
+      }
+    }
+
+    panelA.reset();
+    panelB.reset();
+  }
+}
+
 template
 void GeneralSub::callNN(const SizeType idx_begin, const SizeType idx_end, const blas::Op opA,
                         const blas::Op opB, const T alpha, Matrix& mat_a,

From 865d6e783d4924ec485e68a40c172e8dd6c5e85f Mon Sep 17 00:00:00 2001
From: Alberto Invernizzi
Date: Fri, 27 Oct 2023 17:07:57 +0200
Subject: [PATCH 02/16] WIP: workaround for making it work with both Matrix and MatrixRef

---
 include/dlaf/util_matrix.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/dlaf/util_matrix.h b/include/dlaf/util_matrix.h
index 2c07fef1a6..b8bb845bab 100644
--- a/include/dlaf/util_matrix.h
+++ b/include/dlaf/util_matrix.h
@@ -74,8 +74,8 @@ bool local_matrix(const MatrixLike& m) noexcept {
 }
 
 /// Returns true if the matrix is distributed on the communication grid.
-template
-bool equal_process_grid(const Matrix& m, const comm::CommunicatorGrid& g) noexcept {
+template
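
Note on the algorithm in the impl.h hunk of the first patch: each k step broadcasts the k-th column panel of A along process rows and the k-th row panel of B along process columns, after which every rank updates all of its local tiles of C (a stationary-C, SUMMA-like scheme). The serial sketch below, in plain standard C++ with purely illustrative names and no DLA-Future types (assuming row-major storage), shows the same update pattern at element level:

    #include <cstddef>
    #include <vector>

    // Reference of the stationary-C pattern: for every k, the k-th column of A and
    // the k-th row of B update the whole of C; beta is applied only on the first
    // step, mirroring the `k == 0 ? beta : T(1)` factor used in the patch.
    template <class T>
    void reference_gemm(std::size_t m, std::size_t n, std::size_t p, T alpha,
                        const std::vector<T>& A, const std::vector<T>& B, T beta,
                        std::vector<T>& C) {
      for (std::size_t k = 0; k < p; ++k) {      // one "panel" step per k
        for (std::size_t i = 0; i < m; ++i) {    // rows of C (local tiles in the distributed code)
          for (std::size_t j = 0; j < n; ++j) {  // columns of C (local tiles in the distributed code)
            const T prev = (k == 0) ? beta * C[i * n + j] : C[i * n + j];
            C[i * n + j] = prev + alpha * A[i * p + k] * B[k * n + j];
          }
        }
      }
    }

In the distributed version, the i/j loops run only over dist_c.localNrTiles(), and the entries of the A/B panels arrive through the row_task_chain/col_task_chain broadcasts rather than being addressed directly.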