From c26c5da86675c813618f70cc5f4babaac215f4bc Mon Sep 17 00:00:00 2001
From: Alberto Invernizzi <alberto.invernizzi@cscs.ch>
Date: Mon, 16 Oct 2023 15:27:01 +0200
Subject: [PATCH] Develop: Enable Gemm (local) to be used with MatrixRef (#969)

---
 include/dlaf/multiplication/general/api.h     |   5 +
 include/dlaf/multiplication/general/impl.h    |   9 +-
 .../test_multiplication_general.cpp           | 112 ++++++++++++++----
 3 files changed, 98 insertions(+), 28 deletions(-)
diff --git a/include/dlaf/multiplication/general/api.h b/include/dlaf/multiplication/general/api.h
index e9f7d558ad..5d3ebe743e 100644
--- a/include/dlaf/multiplication/general/api.h
+++ b/include/dlaf/multiplication/general/api.h
@@ -14,6 +14,7 @@
 
 #include <dlaf/common/pipeline.h>
 #include <dlaf/matrix/matrix.h>
+#include <dlaf/matrix/matrix_ref.h>
 #include <dlaf/types.h>
 
 #include "dlaf/matrix/matrix_ref.h"
@@ -33,6 +34,10 @@ struct GeneralSub {
                      common::Pipeline<comm::Communicator>& col_task_chain, const SizeType i_tile_from,
                      const SizeType i_tile_to, const T alpha, Matrix<const T, D>& mat_a,
                      Matrix<const T, D>& mat_b, const T beta, Matrix<T, D>& mat_c);
+
+  // Note: internal helper (local, no communicators): tile-wise GEMM on MatrixRef sub-matrix views
+  static void callNN(const blas::Op opA, const blas::Op opB, const T alpha, MatrixRef<const T, D>& mat_a,
+                     MatrixRef<const T, D>& mat_b, const T beta, MatrixRef<T, D>& mat_c);
 };
 
 // ETI
diff --git a/include/dlaf/multiplication/general/impl.h b/include/dlaf/multiplication/general/impl.h
index 1918134028..b03b512ff8 100644
--- a/include/dlaf/multiplication/general/impl.h
+++ b/include/dlaf/multiplication/general/impl.h
@@ -49,13 +49,12 @@ void GeneralSub<B, D, T>::callNN(const SizeType idx_begin, const SizeType idx_en
 
 template <Backend B, Device D, class T>
 void GeneralSub<B, D, T>::callNN(const blas::Op opA, const blas::Op opB, const T alpha,
-                                 dlaf::matrix::internal::MatrixRef<const T, D>& mat_a,
-                                 dlaf::matrix::internal::MatrixRef<const T, D>& mat_b, const T beta,
-                                 dlaf::matrix::internal::MatrixRef<T, D>& mat_c) {
+                                 MatrixRef<const T, D>& mat_a, MatrixRef<const T, D>& mat_b,
+                                 const T beta, MatrixRef<T, D>& mat_c) {
   namespace ex = pika::execution::experimental;
 
-  for (SizeType j = 0; j < mat_b.nrTiles().cols(); ++j) {
-    for (SizeType i = 0; i < mat_a.nrTiles().rows(); ++i) {
+  for (SizeType j = 0; j < mat_c.nrTiles().cols(); ++j) {
+    for (SizeType i = 0; i < mat_c.nrTiles().rows(); ++i) {
       for (SizeType k = 0; k < mat_a.nrTiles().cols(); ++k) {
         ex::start_detached(
             dlaf::internal::whenAllLift(opA, opB, alpha, mat_a.read(GlobalTileIndex(i, k)),
diff --git a/test/unit/multiplication/test_multiplication_general.cpp b/test/unit/multiplication/test_multiplication_general.cpp
index ea13870301..11af474531 100644
--- a/test/unit/multiplication/test_multiplication_general.cpp
+++ b/test/unit/multiplication/test_multiplication_general.cpp
@@ -14,7 +14,9 @@
 #include <dlaf/common/assert.h>
 #include <dlaf/communication/communicator_grid.h>
 #include <dlaf/matrix/index.h>
+#include <dlaf/matrix/matrix.h>
 #include <dlaf/matrix/matrix_mirror.h>
+#include <dlaf/matrix/matrix_ref.h>
 #include <dlaf/multiplication/general.h>
 #include <dlaf/util_matrix.h>
 
@@ -30,15 +32,30 @@ using namespace dlaf::matrix;
 using namespace dlaf::test;
 
 template <class T>
-struct GeneralMultiplicationTestMC : public ::testing::Test {};
+struct GeneralSubMultiplicationTestMC : public ::testing::Test {};
 
-TYPED_TEST_SUITE(GeneralMultiplicationTestMC, MatrixElementTypes);
+TYPED_TEST_SUITE(GeneralSubMultiplicationTestMC, MatrixElementTypes);
 
 #ifdef DLAF_WITH_GPU
 template <class T>
-struct GeneralMultiplicationTestGPU : public ::testing::Test {};
+struct GeneralSubMultiplicationTestGPU : public ::testing::Test {};
 
-TYPED_TEST_SUITE(GeneralMultiplicationTestGPU, MatrixElementTypes);
+TYPED_TEST_SUITE(GeneralSubMultiplicationTestGPU, MatrixElementTypes);
+#endif
+
+::testing::Environment* const comm_grids_env =
+    ::testing::AddGlobalTestEnvironment(new CommunicatorGrid6RanksEnvironment);
+
+template <class T>
+struct GeneralSubMultiplicationDistTestMC : public TestWithCommGrids {};
+
+TYPED_TEST_SUITE(GeneralSubMultiplicationDistTestMC, MatrixElementTypes);
+
+#ifdef DLAF_WITH_GPU
+template <class T>
+struct GeneralSubMultiplicationDistTestGPU : public TestWithCommGrids {};
+
+TYPED_TEST_SUITE(GeneralSubMultiplicationDistTestGPU, MatrixElementTypes);
 #endif
 
 const std::vector<std::tuple<SizeType, SizeType, SizeType, SizeType>> sizes = {
@@ -65,8 +82,8 @@ GlobalElementSize globalTestSize(const LocalElementSize& size) {
 }
 
 template <class T, Backend B, Device D>
-void testGeneralMultiplication(const SizeType a, const SizeType b, const T alpha, const T beta,
-                               const SizeType m, const SizeType mb) {
+void testGeneralSubMultiplication(const SizeType a, const SizeType b, const T alpha, const T beta,
+                                  const SizeType m, const SizeType mb) {
   const SizeType a_el = a * mb;
   const SizeType b_el = std::min(b * mb, m);
 
@@ -97,11 +114,11 @@ void testGeneralMultiplication(const SizeType a, const SizeType b, const T alpha
                     40 * (mat_ch.size().rows() + 1) * TypeUtilities<T>::error);
 }
 
-TYPED_TEST(GeneralMultiplicationTestMC, CorrectnessLocal) {
+TYPED_TEST(GeneralSubMultiplicationTestMC, CorrectnessLocal) {
   for (const auto& [m, mb, a, b] : sizes) {
     const TypeParam alpha = TypeUtilities<TypeParam>::element(-1.3, .5);
     const TypeParam beta = TypeUtilities<TypeParam>::element(-2.6, .7);
-    testGeneralMultiplication<TypeParam, Backend::MC, Device::CPU>(a, b, alpha, beta, m, mb);
+    testGeneralSubMultiplication<TypeParam, Backend::MC, Device::CPU>(a, b, alpha, beta, m, mb);
   }
 }
 
@@ -115,21 +132,6 @@ TYPED_TEST(GeneralMultiplicationTestGPU, CorrectnessLocal) {
 }
 #endif
 
-::testing::Environment* const comm_grids_env =
-    ::testing::AddGlobalTestEnvironment(new CommunicatorGrid6RanksEnvironment);
-
-template <class T>
-struct GeneralSubMultiplicationDistTestMC : public TestWithCommGrids {};
-
-TYPED_TEST_SUITE(GeneralSubMultiplicationDistTestMC, MatrixElementTypes);
-
-#ifdef DLAF_WITH_GPU
-template <class T>
-struct GeneralSubMultiplicationDistTestGPU : public TestWithCommGrids {};
-
-TYPED_TEST_SUITE(GeneralSubMultiplicationDistTestGPU, MatrixElementTypes);
-#endif
-
 template <class T, Backend B, Device D>
 void testGeneralSubMultiplication(comm::CommunicatorGrid grid, const SizeType a, const SizeType b,
                                   const T alpha, const T beta, const SizeType m, const SizeType mb) {
@@ -192,3 +194,67 @@ TYPED_TEST(GeneralSubMultiplicationDistTestGPU, CorrectnessDistributed) {
   }
 }
 #endif
+
+template <class T, Backend B, Device D>
+void testGeneralSubMultiplication(dlaf::matrix::internal::SubMatrixSpec sub_spec, const T alpha,
+                                  const T beta, const SizeType m, const SizeType mb) {
+  using dlaf::matrix::internal::MatrixRef;
+  // Checks the local GeneralSub::callNN on MatrixRef views of m x m matrices (mb x mb blocks).
+  const SizeType a_el = sub_spec.origin.row();
+  const SizeType b_el = sub_spec.origin.row() + sub_spec.size.rows();
+  // NOTE(review): only origin.row()/size.rows() are used; assumes a square block on the diagonal.
+  auto [refA, refB, refC, refResult] =
+      matrix::test::getSubMatrixMatrixMultiplication(a_el, b_el, m, m, m, alpha, beta, blas::Op::NoTrans,
+                                                     blas::Op::NoTrans);
+  // Helper: allocate a CPU matrix and fill it element-wise with the given setter.
+  auto setMatrix = [&](auto elSetter, const LocalElementSize size, const TileElementSize block_size) {
+    Matrix<T, Device::CPU> matrix(size, block_size);
+    dlaf::matrix::util::set(matrix, elSetter);
+    return matrix;
+  };
+  // Host-side operands: A and B are const, C is the non-const one checked against refResult below.
+  Matrix<const T, Device::CPU> mat_ah = setMatrix(refA, {m, m}, {mb, mb});
+  Matrix<const T, Device::CPU> mat_bh = setMatrix(refB, {m, m}, {mb, mb});
+  Matrix<T, Device::CPU> mat_ch = setMatrix(refC, {m, m}, {mb, mb});
+  // Scope ends before the check; presumably mirror destruction syncs results into mat_ch (confirm).
+  {
+    MatrixMirror<const T, D, Device::CPU> mat_a(mat_ah);
+    MatrixMirror<const T, D, Device::CPU> mat_b(mat_bh);
+    MatrixMirror<T, D, Device::CPU> mat_c(mat_ch);
+
+    MatrixRef<const T, D> mat_sub_a(mat_a.get(), sub_spec);
+    MatrixRef<const T, D> mat_sub_b(mat_b.get(), sub_spec);
+    MatrixRef<T, D> mat_sub_c(mat_c.get(), sub_spec);
+
+    multiplication::internal::GeneralSub<B, D, T>::callNN(blas::Op::NoTrans, blas::Op::NoTrans, alpha,
+                                                          mat_sub_a, mat_sub_b, beta, mat_sub_c);
+  }
+
+  CHECK_MATRIX_NEAR(refResult, mat_ch, 40 * (mat_ch.size().rows() + 1) * TypeUtilities<T>::error,
+                    40 * (mat_ch.size().rows() + 1) * TypeUtilities<T>::error);
+}
+
+// Shared body of the MatrixRefCorrectnessLocal tests: for every entry of `sizes` it builds the
+// square sub-matrix spec starting at element a * mb and ending at min(b * mb, m), then runs the check.
+template <class T, Backend B, Device D>
+void testMatrixRefCorrectnessLocal() {
+  for (const auto& [m, mb, a, b] : sizes) {
+    const T alpha = TypeUtilities<T>::element(-1.3, .5);
+    const T beta = TypeUtilities<T>::element(-2.6, .7);
+    const SizeType a_el = a * mb;
+    const SizeType b_el = std::min(b * mb, m);
+    dlaf::matrix::internal::SubMatrixSpec spec{GlobalElementIndex{a_el, a_el},
+                                               GlobalElementSize{b_el - a_el, b_el - a_el}};
+    testGeneralSubMultiplication<T, B, D>(spec, alpha, beta, m, mb);
+  }
+}
+
+TYPED_TEST(GeneralSubMultiplicationTestMC, MatrixRefCorrectnessLocal) {
+  testMatrixRefCorrectnessLocal<TypeParam, Backend::MC, Device::CPU>();
+}
+
+#ifdef DLAF_WITH_GPU
+TYPED_TEST(GeneralSubMultiplicationTestGPU, MatrixRefCorrectnessLocal) {
+  testMatrixRefCorrectnessLocal<TypeParam, Backend::GPU, Device::GPU>();
+}
+#endif