From c26c5da86675c813618f70cc5f4babaac215f4bc Mon Sep 17 00:00:00 2001 From: Alberto Invernizzi Date: Mon, 16 Oct 2023 15:27:01 +0200 Subject: [PATCH] Develop: Enable Gemm (local) to be used with MatrixRef (#969) --- include/dlaf/multiplication/general/api.h | 5 + include/dlaf/multiplication/general/impl.h | 9 +- .../test_multiplication_general.cpp | 112 ++++++++++++++---- 3 files changed, 98 insertions(+), 28 deletions(-) diff --git a/include/dlaf/multiplication/general/api.h b/include/dlaf/multiplication/general/api.h index e9f7d558ad..5d3ebe743e 100644 --- a/include/dlaf/multiplication/general/api.h +++ b/include/dlaf/multiplication/general/api.h @@ -14,6 +14,7 @@ #include #include +#include #include #include "dlaf/matrix/matrix_ref.h" @@ -33,6 +34,10 @@ struct GeneralSub { common::Pipeline& col_task_chain, const SizeType i_tile_from, const SizeType i_tile_to, const T alpha, Matrix& mat_a, Matrix& mat_b, const T beta, Matrix& mat_c); + + // Note: internal helper + static void callNN(const blas::Op opA, const blas::Op opB, const T alpha, MatrixRef& mat_a, + MatrixRef& mat_b, const T beta, MatrixRef& mat_c); }; // ETI diff --git a/include/dlaf/multiplication/general/impl.h b/include/dlaf/multiplication/general/impl.h index 1918134028..b03b512ff8 100644 --- a/include/dlaf/multiplication/general/impl.h +++ b/include/dlaf/multiplication/general/impl.h @@ -49,13 +49,12 @@ void GeneralSub::callNN(const SizeType idx_begin, const SizeType idx_en template void GeneralSub::callNN(const blas::Op opA, const blas::Op opB, const T alpha, - dlaf::matrix::internal::MatrixRef& mat_a, - dlaf::matrix::internal::MatrixRef& mat_b, const T beta, - dlaf::matrix::internal::MatrixRef& mat_c) { + MatrixRef& mat_a, MatrixRef& mat_b, + const T beta, MatrixRef& mat_c) { namespace ex = pika::execution::experimental; - for (SizeType j = 0; j < mat_b.nrTiles().cols(); ++j) { - for (SizeType i = 0; i < mat_a.nrTiles().rows(); ++i) { + for (SizeType j = 0; j < mat_c.nrTiles().cols(); ++j) { + for (SizeType i = 0; i < mat_c.nrTiles().rows(); ++i) { for (SizeType k = 0; k < mat_a.nrTiles().cols(); ++k) { ex::start_detached( dlaf::internal::whenAllLift(opA, opB, alpha, mat_a.read(GlobalTileIndex(i, k)), diff --git a/test/unit/multiplication/test_multiplication_general.cpp b/test/unit/multiplication/test_multiplication_general.cpp index ea13870301..11af474531 100644 --- a/test/unit/multiplication/test_multiplication_general.cpp +++ b/test/unit/multiplication/test_multiplication_general.cpp @@ -14,7 +14,9 @@ #include #include #include +#include #include +#include #include #include @@ -30,15 +32,30 @@ using namespace dlaf::matrix; using namespace dlaf::test; template -struct GeneralMultiplicationTestMC : public ::testing::Test {}; +struct GeneralSubMultiplicationTestMC : public ::testing::Test {}; -TYPED_TEST_SUITE(GeneralMultiplicationTestMC, MatrixElementTypes); +TYPED_TEST_SUITE(GeneralSubMultiplicationTestMC, MatrixElementTypes); #ifdef DLAF_WITH_GPU template -struct GeneralMultiplicationTestGPU : public ::testing::Test {}; +struct GeneralSubMultiplicationTestGPU : public ::testing::Test {}; -TYPED_TEST_SUITE(GeneralMultiplicationTestGPU, MatrixElementTypes); +TYPED_TEST_SUITE(GeneralSubMultiplicationTestGPU, MatrixElementTypes); +#endif + +::testing::Environment* const comm_grids_env = + ::testing::AddGlobalTestEnvironment(new CommunicatorGrid6RanksEnvironment); + +template +struct GeneralSubMultiplicationDistTestMC : public TestWithCommGrids {}; + +TYPED_TEST_SUITE(GeneralSubMultiplicationDistTestMC, MatrixElementTypes); + +#ifdef DLAF_WITH_GPU +template +struct GeneralSubMultiplicationDistTestGPU : public TestWithCommGrids {}; + +TYPED_TEST_SUITE(GeneralSubMultiplicationDistTestGPU, MatrixElementTypes); #endif const std::vector> sizes = { @@ -65,8 +82,8 @@ GlobalElementSize globalTestSize(const LocalElementSize& size) { } template -void testGeneralMultiplication(const SizeType a, const SizeType b, const T alpha, const T beta, - const SizeType m, const SizeType mb) { +void testGeneralSubMultiplication(const SizeType a, const SizeType b, const T alpha, const T beta, + const SizeType m, const SizeType mb) { const SizeType a_el = a * mb; const SizeType b_el = std::min(b * mb, m); @@ -97,11 +114,11 @@ void testGeneralMultiplication(const SizeType a, const SizeType b, const T alpha 40 * (mat_ch.size().rows() + 1) * TypeUtilities::error); } -TYPED_TEST(GeneralMultiplicationTestMC, CorrectnessLocal) { +TYPED_TEST(GeneralSubMultiplicationTestMC, CorrectnessLocal) { for (const auto& [m, mb, a, b] : sizes) { const TypeParam alpha = TypeUtilities::element(-1.3, .5); const TypeParam beta = TypeUtilities::element(-2.6, .7); - testGeneralMultiplication(a, b, alpha, beta, m, mb); + testGeneralSubMultiplication(a, b, alpha, beta, m, mb); } } @@ -115,21 +132,6 @@ TYPED_TEST(GeneralMultiplicationTestGPU, CorrectnessLocal) { } #endif -::testing::Environment* const comm_grids_env = - ::testing::AddGlobalTestEnvironment(new CommunicatorGrid6RanksEnvironment); - -template -struct GeneralSubMultiplicationDistTestMC : public TestWithCommGrids {}; - -TYPED_TEST_SUITE(GeneralSubMultiplicationDistTestMC, MatrixElementTypes); - -#ifdef DLAF_WITH_GPU -template -struct GeneralSubMultiplicationDistTestGPU : public TestWithCommGrids {}; - -TYPED_TEST_SUITE(GeneralSubMultiplicationDistTestGPU, MatrixElementTypes); -#endif - template void testGeneralSubMultiplication(comm::CommunicatorGrid grid, const SizeType a, const SizeType b, const T alpha, const T beta, const SizeType m, const SizeType mb) { @@ -192,3 +194,67 @@ TYPED_TEST(GeneralSubMultiplicationDistTestGPU, CorrectnessDistributed) { } } #endif + +template +void testGeneralSubMultiplication(dlaf::matrix::internal::SubMatrixSpec sub_spec, const T alpha, + const T beta, const SizeType m, const SizeType mb) { + using dlaf::matrix::internal::MatrixRef; + + const SizeType a_el = sub_spec.origin.row(); + const SizeType b_el = sub_spec.origin.row() + sub_spec.size.rows(); + + auto [refA, refB, refC, refResult] = + matrix::test::getSubMatrixMatrixMultiplication(a_el, b_el, m, m, m, alpha, beta, blas::Op::NoTrans, + blas::Op::NoTrans); + + auto setMatrix = [&](auto elSetter, const LocalElementSize size, const TileElementSize block_size) { + Matrix matrix(size, block_size); + dlaf::matrix::util::set(matrix, elSetter); + return matrix; + }; + + Matrix mat_ah = setMatrix(refA, {m, m}, {mb, mb}); + Matrix mat_bh = setMatrix(refB, {m, m}, {mb, mb}); + Matrix mat_ch = setMatrix(refC, {m, m}, {mb, mb}); + + { + MatrixMirror mat_a(mat_ah); + MatrixMirror mat_b(mat_bh); + MatrixMirror mat_c(mat_ch); + + MatrixRef mat_sub_a(mat_a.get(), sub_spec); + MatrixRef mat_sub_b(mat_b.get(), sub_spec); + MatrixRef mat_sub_c(mat_c.get(), sub_spec); + + multiplication::internal::GeneralSub::callNN(blas::Op::NoTrans, blas::Op::NoTrans, alpha, + mat_sub_a, mat_sub_b, beta, mat_sub_c); + } + + CHECK_MATRIX_NEAR(refResult, mat_ch, 40 * (mat_ch.size().rows() + 1) * TypeUtilities::error, + 40 * (mat_ch.size().rows() + 1) * TypeUtilities::error); +} + +TYPED_TEST(GeneralSubMultiplicationTestMC, MatrixRefCorrectnessLocal) { + for (const auto& [m, mb, a, b] : sizes) { + const TypeParam alpha = TypeUtilities::element(-1.3, .5); + const TypeParam beta = TypeUtilities::element(-2.6, .7); + + const SizeType a_el = a * mb; + const SizeType b_el = std::min(b * mb, m); + dlaf::matrix::internal::SubMatrixSpec spec{GlobalElementIndex{a_el, a_el}, + GlobalElementSize{b_el - a_el, b_el - a_el}}; + testGeneralSubMultiplication(spec, alpha, beta, m, mb); + } +} + +#ifdef DLAF_WITH_GPU +TYPED_TEST(GeneralMultiplicationTestGPU, MatrixRefCorrectnessLocal) { + for (const auto& [m, mb, a, b] : sizes) { + const SizeType a_el = a * mb; + const SizeType b_el = std::min(b * mb, m); + dlaf::matrix::internal::SubMatrixSpec spec{GlobalElementIndex{a_el, a_el}, + GlobalElementSize{b_el - a_el, b_el - a_el}}; + testGeneralMultiplication(spec, alpha, beta, m, mb); + } +} +#endif