Skip to content

Commit

Permalink
Develop: Enable Gemm (local) to be used with MatrixRef (#969)
Browse files Browse the repository at this point in the history
  • Loading branch information
albestro committed Oct 16, 2023
1 parent cd8da59 commit c26c5da
Show file tree
Hide file tree
Showing 3 changed files with 98 additions and 28 deletions.
5 changes: 5 additions & 0 deletions include/dlaf/multiplication/general/api.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@

#include <dlaf/common/pipeline.h>
#include <dlaf/matrix/matrix.h>
#include <dlaf/matrix/matrix_ref.h>
#include <dlaf/types.h>

#include "dlaf/matrix/matrix_ref.h"
Expand All @@ -33,6 +34,10 @@ struct GeneralSub {
common::Pipeline<comm::Communicator>& col_task_chain, const SizeType i_tile_from,
const SizeType i_tile_to, const T alpha, Matrix<const T, D>& mat_a,
Matrix<const T, D>& mat_b, const T beta, Matrix<T, D>& mat_c);

// Note: internal helper
//
// Overload of callNN working on MatrixRef operands instead of full Matrix objects.
// It receives the two blas::Op flags, the scalars alpha and beta, two read-only
// operands (mat_a, mat_b) and one writable operand (mat_c).
// NOTE(review): presumably computes mat_c = beta * mat_c + alpha * op(mat_a) * op(mat_b)
// on local (non-distributed) data, one tile at a time - confirm against the definition
// in impl.h.
static void callNN(const blas::Op opA, const blas::Op opB, const T alpha, MatrixRef<const T, D>& mat_a,
MatrixRef<const T, D>& mat_b, const T beta, MatrixRef<T, D>& mat_c);
};

// ETI
Expand Down
9 changes: 4 additions & 5 deletions include/dlaf/multiplication/general/impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -49,13 +49,12 @@ void GeneralSub<B, D, T>::callNN(const SizeType idx_begin, const SizeType idx_en

template <Backend B, Device D, class T>
void GeneralSub<B, D, T>::callNN(const blas::Op opA, const blas::Op opB, const T alpha,
dlaf::matrix::internal::MatrixRef<const T, D>& mat_a,
dlaf::matrix::internal::MatrixRef<const T, D>& mat_b, const T beta,
dlaf::matrix::internal::MatrixRef<T, D>& mat_c) {
MatrixRef<const T, D>& mat_a, MatrixRef<const T, D>& mat_b,
const T beta, MatrixRef<T, D>& mat_c) {
namespace ex = pika::execution::experimental;

for (SizeType j = 0; j < mat_b.nrTiles().cols(); ++j) {
for (SizeType i = 0; i < mat_a.nrTiles().rows(); ++i) {
for (SizeType j = 0; j < mat_c.nrTiles().cols(); ++j) {
for (SizeType i = 0; i < mat_c.nrTiles().rows(); ++i) {
for (SizeType k = 0; k < mat_a.nrTiles().cols(); ++k) {
ex::start_detached(
dlaf::internal::whenAllLift(opA, opB, alpha, mat_a.read(GlobalTileIndex(i, k)),
Expand Down
112 changes: 89 additions & 23 deletions test/unit/multiplication/test_multiplication_general.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,9 @@
#include <dlaf/common/assert.h>
#include <dlaf/communication/communicator_grid.h>
#include <dlaf/matrix/index.h>
#include <dlaf/matrix/matrix.h>
#include <dlaf/matrix/matrix_mirror.h>
#include <dlaf/matrix/matrix_ref.h>
#include <dlaf/multiplication/general.h>
#include <dlaf/util_matrix.h>

Expand All @@ -30,15 +32,30 @@ using namespace dlaf::matrix;
using namespace dlaf::test;

template <class T>
struct GeneralMultiplicationTestMC : public ::testing::Test {};
struct GeneralSubMultiplicationTestMC : public ::testing::Test {};

TYPED_TEST_SUITE(GeneralMultiplicationTestMC, MatrixElementTypes);
TYPED_TEST_SUITE(GeneralSubMultiplicationTestMC, MatrixElementTypes);

#ifdef DLAF_WITH_GPU
template <class T>
struct GeneralMultiplicationTestGPU : public ::testing::Test {};
struct GeneralSubMultiplicationTestGPU : public ::testing::Test {};

TYPED_TEST_SUITE(GeneralMultiplicationTestGPU, MatrixElementTypes);
TYPED_TEST_SUITE(GeneralSubMultiplicationTestGPU, MatrixElementTypes);
#endif

::testing::Environment* const comm_grids_env =
::testing::AddGlobalTestEnvironment(new CommunicatorGrid6RanksEnvironment);

template <class T>
struct GeneralSubMultiplicationDistTestMC : public TestWithCommGrids {};

TYPED_TEST_SUITE(GeneralSubMultiplicationDistTestMC, MatrixElementTypes);

#ifdef DLAF_WITH_GPU
template <class T>
struct GeneralSubMultiplicationDistTestGPU : public TestWithCommGrids {};

TYPED_TEST_SUITE(GeneralSubMultiplicationDistTestGPU, MatrixElementTypes);
#endif

const std::vector<std::tuple<SizeType, SizeType, SizeType, SizeType>> sizes = {
Expand All @@ -65,8 +82,8 @@ GlobalElementSize globalTestSize(const LocalElementSize& size) {
}

template <class T, Backend B, Device D>
void testGeneralMultiplication(const SizeType a, const SizeType b, const T alpha, const T beta,
const SizeType m, const SizeType mb) {
void testGeneralSubMultiplication(const SizeType a, const SizeType b, const T alpha, const T beta,
const SizeType m, const SizeType mb) {
const SizeType a_el = a * mb;
const SizeType b_el = std::min(b * mb, m);

Expand Down Expand Up @@ -97,11 +114,11 @@ void testGeneralMultiplication(const SizeType a, const SizeType b, const T alpha
40 * (mat_ch.size().rows() + 1) * TypeUtilities<T>::error);
}

TYPED_TEST(GeneralMultiplicationTestMC, CorrectnessLocal) {
TYPED_TEST(GeneralSubMultiplicationTestMC, CorrectnessLocal) {
for (const auto& [m, mb, a, b] : sizes) {
const TypeParam alpha = TypeUtilities<TypeParam>::element(-1.3, .5);
const TypeParam beta = TypeUtilities<TypeParam>::element(-2.6, .7);
testGeneralMultiplication<TypeParam, Backend::MC, Device::CPU>(a, b, alpha, beta, m, mb);
testGeneralSubMultiplication<TypeParam, Backend::MC, Device::CPU>(a, b, alpha, beta, m, mb);
}
}

Expand All @@ -115,21 +132,6 @@ TYPED_TEST(GeneralMultiplicationTestGPU, CorrectnessLocal) {
}
#endif

::testing::Environment* const comm_grids_env =
::testing::AddGlobalTestEnvironment(new CommunicatorGrid6RanksEnvironment);

template <class T>
struct GeneralSubMultiplicationDistTestMC : public TestWithCommGrids {};

TYPED_TEST_SUITE(GeneralSubMultiplicationDistTestMC, MatrixElementTypes);

#ifdef DLAF_WITH_GPU
template <class T>
struct GeneralSubMultiplicationDistTestGPU : public TestWithCommGrids {};

TYPED_TEST_SUITE(GeneralSubMultiplicationDistTestGPU, MatrixElementTypes);
#endif

template <class T, Backend B, Device D>
void testGeneralSubMultiplication(comm::CommunicatorGrid grid, const SizeType a, const SizeType b,
const T alpha, const T beta, const SizeType m, const SizeType mb) {
Expand Down Expand Up @@ -192,3 +194,67 @@ TYPED_TEST(GeneralSubMultiplicationDistTestGPU, CorrectnessDistributed) {
}
}
#endif

/// Checks the MatrixRef overload of GeneralSub::callNN on local matrices.
///
/// Three m x m host matrices (tiles of mb x mb) are filled from the reference element
/// generators, mirrored to device D, wrapped in MatrixRef views built from @p sub_spec and
/// multiplied with NoTrans/NoTrans. The whole host C matrix is then compared with the
/// reference result.
///
/// @param sub_spec sub-matrix view applied identically to A, B and C; only its row origin
///                 and row size are used to derive the range given to the reference generator
/// @param alpha    scalar forwarded both to callNN and to the reference generator
/// @param beta     scalar forwarded both to callNN and to the reference generator
/// @param m        number of rows and columns of the full matrices
/// @param mb       number of rows and columns of a tile
template <class T, Backend B, Device D>
void testGeneralSubMultiplication(dlaf::matrix::internal::SubMatrixSpec sub_spec, const T alpha,
                                  const T beta, const SizeType m, const SizeType mb) {
  using dlaf::matrix::internal::MatrixRef;

  // Range handed to the reference generator, derived from the rows of the sub-matrix.
  const SizeType range_begin = sub_spec.origin.row();
  const SizeType range_end = sub_spec.origin.row() + sub_spec.size.rows();

  auto [el_a, el_b, el_c, el_expected] =
      matrix::test::getSubMatrixMatrixMultiplication(range_begin, range_end, m, m, m, alpha, beta,
                                                     blas::Op::NoTrans, blas::Op::NoTrans);

  // All the operands share the same full size and the same tile size.
  const LocalElementSize full_size{m, m};
  const TileElementSize tile_size{mb, mb};

  auto makeHostMatrix = [&full_size, &tile_size](auto el_setter) {
    Matrix<T, Device::CPU> host(full_size, tile_size);
    dlaf::matrix::util::set(host, el_setter);
    return host;
  };

  Matrix<const T, Device::CPU> mat_ah = makeHostMatrix(el_a);
  Matrix<const T, Device::CPU> mat_bh = makeHostMatrix(el_b);
  Matrix<T, Device::CPU> mat_ch = makeHostMatrix(el_c);

  // Mirrors and views live in their own scope, so they are destroyed before the host
  // result is inspected below.
  {
    MatrixMirror<const T, D, Device::CPU> mat_a(mat_ah);
    MatrixMirror<const T, D, Device::CPU> mat_b(mat_bh);
    MatrixMirror<T, D, Device::CPU> mat_c(mat_ch);

    MatrixRef<const T, D> mat_sub_a(mat_a.get(), sub_spec);
    MatrixRef<const T, D> mat_sub_b(mat_b.get(), sub_spec);
    MatrixRef<T, D> mat_sub_c(mat_c.get(), sub_spec);

    using GeneralSubAlgo = multiplication::internal::GeneralSub<B, D, T>;
    GeneralSubAlgo::callNN(blas::Op::NoTrans, blas::Op::NoTrans, alpha, mat_sub_a, mat_sub_b, beta,
                           mat_sub_c);
  }

  // The same tolerance is used for both error arguments of the check.
  const auto tolerance = 40 * (mat_ch.size().rows() + 1) * TypeUtilities<T>::error;
  CHECK_MATRIX_NEAR(el_expected, mat_ch, tolerance, tolerance);
}

// Runs the MatrixRef-based local multiplication check on the MC backend for every
// configuration in `sizes`, using a square sub-matrix placed on the diagonal.
TYPED_TEST(GeneralSubMultiplicationTestMC, MatrixRefCorrectnessLocal) {
  using dlaf::matrix::internal::SubMatrixSpec;

  // The scalars do not depend on the configuration, so they are built once.
  const TypeParam alpha = TypeUtilities<TypeParam>::element(-1.3, .5);
  const TypeParam beta = TypeUtilities<TypeParam>::element(-2.6, .7);

  for (const auto& [m, mb, a, b] : sizes) {
    // Tile indices a and b are turned into element indices; the end is clamped to m.
    const SizeType first_el = a * mb;
    const SizeType end_el = std::min(b * mb, m);
    const SizeType extent = end_el - first_el;

    const GlobalElementIndex origin{first_el, first_el};
    const GlobalElementSize size{extent, extent};

    testGeneralSubMultiplication<TypeParam, Backend::MC, Device::CPU>(SubMatrixSpec{origin, size},
                                                                      alpha, beta, m, mb);
  }
}

#ifdef DLAF_WITH_GPU
// GPU counterpart of the MC MatrixRefCorrectnessLocal test: for every configuration in
// `sizes` it builds a square sub-matrix spec on the diagonal and runs the MatrixRef-based
// local multiplication check on the GPU backend.
//
// It is registered in GeneralSubMultiplicationTestGPU (the typed suite declared for the GPU
// build) and calls the SubMatrixSpec overload of testGeneralSubMultiplication, with the same
// alpha/beta values used by the MC test.
TYPED_TEST(GeneralSubMultiplicationTestGPU, MatrixRefCorrectnessLocal) {
  for (const auto& [m, mb, a, b] : sizes) {
    const TypeParam alpha = TypeUtilities<TypeParam>::element(-1.3, .5);
    const TypeParam beta = TypeUtilities<TypeParam>::element(-2.6, .7);

    // Tile indices a and b are turned into element indices; the end is clamped to m.
    const SizeType a_el = a * mb;
    const SizeType b_el = std::min(b * mb, m);
    dlaf::matrix::internal::SubMatrixSpec spec{GlobalElementIndex{a_el, a_el},
                                               GlobalElementSize{b_el - a_el, b_el - a_el}};
    testGeneralSubMultiplication<TypeParam, Backend::GPU, Device::GPU>(spec, alpha, beta, m, mb);
  }
}
#endif

0 comments on commit c26c5da

Please sign in to comment.