From dd1fc411281a8fd440e49716e612b521c9ec926a Mon Sep 17 00:00:00 2001 From: Alberto Invernizzi Date: Fri, 16 Aug 2024 14:42:48 +0200 Subject: [PATCH] basic test implementation --- .../eigensolver/test_reduction_to_band.cpp | 189 ++++++++++++------ 1 file changed, 133 insertions(+), 56 deletions(-) diff --git a/test/unit/eigensolver/test_reduction_to_band.cpp b/test/unit/eigensolver/test_reduction_to_band.cpp index c7846d0b90..df327dc789 100644 --- a/test/unit/eigensolver/test_reduction_to_band.cpp +++ b/test/unit/eigensolver/test_reduction_to_band.cpp @@ -8,7 +8,9 @@ // SPDX-License-Identifier: BSD-3-Clause // +#include #include +#include #include #include @@ -24,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -496,8 +499,7 @@ struct CAReductionToBandTest : public TestWithCommGrids {}; template using CAReductionToBandTestMC = ReductionToBandTest; -using JustThisType = ::testing::Types; -TYPED_TEST_SUITE(CAReductionToBandTestMC, JustThisType); +TYPED_TEST_SUITE(CAReductionToBandTestMC, MatrixElementTypes); template MatrixLocal allGatherT(Matrix& source, comm::CommunicatorGrid& comm_grid) { @@ -532,46 +534,138 @@ MatrixLocal allGatherT(Matrix& source, comm::Communicat } template -auto checkResult(const SizeType k, const SizeType band_size, Matrix& reference, - const MatrixLocal& mat_b, const MatrixLocal& mat_hh_1st, - const MatrixLocal& taus_1st, const MatrixLocal& mat_hh_2nd, - const std::vector& taus_2nd) { - dlaf::internal::silenceUnusedWarningFor(k, band_size, reference, mat_b); - +auto checkResult(const Distribution dist, const SizeType band_size, + Matrix& reference, const MatrixLocal& mat_b, + const MatrixLocal& mat_hh_1st, const MatrixLocal& taus_1st, + const MatrixLocal& mat_hh_2nd, const std::vector& taus_2nd) { const GlobalElementIndex offset(band_size, 0); // Now that all input are collected locally, it's time to apply the transformation, // ...but just if there is any if (offset.isIn(mat_hh_1st.size())) { - // Reduction to band returns a sequence of transformations applied from left and right to A - // allowing to reduce the matrix A to a band matrix B - // - // Hn* ... H2* H1* A H1 H2 ... Hn - // Q* A Q = B - // - // Applying the inverse of the same transformations, we can go from B to A - // Q B Q* = A - // Q = H1 H2 ... Hn - // H1 H2 ... Hn B Hn* ... H2* H1* - dlaf::common::internal::SingleThreadedBlasScope single; - // apply from left... - // const GlobalElementIndex left_offset = offset; - // const GlobalElementSize left_size{mat_b.size().rows() - band_size, mat_b.size().cols()}; - // lapack::unmqr(lapack::Side::Left, lapack::Op::NoTrans, left_size.rows(), left_size.cols(), k, - // mat_hh_1st.ptr(offset), mat_hh_1st.ld(), taus_1st.data(), mat_b.ptr(left_offset), - // mat_b.ld()); + const SizeType ntiles = mat_b.nrTiles().cols() - 1; + + // Apply in reverse order (blocked algorithm), which means both from last to first, inverting + // intra-step too, i.e. 2nd first and 1st last. + for (SizeType j = ntiles - 1; j >= 0; --j) { + const SizeType i = j + 1; + const SizeType i_el = + dist.template global_element_from_global_tile_and_tile_element(i, 0); + + const std::size_t nranks_with_data = + to_sizet(std::min(mat_b.nrTiles().rows() - i, dist.grid_size().rows())); + + // === 2nd pass + + // prepare workspace (height = max(nranks)) + reorder heads + std::vector col_rank_order(nranks_with_data, -1); + const comm::IndexT_MPI first_rank = dist.template rank_global_tile(i); + std::iota(col_rank_order.begin(), col_rank_order.end(), first_rank); + std::transform(col_rank_order.begin(), col_rank_order.end(), col_rank_order.begin(), + [size = dist.grid_size().rows()](const comm::IndexT_MPI& value) { + return std::modulus{}(value, size); + }); + + // HH2 + // TODO it does not deal with incomplete tiles yet + MatrixLocal hh_2nd({to_SizeType(nranks_with_data) * mat_b.blockSize().rows(), + mat_b.blockSize().cols()}, + mat_b.blockSize()); + + for (SizeType i = 0; i < to_SizeType(col_rank_order.size()); ++i) { + const SizeType ii = to_SizeType(col_rank_order[to_sizet(i)]); + matrix::internal::copy(mat_hh_2nd.tile({ii, j}), hh_2nd.tile({i, 0})); + } - // ... and from right - // const GlobalElementIndex right_offset = common::transposed(left_offset); - // const GlobalElementSize right_size = common::transposed(left_size); + const SizeType nrefls = [&]() { + // TODO workaround (j == ntiles - 1 ? 1 : 0) + return std::min(dist.template tile_size_of(j), + hh_2nd.size().rows() - (j == ntiles - 1 ? 1 : 0)); + }(); + + // T2 + const SizeType j_el = + dist.template global_element_from_global_tile_and_tile_element(j, 0); + MatrixLocal T_2nd({nrefls, nrefls}, mat_b.blockSize()); + lapack::larft(lapack::Direction::Forward, lapack::StoreV::Columnwise, hh_2nd.size().rows(), nrefls, + hh_2nd.ptr(), hh_2nd.ld(), taus_2nd.data() + j_el, T_2nd.ptr(), T_2nd.ld()); + + // Apply HH2 from L and R + lapack::larfb(lapack::Side::Left, lapack::Op::NoTrans, lapack::Direction::Forward, + lapack::StoreV::Columnwise, hh_2nd.size().rows(), mat_b.size().cols() - j_el, nrefls, + hh_2nd.ptr(), hh_2nd.ld(), T_2nd.ptr(), T_2nd.ld(), mat_b.ptr({i_el, j_el}), + mat_b.ld()); + lapack::larfb(lapack::Side::Right, lapack::Op::ConjTrans, lapack::Direction::Forward, + lapack::StoreV::Columnwise, mat_b.size().rows() - j_el, hh_2nd.size().rows(), nrefls, + hh_2nd.ptr(), hh_2nd.ld(), T_2nd.ptr(), T_2nd.ld(), mat_b.ptr({j_el, i_el}), + mat_b.ld()); + + // === 1st pass + + // HH1 (for all ranks) + // prepare workspace (height = local matrix for each rank) with zeros to fill voids + // Note: HH_1st workspaces is stored as a matrix where each column of tiles is for a specific rank. + MatrixLocal hh_1st({dist.size().rows() - i_el, + to_SizeType(nranks_with_data) * dist.tile_size().cols()}, + dist.tile_size()); + + std::size_t col_rank_current = 0; + for (SizeType i_a = i; i_a < dist.nr_tiles().rows(); ++i_a, ++col_rank_current) { + col_rank_current %= col_rank_order.size(); + + const SizeType i_hh = i_a - i; + + for (SizeType j_hh = 0; j_hh < hh_1st.nrTiles().cols(); ++j_hh) { + const auto& tile_hh = hh_1st.tile({i_hh, j_hh}); + + if (j_hh == to_SizeType(col_rank_current)) { + dlaf::matrix::internal::copy(mat_hh_1st.tile({i_a, j}), tile_hh); + } + else { + dlaf::tile::internal::set0(tile_hh); + } + } + } - // lapack::unmqr(lapack::Side::Right, lapack::Op::ConjTrans, right_size.rows(), right_size.cols(), k, - // mat_hh_1st.ptr(offset), mat_hh_1st.ld(), taus_1st.data(), mat_b.ptr(right_offset), - // mat_b.ld()); - } + // Note: well-formed heads + for (SizeType j = 0; j < hh_1st.nrTiles().cols(); ++j) { + const auto& tile_hh = hh_1st.tile({j, j}); + dlaf::tile::internal::laset(blas::Uplo::Upper, T(0), T(1), tile_hh); + } - dlaf::internal::silenceUnusedWarningFor(mat_hh_1st, taus_1st); + // Note: apply one HH1 per time, independently, order not relevant + for (SizeType col_rank = 0; col_rank < to_SizeType(col_rank_order.size()); ++col_rank) { + const SizeType rank = to_SizeType(col_rank_order[to_sizet(col_rank)]); + + // Compute T1 + const auto& tile_taus = taus_1st.tile({j, rank}); + const SizeType nrefls = tile_taus.size().rows(); + const auto& hh_1st_head = hh_1st.tile({col_rank, col_rank}); + + MatrixLocal T_1st({nrefls, nrefls}, mat_b.blockSize()); + lapack::larft(lapack::Direction::Forward, lapack::StoreV::Columnwise, + hh_1st.size().rows() - col_rank * dist.tile_size().rows(), nrefls, + hh_1st_head.ptr(), hh_1st.ld(), tile_taus.ptr(), T_1st.ptr(), T_1st.ld()); + + // Apply HH1 (of a rank) from L and R + { + const SizeType m = dist.size().rows() - (i + col_rank) * dist.tile_size().rows(); + const SizeType n = dist.size().cols() - j * dist.tile_size().cols(); + lapack::larfb(lapack::Side::Left, lapack::Op::NoTrans, lapack::Direction::Forward, + lapack::StoreV::Columnwise, m, n, nrefls, hh_1st_head.ptr(), hh_1st.ld(), + T_1st.ptr(), T_1st.ld(), mat_b.tile({i + col_rank, j}).ptr(), mat_b.ld()); + } + { + const SizeType m = dist.size().rows() - j * dist.tile_size().rows(); + const SizeType n = dist.size().cols() - (i + col_rank) * dist.tile_size().cols(); + lapack::larfb(lapack::Side::Right, lapack::Op::ConjTrans, lapack::Direction::Forward, + lapack::StoreV::Columnwise, m, n, nrefls, hh_1st_head.ptr(), hh_1st.ld(), + T_1st.ptr(), T_1st.ld(), mat_b.tile({j, i + col_rank}).ptr(), mat_b.ld()); + } + } + } + } // Eventually, check the result obtained by applying the inverse transformation equals the original matrix auto result = [&dist = reference.distribution(), @@ -580,12 +674,9 @@ auto checkResult(const SizeType k, const SizeType band_size, Matrix(1, mat_b.size().linear_size()) * TypeUtilities::error); - dlaf::internal::silenceUnusedWarningFor(mat_hh_2nd, taus_2nd); + CHECK_MATRIX_NEAR(result, reference, 0, + std::max(1, mat_b.size().linear_size()) * TypeUtilities::error); } template @@ -595,11 +686,11 @@ void testCAReductionToBand(comm::CommunicatorGrid& grid, const LocalElementSize const SizeType k_reflectors = std::max(SizeType(0), size.rows() - band_size - 1); DLAF_ASSERT(block_size.rows() % band_size == 0, block_size.rows(), band_size); - Distribution distribution({size.rows(), size.cols()}, block_size, grid.size(), grid.rank(), {0, 0}); + const Distribution dist({size.rows(), size.cols()}, block_size, grid.size(), grid.rank(), {0, 0}); // setup the reference input matrix Matrix reference = [&]() { - Matrix reference(distribution); + Matrix reference(dist); if (input_matrix_structure == InputMatrixStructure::banded) // Matrix already in band form, with band smaller than band_size matrix::util::set_random_hermitian_banded(reference, band_size - 1); @@ -608,21 +699,14 @@ void testCAReductionToBand(comm::CommunicatorGrid& grid, const LocalElementSize return reference; }(); - Matrix matrix_a_h(distribution); + Matrix matrix_a_h(dist); copy(reference, matrix_a_h); - print(format::numpy{}, "A", matrix_a_h); - eigensolver::internal::CARed2BandResult red2band_result = [&]() { MatrixMirror matrix_a(matrix_a_h); return eigensolver::internal::ca_reduction_to_band(grid, matrix_a.get(), band_size); }(); - // print(format::numpy{}, "R", matrix_a_h); - // print(format::numpy{}, "taus_1st", red2band_result.taus_1st); - // print(format::numpy{}, "mat_hh_2nd", red2band_result.hh_2nd); - // print(format::numpy{}, "taus_2nd", red2band_result.taus_2nd); - ASSERT_EQ(red2band_result.taus_1st.block_size().rows(), block_size.rows()); ASSERT_EQ(red2band_result.taus_2nd.block_size().rows(), block_size.rows()); @@ -642,17 +726,10 @@ void testCAReductionToBand(comm::CommunicatorGrid& grid, const LocalElementSize auto taus_2nd = allGatherTaus(k_reflectors, red2band_result.taus_2nd, grid); ASSERT_EQ(taus_2nd.size(), k_reflectors); - print(format::numpy{}, "mat_hh_2nd", mat_hh_2nd); - print(format::numpy{}, "mat_taus_1st", taus_1st); - std::cout << "taus_2nd = "; - for (auto tau : taus_2nd) - std::cout << tau << ", "; - std::cout << std::endl; - auto mat_band = makeLocal(matrix_a_h); splitReflectorsAndBand(mat_hh_1st, mat_band, band_size); - checkResult(k_reflectors, band_size, reference, mat_band, mat_hh_1st, taus_1st, mat_hh_2nd, taus_2nd); + checkResult(dist, band_size, reference, mat_band, mat_hh_1st, taus_1st, mat_hh_2nd, taus_2nd); } TYPED_TEST(CAReductionToBandTestMC, CorrectnessDistributed) {