diff --git a/master/annotated.html b/master/annotated.html index 810d95cfce..6ea3f91223 100644 --- a/master/annotated.html +++ b/master/annotated.html @@ -257,9 +257,10 @@  CMemoryView  Nmultiplication  Ninternal - CGeneralSub - CHermitian - CTriangular + CGeneral + CGeneralSub + CHermitian + CTriangular  Npermutations  Ninternal  CPermutations diff --git a/master/classes.html b/master/classes.html index f606daa7b5..819bed646d 100644 --- a/master/classes.html +++ b/master/classes.html @@ -87,7 +87,7 @@
FormatShort (dlaf::internal)
G
-
gemmSizes (dlaf::tile::internal)
GenEigensolver (dlaf::eigensolver::internal)
GeneralSub (dlaf::multiplication::internal)
GenToStd (dlaf::eigensolver::internal)
getter_random (dlaf::matrix::util::internal)
getter_random< std::complex< T > > (dlaf::matrix::util::internal)
GivensRotation (dlaf::eigensolver::internal)
+
gemmSizes (dlaf::tile::internal)
GenEigensolver (dlaf::eigensolver::internal)
General (dlaf::multiplication::internal)
GeneralSub (dlaf::multiplication::internal)
GenToStd (dlaf::eigensolver::internal)
getter_random (dlaf::matrix::util::internal)
getter_random< std::complex< T > > (dlaf::matrix::util::internal)
GivensRotation (dlaf::eigensolver::internal)
H
Helpers (dlaf::eigensolver::internal::bt_red_band)
Helpers (dlaf::factorization::internal::tfactor_l)
Helpers< Backend::GPU > (dlaf::eigensolver::internal::bt_red_band)
Helpers< Backend::GPU, Device::GPU, T > (dlaf::factorization::internal::tfactor_l)
Helpers< Backend::MC > (dlaf::eigensolver::internal::bt_red_band)
Helpers< Backend::MC, Device::CPU, T > (dlaf::factorization::internal::tfactor_l)
hemmSizes (dlaf::tile::internal)
her2kSizes (dlaf::tile::internal)
herkSizes (dlaf::tile::internal)
Hermitian (dlaf::multiplication::internal)
HHManager (dlaf::eigensolver::internal::bt_tridiag)
HHManager< Backend::GPU, Device::GPU, T > (dlaf::eigensolver::internal::bt_tridiag)
HHManager< Backend::MC, Device::CPU, T > (dlaf::eigensolver::internal::bt_tridiag)
diff --git a/master/hierarchy.html b/master/hierarchy.html index 69b23f0270..dc1419f580 100644 --- a/master/hierarchy.html +++ b/master/hierarchy.html @@ -153,148 +153,149 @@  Cdlaf::internal::FormatShort< T >  Cdlaf::tile::internal::gemmSizes  Cdlaf::eigensolver::internal::GenEigensolver< backend, device, T > - Cdlaf::multiplication::internal::GeneralSub< B, D, T > - Cdlaf::eigensolver::internal::GenToStd< backend, device, T > - Cdlaf::matrix::util::internal::getter_random< T >Callable that returns random values in the range [-1, 1] - Cdlaf::matrix::util::internal::getter_random< std::complex< T > >Callable that returns random complex numbers whose absolute values are less than 1 - Cdlaf::eigensolver::internal::GivensRotation< T > - Cdlaf::eigensolver::internal::bt_red_band::Helpers< B > - Cdlaf::factorization::internal::tfactor_l::Helpers< backend, device, T > - Cdlaf::eigensolver::internal::bt_red_band::Helpers< Backend::GPU > - Cdlaf::factorization::internal::tfactor_l::Helpers< Backend::GPU, Device::GPU, T > - Cdlaf::eigensolver::internal::bt_red_band::Helpers< Backend::MC > - Cdlaf::factorization::internal::tfactor_l::Helpers< Backend::MC, Device::CPU, T > - Cdlaf::tile::internal::hemmSizes - Cdlaf::tile::internal::her2kSizes - Cdlaf::tile::internal::herkSizes - Cdlaf::multiplication::internal::Hermitian< B, D, T > - Cdlaf::eigensolver::internal::bt_tridiag::HHManager< B, D, T > - Cdlaf::eigensolver::internal::bt_tridiag::HHManager< Backend::GPU, Device::GPU, T > - Cdlaf::eigensolver::internal::bt_tridiag::HHManager< Backend::MC, Device::CPU, T > - Cstd::integral_constant - Cdlaf::internal::IsFalse< T > - Cdlaf::common::is_data< Data > - Cstd::is_floating_point - Cdlaf::internal::IsFloatingPointOrComplex< T > - Cdlaf::internal::IsFloatingPointOrComplex< std::complex< T > > - Cdlaf::common::IterableRange2D< IndexT, Tag >An Iterable representing a 2D range - Cdlaf::common::IteratorRange2D< IndexT, Tag >An Iterator returning indices in column-major order - Cdlaf::matrix::LayoutInfo - Cdlaf::matrix::Matrix< T, D > - Cdlaf::matrix::Matrix< const T, Source > - Cdlaf::matrix::Matrix< const T, SourceTarget > - Cdlaf::matrix::Matrix< T, Source > - Cdlaf::matrix::Matrix< T, SourceTarget > - Cdlaf::matrix::Matrix< T, Target > - Cdlaf::matrix::internal::MatrixBase - Cdlaf::matrix::Matrix< const T, D > - Cdlaf::matrix::internal::MatrixRef< const T, D > - Cdlaf::matrix::MatrixMirror< T, Target, Source > - Cdlaf::matrix::MatrixMirror< const T, SourceTarget, SourceTarget > - Cdlaf::matrix::MatrixMirror< T, SourceTarget, SourceTarget > - Cdlaf::matrix::MatrixMirror< const T, Target, Source > - Cdlaf::matrix::internal::MatrixRef< T, D > - Cdlaf::memory::MemoryChunk< T, D >The class MemoryChunk represents a layer of abstraction over the underlying device memory - Cdlaf::memory::MemoryView< T, D > - Cdlaf::memory::MemoryView< T, Device::CPU > - Cdlaf::comm::Message< Data > - Cdlaf::internal::moveNonConstTile< T > - Cdlaf::comm::mpi_datatype< T >Mapper between language types and basic MPI_Datatype - Cdlaf::comm::mpi_datatype< const T >Helper for mapping also const types - Cdlaf::comm::mpi_init - Cdlaf::comm::internal::MPICallHelper< F > - Cdlaf::auxiliary::internal::Norm< backend, device, T > - Cdlaf::auxiliary::internal::Norm< Backend::MC, Device::CPU, T > - Cdlaf::format::numpy - Cdlaf::matrix::internal::numpy_datatype< T > - Cdlaf::matrix::internal::numpy_datatype< std::complex< T > > - Cdlaf::matrix::Panel< axis, T, D, Storage > - Cdlaf::matrix::Panel< axis, const T, D, StoreTransposed::No > - Cdlaf::matrix::Panel< orthogonal(axis), const T, D, StoreTransposed::No > - Cdlaf::matrix::Panel< axis, const T, D, StoreTransposed::Yes > - Cdlaf::internal::PartialTransformBase< Tag, B, F > - Cdlaf::internal::PartialTransform< Tag, B, F > - Cdlaf::internal::PartialTransformDetach< Tag, B, F > - Cdlaf::comm::internal::PartialTransformMPIBase< F > - Cdlaf::comm::internal::PartialTransformMPI< F > - Cdlaf::comm::internal::PartialTransformMPIDetach< F > - Cdlaf::permutations::internal::Permutations< B, D, T, coord > - Cdlaf::common::Pipeline< T > - Cdlaf::internal::Policy< B > - Cdlaf::internal::PrependPack< Pack, T > - Cdlaf::internal::PrependPack< Pack< Ts... >, T > - Cdlaf::factorization::internal::QR< backend, device, T > - Cdlaf::factorization::internal::QR_Tfactor< backend, device, T > - Cdlaf::eigensolver::internal::ReductionToBand< B, D, T > - Cdlaf::common::RoundRobin< T > - Cdlaf::common::RoundRobin< dlaf::matrix::Panel< Coord::Col, T, Device::CPU > > - Cdlaf::ScopedInitializer - Cdlaf::eigensolver::internal::ScopedSenderWait - Cdlaf::internal::SenderSingleValueTypeImpl< ValueTypes > - Cdlaf::internal::SenderSingleValueTypeImpl< TypeList< TypeList< pika::execution::experimental::async_rw_mutex_access_wrapper< RWType, RType, pika::execution::experimental::async_rw_mutex_access_type::read > > > > - Cdlaf::internal::SenderSingleValueTypeImpl< TypeList< TypeList< pika::execution::experimental::async_rw_mutex_access_wrapper< RWType, RType, pika::execution::experimental::async_rw_mutex_access_type::readwrite > > > > - Cdlaf::internal::SenderSingleValueTypeImpl< TypeList< TypeList< std::reference_wrapper< T > > > > - Cdlaf::internal::SenderSingleValueTypeImpl< TypeList< TypeList< T > > > - Cdlaf::common::internal::SingleThreadedBlasScope - Cdlaf::common::internal::source_location - Cdlaf::matrix::SubDistributionSpecContains information to create a sub-distribution - Cdlaf::matrix::Matrix< const T, D >::SubPipelineTag - Cdlaf::matrix::SubTileSpecContains the information to create a subtile - Cdlaf::eigensolver::internal::SweepWorker< T > - Cdlaf::eigensolver::internal::SweepWorkerDist< T > - Cdlaf::matrix::Tile< T, D > - Cdlaf::matrix::Tile< const T, D > - Cdlaf::eigensolver::internal::bt_tridiag::TileAccessHelper - Cdlaf::eigensolver::internal::TileCollector - Cdlaf::matrix::internal::TileData< T, D > - Cdlaf::matrix::internal::TilePipeline< T, D > - Cdlaf::common::Timer< clock > - Cdlaf::internal::TransformPack< Pack, Transformer > - Cdlaf::internal::TransformPack< Pack< Ts... >, Transformer > - Cdlaf::multiplication::internal::Triangular< backend, device, T > - Cdlaf::solver::internal::Triangular< backend, device, T > - Cdlaf::eigensolver::internal::TridiagResult< T, D > - Cdlaf::eigensolver::internal::TridiagSolver< backend, device, T > - Cdlaf::tile::internal::trmmSizes - Cdlaf::tile::internal::trsmSizes - Cstd::true_type - Cdlaf::common::internal::is_coord< Index2D< T, Tag > > - Cdlaf::common::internal::is_coord< Size2D< T, Tag > > - Cdlaf::internal::Contains< T, T, Ts... > - Cdlaf::TuneParameters - Cstd::conditional::type - Cdlaf::internal::UniqueHelper< Pack< Ts... >, Pack< U, Us... > > + Cdlaf::multiplication::internal::General< B, D, T > + Cdlaf::multiplication::internal::GeneralSub< B, D, T > + Cdlaf::eigensolver::internal::GenToStd< backend, device, T > + Cdlaf::matrix::util::internal::getter_random< T >Callable that returns random values in the range [-1, 1] + Cdlaf::matrix::util::internal::getter_random< std::complex< T > >Callable that returns random complex numbers whose absolute values are less than 1 + Cdlaf::eigensolver::internal::GivensRotation< T > + Cdlaf::eigensolver::internal::bt_red_band::Helpers< B > + Cdlaf::factorization::internal::tfactor_l::Helpers< backend, device, T > + Cdlaf::eigensolver::internal::bt_red_band::Helpers< Backend::GPU > + Cdlaf::factorization::internal::tfactor_l::Helpers< Backend::GPU, Device::GPU, T > + Cdlaf::eigensolver::internal::bt_red_band::Helpers< Backend::MC > + Cdlaf::factorization::internal::tfactor_l::Helpers< Backend::MC, Device::CPU, T > + Cdlaf::tile::internal::hemmSizes + Cdlaf::tile::internal::her2kSizes + Cdlaf::tile::internal::herkSizes + Cdlaf::multiplication::internal::Hermitian< B, D, T > + Cdlaf::eigensolver::internal::bt_tridiag::HHManager< B, D, T > + Cdlaf::eigensolver::internal::bt_tridiag::HHManager< Backend::GPU, Device::GPU, T > + Cdlaf::eigensolver::internal::bt_tridiag::HHManager< Backend::MC, Device::CPU, T > + Cstd::integral_constant + Cdlaf::internal::IsFalse< T > + Cdlaf::common::is_data< Data > + Cstd::is_floating_point + Cdlaf::internal::IsFloatingPointOrComplex< T > + Cdlaf::internal::IsFloatingPointOrComplex< std::complex< T > > + Cdlaf::common::IterableRange2D< IndexT, Tag >An Iterable representing a 2D range + Cdlaf::common::IteratorRange2D< IndexT, Tag >An Iterator returning indices in column-major order + Cdlaf::matrix::LayoutInfo + Cdlaf::matrix::Matrix< T, D > + Cdlaf::matrix::Matrix< const T, Source > + Cdlaf::matrix::Matrix< const T, SourceTarget > + Cdlaf::matrix::Matrix< T, Source > + Cdlaf::matrix::Matrix< T, SourceTarget > + Cdlaf::matrix::Matrix< T, Target > + Cdlaf::matrix::internal::MatrixBase + Cdlaf::matrix::Matrix< const T, D > + Cdlaf::matrix::internal::MatrixRef< const T, D > + Cdlaf::matrix::MatrixMirror< T, Target, Source > + Cdlaf::matrix::MatrixMirror< const T, SourceTarget, SourceTarget > + Cdlaf::matrix::MatrixMirror< T, SourceTarget, SourceTarget > + Cdlaf::matrix::MatrixMirror< const T, Target, Source > + Cdlaf::matrix::internal::MatrixRef< T, D > + Cdlaf::memory::MemoryChunk< T, D >The class MemoryChunk represents a layer of abstraction over the underlying device memory + Cdlaf::memory::MemoryView< T, D > + Cdlaf::memory::MemoryView< T, Device::CPU > + Cdlaf::comm::Message< Data > + Cdlaf::internal::moveNonConstTile< T > + Cdlaf::comm::mpi_datatype< T >Mapper between language types and basic MPI_Datatype + Cdlaf::comm::mpi_datatype< const T >Helper for mapping also const types + Cdlaf::comm::mpi_init + Cdlaf::comm::internal::MPICallHelper< F > + Cdlaf::auxiliary::internal::Norm< backend, device, T > + Cdlaf::auxiliary::internal::Norm< Backend::MC, Device::CPU, T > + Cdlaf::format::numpy + Cdlaf::matrix::internal::numpy_datatype< T > + Cdlaf::matrix::internal::numpy_datatype< std::complex< T > > + Cdlaf::matrix::Panel< axis, T, D, Storage > + Cdlaf::matrix::Panel< axis, const T, D, StoreTransposed::No > + Cdlaf::matrix::Panel< orthogonal(axis), const T, D, StoreTransposed::No > + Cdlaf::matrix::Panel< axis, const T, D, StoreTransposed::Yes > + Cdlaf::internal::PartialTransformBase< Tag, B, F > + Cdlaf::internal::PartialTransform< Tag, B, F > + Cdlaf::internal::PartialTransformDetach< Tag, B, F > + Cdlaf::comm::internal::PartialTransformMPIBase< F > + Cdlaf::comm::internal::PartialTransformMPI< F > + Cdlaf::comm::internal::PartialTransformMPIDetach< F > + Cdlaf::permutations::internal::Permutations< B, D, T, coord > + Cdlaf::common::Pipeline< T > + Cdlaf::internal::Policy< B > + Cdlaf::internal::PrependPack< Pack, T > + Cdlaf::internal::PrependPack< Pack< Ts... >, T > + Cdlaf::factorization::internal::QR< backend, device, T > + Cdlaf::factorization::internal::QR_Tfactor< backend, device, T > + Cdlaf::eigensolver::internal::ReductionToBand< B, D, T > + Cdlaf::common::RoundRobin< T > + Cdlaf::common::RoundRobin< dlaf::matrix::Panel< Coord::Col, T, Device::CPU > > + Cdlaf::ScopedInitializer + Cdlaf::eigensolver::internal::ScopedSenderWait + Cdlaf::internal::SenderSingleValueTypeImpl< ValueTypes > + Cdlaf::internal::SenderSingleValueTypeImpl< TypeList< TypeList< pika::execution::experimental::async_rw_mutex_access_wrapper< RWType, RType, pika::execution::experimental::async_rw_mutex_access_type::read > > > > + Cdlaf::internal::SenderSingleValueTypeImpl< TypeList< TypeList< pika::execution::experimental::async_rw_mutex_access_wrapper< RWType, RType, pika::execution::experimental::async_rw_mutex_access_type::readwrite > > > > + Cdlaf::internal::SenderSingleValueTypeImpl< TypeList< TypeList< std::reference_wrapper< T > > > > + Cdlaf::internal::SenderSingleValueTypeImpl< TypeList< TypeList< T > > > + Cdlaf::common::internal::SingleThreadedBlasScope + Cdlaf::common::internal::source_location + Cdlaf::matrix::SubDistributionSpecContains information to create a sub-distribution + Cdlaf::matrix::Matrix< const T, D >::SubPipelineTag + Cdlaf::matrix::SubTileSpecContains the information to create a subtile + Cdlaf::eigensolver::internal::SweepWorker< T > + Cdlaf::eigensolver::internal::SweepWorkerDist< T > + Cdlaf::matrix::Tile< T, D > + Cdlaf::matrix::Tile< const T, D > + Cdlaf::eigensolver::internal::bt_tridiag::TileAccessHelper + Cdlaf::eigensolver::internal::TileCollector + Cdlaf::matrix::internal::TileData< T, D > + Cdlaf::matrix::internal::TilePipeline< T, D > + Cdlaf::common::Timer< clock > + Cdlaf::internal::TransformPack< Pack, Transformer > + Cdlaf::internal::TransformPack< Pack< Ts... >, Transformer > + Cdlaf::multiplication::internal::Triangular< backend, device, T > + Cdlaf::solver::internal::Triangular< backend, device, T > + Cdlaf::eigensolver::internal::TridiagResult< T, D > + Cdlaf::eigensolver::internal::TridiagSolver< backend, device, T > + Cdlaf::tile::internal::trmmSizes + Cdlaf::tile::internal::trsmSizes + Cstd::true_type + Cdlaf::common::internal::is_coord< Index2D< T, Tag > > + Cdlaf::common::internal::is_coord< Size2D< T, Tag > > + Cdlaf::internal::Contains< T, T, Ts... > + Cdlaf::TuneParameters  Cstd::conditional::type - Cdlaf::internal::UniquePackHelper< Pack< Ts... >, Pack< U, Us... > > - Cdlaf::comm::internal::type_handler< T > - Cdlaf::TypeInfo< T > - Cdlaf::TypeInfo< const T > - Cdlaf::TypeInfo< std::complex< T > > - Cdlaf::internal::TypeList<... > - Cdlaf::internal::Unique< Pack > - Cdlaf::internal::UniqueHelper< PackUnique, PackRest > - Cdlaf::internal::UniqueHelper< Pack< Ts... >, Pack<> > - Cdlaf::internal::UniqueHelper< Pack<>, Pack< Ts... > > - Cdlaf::internal::Unique< Pack< Ts... > > - Cdlaf::internal::UniquePack< Pack > - Cdlaf::internal::UniquePackHelper< PackUnique, PackRest > - Cdlaf::internal::UniquePackHelper< Pack< Ts... >, Pack<> > - Cdlaf::internal::UniquePackHelper< Pack<>, Pack< Ts... > > - Cdlaf::internal::UniquePack< Pack< Ts... > > - Cdlaf::common::internal::Unwrapper< T > - Cdlaf::common::internal::Unwrapper< pika::execution::experimental::async_rw_mutex_access_wrapper< T1, T2, at > > - Cdlaf::common::internal::Unwrapper< std::reference_wrapper< T > > - Cdlaf::common::internal::Unwrapping< F > - Cdlaf::eigensolver::internal::VAccessHelper - Cstd::vector - Cdlaf::common::internal::vector< ReadOnlySenderType > - Cdlaf::common::internal::vector< T > - Cdlaf::matrix::internal::View - Cdlaf::matrix::SubMatrixView - Cdlaf::matrix::SubPanelView - Cdlaf::eigensolver::internal::WorkSpace< T, D > - Cdlaf::eigensolver::internal::WorkSpaceHost< T > - Cdlaf::eigensolver::internal::WorkSpaceHostMirror< T, D > + Cdlaf::internal::UniqueHelper< Pack< Ts... >, Pack< U, Us... > > + Cstd::conditional::type + Cdlaf::internal::UniquePackHelper< Pack< Ts... >, Pack< U, Us... > > + Cdlaf::comm::internal::type_handler< T > + Cdlaf::TypeInfo< T > + Cdlaf::TypeInfo< const T > + Cdlaf::TypeInfo< std::complex< T > > + Cdlaf::internal::TypeList<... > + Cdlaf::internal::Unique< Pack > + Cdlaf::internal::UniqueHelper< PackUnique, PackRest > + Cdlaf::internal::UniqueHelper< Pack< Ts... >, Pack<> > + Cdlaf::internal::UniqueHelper< Pack<>, Pack< Ts... > > + Cdlaf::internal::Unique< Pack< Ts... > > + Cdlaf::internal::UniquePack< Pack > + Cdlaf::internal::UniquePackHelper< PackUnique, PackRest > + Cdlaf::internal::UniquePackHelper< Pack< Ts... >, Pack<> > + Cdlaf::internal::UniquePackHelper< Pack<>, Pack< Ts... > > + Cdlaf::internal::UniquePack< Pack< Ts... > > + Cdlaf::common::internal::Unwrapper< T > + Cdlaf::common::internal::Unwrapper< pika::execution::experimental::async_rw_mutex_access_wrapper< T1, T2, at > > + Cdlaf::common::internal::Unwrapper< std::reference_wrapper< T > > + Cdlaf::common::internal::Unwrapping< F > + Cdlaf::eigensolver::internal::VAccessHelper + Cstd::vector + Cdlaf::common::internal::vector< ReadOnlySenderType > + Cdlaf::common::internal::vector< T > + Cdlaf::matrix::internal::View + Cdlaf::matrix::SubMatrixView + Cdlaf::matrix::SubPanelView + Cdlaf::eigensolver::internal::WorkSpace< T, D > + Cdlaf::eigensolver::internal::WorkSpaceHost< T > + Cdlaf::eigensolver::internal::WorkSpaceHostMirror< T, D > diff --git a/master/matrix__base_8h_source.html b/master/matrix__base_8h_source.html index 8b1de2b19c..3f394467a3 100644 --- a/master/matrix__base_8h_source.html +++ b/master/matrix__base_8h_source.html @@ -223,17 +223,19 @@
168  << ", tile_size=" << matrix.tile_size()
169  << ", tiles_grid=" << matrix.nr_tiles()
170  << ", rank_index=" << matrix.rank_index()
-
171  << ", comm_grid=" << matrix.grid_size();
-
172  // clang-format on
-
173  }
-
174 
-
175 private:
-
176  Distribution distribution_;
-
177 };
-
178 
-
179 }
-
180 }
+
171  << ", comm_grid=" << matrix.grid_size()
+
172  << ", src_rank=" << matrix.distribution().source_rank_index()
+
173  << ", offset=" << matrix.distribution().offset();
+
174  // clang-format on
+
175  }
+
176 
+
177 private:
+
178  Distribution distribution_;
+
179 };
+
180 
181 }
+
182 }
+
183 }
dlaf::common::Index2D< IndexT_MPI, TAG_MPI >
dlaf::common::Size2D< SizeType, matrix::LocalTile_TAG >
dlaf::matrix::Distribution
Definition: distribution.h:111
diff --git a/master/multiplication_2general_2api_8h_source.html b/master/multiplication_2general_2api_8h_source.html index dbbf98b32e..9340329d79 100644 --- a/master/multiplication_2general_2api_8h_source.html +++ b/master/multiplication_2general_2api_8h_source.html @@ -85,44 +85,55 @@
14 
15 #include <dlaf/common/pipeline.h>
16 #include <dlaf/matrix/matrix.h>
-
17 #include <dlaf/types.h>
-
18 
-
19 namespace dlaf::multiplication {
-
20 namespace internal {
-
21 
-
22 template <Backend B, Device D, class T>
-
23 struct GeneralSub {
-
24  static void callNN(const SizeType i_tile_from, const SizeType i_tile_to, const blas::Op opA,
-
25  const blas::Op opB, const T alpha, Matrix<const T, D>& mat_a,
-
26  Matrix<const T, D>& mat_b, const T beta, Matrix<T, D>& mat_c);
-
27  static void callNN(common::Pipeline<comm::Communicator>& row_task_chain,
-
28  common::Pipeline<comm::Communicator>& col_task_chain, const SizeType i_tile_from,
-
29  const SizeType i_tile_to, const T alpha, Matrix<const T, D>& mat_a,
-
30  Matrix<const T, D>& mat_b, const T beta, Matrix<T, D>& mat_c);
-
31 };
-
32 
-
33 // ETI
-
34 #define DLAF_MULTIPLICATION_GENERAL_ETI(KWORD, BACKEND, DEVICE, DATATYPE) \
-
35  KWORD template struct GeneralSub<BACKEND, DEVICE, DATATYPE>;
-
36 
-
37 DLAF_MULTIPLICATION_GENERAL_ETI(extern, Backend::MC, Device::CPU, float)
-
38 DLAF_MULTIPLICATION_GENERAL_ETI(extern, Backend::MC, Device::CPU, double)
-
39 DLAF_MULTIPLICATION_GENERAL_ETI(extern, Backend::MC, Device::CPU, std::complex<float>)
-
40 DLAF_MULTIPLICATION_GENERAL_ETI(extern, Backend::MC, Device::CPU, std::complex<double>)
-
41 
-
42 #ifdef DLAF_WITH_GPU
-
43 DLAF_MULTIPLICATION_GENERAL_ETI(extern, Backend::GPU, Device::GPU, float)
-
44 DLAF_MULTIPLICATION_GENERAL_ETI(extern, Backend::GPU, Device::GPU, double)
-
45 DLAF_MULTIPLICATION_GENERAL_ETI(extern, Backend::GPU, Device::GPU, std::complex<float>)
-
46 DLAF_MULTIPLICATION_GENERAL_ETI(extern, Backend::GPU, Device::GPU, std::complex<double>)
-
47 #endif
-
48 
-
49 }
-
50 }
+
17 #include <dlaf/matrix/matrix_ref.h>
+
18 #include <dlaf/types.h>
+
19 
+
20 namespace dlaf::multiplication {
+
21 namespace internal {
+
22 using dlaf::matrix::internal::MatrixRef;
+
23 
+
24 template <Backend B, Device D, class T>
+
25 struct General {
+
26  static void callNN(const T alpha, MatrixRef<const T, D>& mat_a, MatrixRef<const T, D>& mat_b,
+
27  const T beta, MatrixRef<T, D>& mat_c);
+
28 };
+
29 
+
30 template <Backend B, Device D, class T>
+
31 struct GeneralSub {
+
32  static void callNN(const SizeType i_tile_from, const SizeType i_tile_to, const blas::Op opA,
+
33  const blas::Op opB, const T alpha, Matrix<const T, D>& mat_a,
+
34  Matrix<const T, D>& mat_b, const T beta, Matrix<T, D>& mat_c);
+
35  static void callNN(common::Pipeline<comm::Communicator>& row_task_chain,
+
36  common::Pipeline<comm::Communicator>& col_task_chain, const SizeType i_tile_from,
+
37  const SizeType i_tile_to, const T alpha, Matrix<const T, D>& mat_a,
+
38  Matrix<const T, D>& mat_b, const T beta, Matrix<T, D>& mat_c);
+
39 };
+
40 
+
41 #define DLAF_MULTIPLICATION_GENERAL_ETI(KWORD, BACKEND, DEVICE, DATATYPE) \
+
42  KWORD template struct General<BACKEND, DEVICE, DATATYPE>; \
+
43  KWORD template struct GeneralSub<BACKEND, DEVICE, DATATYPE>;
+
44 
+
45 DLAF_MULTIPLICATION_GENERAL_ETI(extern, Backend::MC, Device::CPU, float)
+
46 DLAF_MULTIPLICATION_GENERAL_ETI(extern, Backend::MC, Device::CPU, double)
+
47 DLAF_MULTIPLICATION_GENERAL_ETI(extern, Backend::MC, Device::CPU, std::complex<float>)
+
48 DLAF_MULTIPLICATION_GENERAL_ETI(extern, Backend::MC, Device::CPU, std::complex<double>)
+
49 
+
50 #ifdef DLAF_WITH_GPU
+
51 DLAF_MULTIPLICATION_GENERAL_ETI(extern, Backend::GPU, Device::GPU, float)
+
52 DLAF_MULTIPLICATION_GENERAL_ETI(extern, Backend::GPU, Device::GPU, double)
+
53 DLAF_MULTIPLICATION_GENERAL_ETI(extern, Backend::GPU, Device::GPU, std::complex<float>)
+
54 DLAF_MULTIPLICATION_GENERAL_ETI(extern, Backend::GPU, Device::GPU, std::complex<double>)
+
55 #endif
+
56 
+
57 }
+
58 }
dlaf::common::Pipeline
Definition: pipeline.h:31
+
dlaf::matrix::internal::MatrixRef
Definition: matrix_ref.h:108
matrix.h
+
matrix_ref.h
pipeline.h
-
dlaf::multiplication::internal::GeneralSub
Definition: api.h:23
+
dlaf::multiplication::internal::General
Definition: api.h:25
+
dlaf::multiplication::internal::GeneralSub
Definition: api.h:31
types.h
diff --git a/master/multiplication_2general_2impl_8h_source.html b/master/multiplication_2general_2impl_8h_source.html index 6589daf26c..ae55ee4c7f 100644 --- a/master/multiplication_2general_2impl_8h_source.html +++ b/master/multiplication_2general_2impl_8h_source.html @@ -82,164 +82,198 @@
11 #pragma once
12 
13 #include <dlaf/blas/tile.h>
-
14 #include <dlaf/common/assert.h>
-
15 #include <dlaf/common/index2d.h>
-
16 #include <dlaf/common/pipeline.h>
-
17 #include <dlaf/common/round_robin.h>
-
18 #include <dlaf/communication/broadcast_panel.h>
-
19 #include <dlaf/communication/communicator_grid.h>
-
20 #include <dlaf/matrix/distribution.h>
-
21 #include <dlaf/matrix/index.h>
-
22 #include <dlaf/matrix/panel.h>
-
23 #include <dlaf/multiplication/general/api.h>
-
24 #include <dlaf/sender/when_all_lift.h>
-
25 
-
26 namespace dlaf::multiplication {
-
27 namespace internal {
-
28 
-
29 template <Backend B, Device D, class T>
-
30 void GeneralSub<B, D, T>::callNN(const SizeType idx_begin, const SizeType idx_end, const blas::Op opA,
-
31  const blas::Op opB, const T alpha, Matrix<const T, D>& mat_a,
-
32  Matrix<const T, D>& mat_b, const T beta, Matrix<T, D>& mat_c) {
-
33  namespace ex = pika::execution::experimental;
-
34 
-
35  for (SizeType j = idx_begin; j < idx_end; ++j) {
-
36  for (SizeType i = idx_begin; i < idx_end; ++i) {
-
37  for (SizeType k = idx_begin; k < idx_end; ++k) {
-
38  ex::start_detached(
-
39  dlaf::internal::whenAllLift(opA, opB, alpha, mat_a.read(GlobalTileIndex(i, k)),
-
40  mat_b.read(GlobalTileIndex(k, j)), k == idx_begin ? beta : T(1),
-
41  mat_c.readwrite(GlobalTileIndex(i, j))) |
-
42  tile::gemm(dlaf::internal::Policy<B>()));
-
43  }
-
44  }
-
45  }
-
46 }
-
47 
-
48 // This implementation is based on
-
49 //
-
50 // Van De Geijn, Robert A., and Jerrell Watts.
-
51 // SUMMA: Scalable universal matrix multiplication algorithm.
-
52 // Concurrency: Practice and Experience 9.4 (1997): 255-274
-
53 template <Backend B, Device D, class T>
-
54 void GeneralSub<B, D, T>::callNN(common::Pipeline<comm::Communicator>& row_task_chain,
-
55  common::Pipeline<comm::Communicator>& col_task_chain,
-
56  const SizeType idx_begin, const SizeType idx_end, const T alpha,
-
57  Matrix<const T, D>& mat_a, Matrix<const T, D>& mat_b, const T beta,
-
58  Matrix<T, D>& mat_c) {
-
59  namespace ex = pika::execution::experimental;
-
60 
-
61  if (idx_begin == idx_end)
-
62  return;
-
63 
-
64  const auto& dist_a = mat_a.distribution();
-
65  const auto rank = dist_a.rankIndex();
-
66 
-
67  // which rank has the last tile involved
-
68  const bool rankHasLastRow = rank.row() == dist_a.template rankGlobalTile<Coord::Row>(idx_end - 1);
-
69  const bool rankHasLastCol = rank.col() == dist_a.template rankGlobalTile<Coord::Col>(idx_end - 1);
-
70 
-
71  // translate from global to local indices
-
72  const SizeType i_beg = dist_a.template nextLocalTileFromGlobalTile<Coord::Row>(idx_begin);
-
73  const SizeType i_end = dist_a.template nextLocalTileFromGlobalTile<Coord::Row>(idx_end);
-
74 
-
75  const SizeType j_beg = dist_a.template nextLocalTileFromGlobalTile<Coord::Col>(idx_begin);
-
76  const SizeType j_end = dist_a.template nextLocalTileFromGlobalTile<Coord::Col>(idx_end);
-
77 
-
78  const SizeType mb = dist_a.blockSize().rows();
-
79  const SizeType lastTileElement = std::min(idx_end * mb - 1, dist_a.size().rows() - 1);
-
80  const SizeType nrefls = lastTileElement - idx_begin * mb + 1;
+
14 #include <dlaf/blas/tile_extensions.h>
+
15 #include <dlaf/common/assert.h>
+
16 #include <dlaf/common/index2d.h>
+
17 #include <dlaf/common/pipeline.h>
+
18 #include <dlaf/common/round_robin.h>
+
19 #include <dlaf/communication/broadcast_panel.h>
+
20 #include <dlaf/communication/communicator_grid.h>
+
21 #include <dlaf/matrix/distribution.h>
+
22 #include <dlaf/matrix/index.h>
+
23 #include <dlaf/matrix/matrix.h>
+
24 #include <dlaf/matrix/matrix_ref.h>
+
25 #include <dlaf/matrix/panel.h>
+
26 #include <dlaf/multiplication/general/api.h>
+
27 #include <dlaf/sender/when_all_lift.h>
+
28 #include <dlaf/util_matrix.h>
+
29 
+
30 namespace dlaf::multiplication {
+
31 namespace internal {
+
32 
+
33 template <Backend B, Device D, class T>
+
34 void General<B, D, T>::callNN(const T alpha, MatrixRef<const T, D>& mat_a, MatrixRef<const T, D>& mat_b,
+
35  const T beta, MatrixRef<T, D>& mat_c) {
+
36  namespace ex = pika::execution::experimental;
+
37 
+
38  if (mat_a.nr_tiles().cols() == 0) {
+
39  // Note: if beta == 1, we optimize by not even scheduling anything
+
40  if (beta != T(1)) {
+
41  for (SizeType j = 0; j < mat_c.distribution().local_nr_tiles().cols(); ++j)
+
42  for (SizeType i = 0; i < mat_c.distribution().local_nr_tiles().rows(); ++i)
+
43  ex::start_detached(dlaf::internal::whenAllLift(beta, mat_c.readwrite(LocalTileIndex(i, j))) |
+
44  tile::scal(dlaf::internal::Policy<B>()));
+
45  }
+
46  return;
+
47  }
+
48 
+
49  for (SizeType j = 0; j < mat_c.distribution().local_nr_tiles().cols(); ++j) {
+
50  for (SizeType i = 0; i < mat_c.distribution().local_nr_tiles().rows(); ++i) {
+
51  for (SizeType k = 0; k < mat_a.distribution().local_nr_tiles().cols(); ++k) {
+
52  ex::start_detached(dlaf::internal::whenAllLift(blas::Op::NoTrans, blas::Op::NoTrans, alpha,
+
53  mat_a.read(LocalTileIndex(i, k)),
+
54  mat_b.read(LocalTileIndex(k, j)),
+
55  k == 0 ? beta : T(1),
+
56  mat_c.readwrite(LocalTileIndex(i, j))) |
+
57  tile::gemm(dlaf::internal::Policy<B>()));
+
58  }
+
59  }
+
60  }
+
61 }
+
62 
+
63 template <Backend B, Device D, class T>
+
64 void GeneralSub<B, D, T>::callNN(const SizeType idx_begin, const SizeType idx_end, const blas::Op opA,
+
65  const blas::Op opB, const T alpha, Matrix<const T, D>& mat_a,
+
66  Matrix<const T, D>& mat_b, const T beta, Matrix<T, D>& mat_c) {
+
67  namespace ex = pika::execution::experimental;
+
68 
+
69  for (SizeType j = idx_begin; j < idx_end; ++j) {
+
70  for (SizeType i = idx_begin; i < idx_end; ++i) {
+
71  for (SizeType k = idx_begin; k < idx_end; ++k) {
+
72  ex::start_detached(
+
73  dlaf::internal::whenAllLift(opA, opB, alpha, mat_a.read(GlobalTileIndex(i, k)),
+
74  mat_b.read(GlobalTileIndex(k, j)), k == idx_begin ? beta : T(1),
+
75  mat_c.readwrite(GlobalTileIndex(i, j))) |
+
76  tile::gemm(dlaf::internal::Policy<B>()));
+
77  }
+
78  }
+
79  }
+
80 }
81 
-
82  // Note: if last tile is incomplete, compute the size of it
-
83  const bool isEndRangePartial = nrefls % mb != 0;
-
84  const SizeType partialSize = (nrefls % mb);
-
85 
-
86  // Note:
-
87  // Workspace needed is limited to the range [i_begin:i_end). Its allocation is obtained by creating an
-
88  // ad hoc distribution that starts in the origin of the matrix and with a size covering all needed
-
89  // elements. This would lead to a [0:i_end) range, but by using panel offset at initialization, the
-
90  // part before the range will be left out from allocation, actually getting [i_begin:i_end)
-
91  const GlobalTileIndex panel_offset(idx_begin, idx_begin);
-
92  const matrix::Distribution dist_panel({lastTileElement + 1, lastTileElement + 1}, dist_a.blockSize(),
-
93  dist_a.commGridSize(), dist_a.rankIndex(),
-
94  dist_a.sourceRankIndex());
-
95 
-
96  constexpr std::size_t n_workspaces = 2;
-
97  common::RoundRobin<matrix::Panel<Coord::Col, T, D>> panelsA(n_workspaces, dist_panel, panel_offset);
-
98  common::RoundRobin<matrix::Panel<Coord::Row, T, D>> panelsB(n_workspaces, dist_panel, panel_offset);
-
99 
-
100  // This loops over the global indices for k, because every rank have to participate in communication
-
101  for (SizeType k = idx_begin; k < idx_end; ++k) {
-
102  auto& panelA = panelsA.nextResource();
-
103  auto& panelB = panelsB.nextResource();
+
82 // This implementation is based on
+
83 //
+
84 // Van De Geijn, Robert A., and Jerrell Watts.
+
85 // SUMMA: Scalable universal matrix multiplication algorithm.
+
86 // Concurrency: Practice and Experience 9.4 (1997): 255-274
+
87 template <Backend B, Device D, class T>
+
88 void GeneralSub<B, D, T>::callNN(common::Pipeline<comm::Communicator>& row_task_chain,
+
89  common::Pipeline<comm::Communicator>& col_task_chain,
+
90  const SizeType idx_begin, const SizeType idx_end, const T alpha,
+
91  Matrix<const T, D>& mat_a, Matrix<const T, D>& mat_b, const T beta,
+
92  Matrix<T, D>& mat_c) {
+
93  namespace ex = pika::execution::experimental;
+
94 
+
95  if (idx_begin == idx_end)
+
96  return;
+
97 
+
98  const auto& dist_a = mat_a.distribution();
+
99  const auto rank = dist_a.rankIndex();
+
100 
+
101  // which rank has the last tile involved
+
102  const bool rankHasLastRow = rank.row() == dist_a.template rankGlobalTile<Coord::Row>(idx_end - 1);
+
103  const bool rankHasLastCol = rank.col() == dist_a.template rankGlobalTile<Coord::Col>(idx_end - 1);
104 
-
105  const bool isKPartial = k == idx_end - 1 && isEndRangePartial;
-
106  const SizeType kSize = isKPartial ? partialSize : mb;
-
107  if (isKPartial) {
-
108  panelA.setWidth(kSize);
-
109  panelB.setHeight(kSize);
-
110  }
+
105  // translate from global to local indices
+
106  const SizeType i_beg = dist_a.template nextLocalTileFromGlobalTile<Coord::Row>(idx_begin);
+
107  const SizeType i_end = dist_a.template nextLocalTileFromGlobalTile<Coord::Row>(idx_end);
+
108 
+
109  const SizeType j_beg = dist_a.template nextLocalTileFromGlobalTile<Coord::Col>(idx_begin);
+
110  const SizeType j_end = dist_a.template nextLocalTileFromGlobalTile<Coord::Col>(idx_end);
111 
-
112  const auto rank_k = dist_a.rankGlobalTile({k, k});
-
113 
-
114  // Setup the column workspace for the root ranks, i.e. the ones in the current col
-
115  if (rank_k.col() == rank.col()) {
-
116  const auto k_local = dist_a.template localTileFromGlobalTile<Coord::Col>(k);
-
117  for (SizeType i = i_beg; i < i_end; ++i) {
-
118  const LocalTileIndex ik(i, k_local);
-
119  const bool isRowPartial = (i == i_end - 1 && isEndRangePartial && rankHasLastRow);
-
120  const SizeType nrows = isRowPartial ? partialSize : mb;
-
121  panelA.setTile(ik, (isRowPartial || isKPartial)
-
122  ? splitTile(mat_a.read(ik), {{0, 0}, {nrows, kSize}})
-
123  : mat_a.read(ik));
-
124  }
-
125  }
-
126  // Setup the row workspace for the root ranks, i.e. the ones in the current row
-
127  if (rank_k.row() == rank.row()) {
-
128  const auto k_local = dist_a.template localTileFromGlobalTile<Coord::Row>(k);
-
129  for (SizeType j = j_beg; j < j_end; ++j) {
-
130  const LocalTileIndex kj(k_local, j);
-
131  const bool isColPartial = (j == j_end - 1 && isEndRangePartial && rankHasLastCol);
-
132  const SizeType ncols = isColPartial ? partialSize : mb;
-
133  panelB.setTile(kj, (isKPartial || isColPartial)
-
134  ? splitTile(mat_b.read(kj), {{0, 0}, {kSize, ncols}})
-
135  : mat_b.read(kj));
-
136  }
-
137  }
+
112  const SizeType mb = dist_a.blockSize().rows();
+
113  const SizeType lastTileElement = std::min(idx_end * mb - 1, dist_a.size().rows() - 1);
+
114  const SizeType nrefls = lastTileElement - idx_begin * mb + 1;
+
115 
+
116  // Note: if last tile is incomplete, compute the size of it
+
117  const bool isEndRangePartial = nrefls % mb != 0;
+
118  const SizeType partialSize = (nrefls % mb);
+
119 
+
120  // Note:
+
121  // Workspace needed is limited to the range [i_begin:i_end). Its allocation is obtained by creating an
+
122  // ad hoc distribution that starts in the origin of the matrix and with a size covering all needed
+
123  // elements. This would lead to a [0:i_end) range, but by using panel offset at initialization, the
+
124  // part before the range will be left out from allocation, actually getting [i_begin:i_end)
+
125  const GlobalTileIndex panel_offset(idx_begin, idx_begin);
+
126  const matrix::Distribution dist_panel({lastTileElement + 1, lastTileElement + 1}, dist_a.blockSize(),
+
127  dist_a.commGridSize(), dist_a.rankIndex(),
+
128  dist_a.sourceRankIndex());
+
129 
+
130  constexpr std::size_t n_workspaces = 2;
+
131  common::RoundRobin<matrix::Panel<Coord::Col, T, D>> panelsA(n_workspaces, dist_panel, panel_offset);
+
132  common::RoundRobin<matrix::Panel<Coord::Row, T, D>> panelsB(n_workspaces, dist_panel, panel_offset);
+
133 
+
134  // This loops over the global indices for k, because every rank have to participate in communication
+
135  for (SizeType k = idx_begin; k < idx_end; ++k) {
+
136  auto& panelA = panelsA.nextResource();
+
137  auto& panelB = panelsB.nextResource();
138 
-
139  // Broadcast both column and row panel from root to others (row-wise and col-wise, respectively)
-
140  broadcast(rank_k.col(), panelA, row_task_chain);
-
141  broadcast(rank_k.row(), panelB, col_task_chain);
-
142 
-
143  // This is the core loop where the k step performs the update over the entire local matrix using
-
144  // the col and row workspaces.
-
145  // Everything needed for the update is available locally thanks to previous broadcasts.
-
146  for (SizeType i = i_beg; i < i_end; ++i) {
-
147  const bool isRowPartial = (i == i_end - 1 && isEndRangePartial && rankHasLastRow);
-
148  const SizeType nrows = isRowPartial ? partialSize : mb;
-
149 
-
150  for (SizeType j = j_beg; j < j_end; ++j) {
-
151  const LocalTileIndex ij(i, j);
-
152 
-
153  const bool isColPartial = (j == j_end - 1 && isEndRangePartial && rankHasLastCol);
-
154  const SizeType ncols = isColPartial ? partialSize : mb;
-
155 
-
156  ex::start_detached(
-
157  dlaf::internal::whenAllLift(blas::Op::NoTrans, blas::Op::NoTrans, alpha, panelA.read(ij),
-
158  panelB.read(ij), k == idx_begin ? beta : T(1),
-
159  (isRowPartial || isColPartial)
-
160  ? splitTile(mat_c.readwrite(ij), {{0, 0}, {nrows, ncols}})
-
161  : mat_c.readwrite(ij)) |
-
162  tile::gemm(dlaf::internal::Policy<B>()));
-
163  }
-
164  }
-
165 
-
166  panelA.reset();
-
167  panelB.reset();
-
168  }
-
169 }
-
170 }
-
171 }
+
139  const bool isKPartial = k == idx_end - 1 && isEndRangePartial;
+
140  const SizeType kSize = isKPartial ? partialSize : mb;
+
141  if (isKPartial) {
+
142  panelA.setWidth(kSize);
+
143  panelB.setHeight(kSize);
+
144  }
+
145 
+
146  const auto rank_k = dist_a.rankGlobalTile({k, k});
+
147 
+
148  // Setup the column workspace for the root ranks, i.e. the ones in the current col
+
149  if (rank_k.col() == rank.col()) {
+
150  const auto k_local = dist_a.template localTileFromGlobalTile<Coord::Col>(k);
+
151  for (SizeType i = i_beg; i < i_end; ++i) {
+
152  const LocalTileIndex ik(i, k_local);
+
153  const bool isRowPartial = (i == i_end - 1 && isEndRangePartial && rankHasLastRow);
+
154  const SizeType nrows = isRowPartial ? partialSize : mb;
+
155  panelA.setTile(ik, (isRowPartial || isKPartial)
+
156  ? splitTile(mat_a.read(ik), {{0, 0}, {nrows, kSize}})
+
157  : mat_a.read(ik));
+
158  }
+
159  }
+
160  // Setup the row workspace for the root ranks, i.e. the ones in the current row
+
161  if (rank_k.row() == rank.row()) {
+
162  const auto k_local = dist_a.template localTileFromGlobalTile<Coord::Row>(k);
+
163  for (SizeType j = j_beg; j < j_end; ++j) {
+
164  const LocalTileIndex kj(k_local, j);
+
165  const bool isColPartial = (j == j_end - 1 && isEndRangePartial && rankHasLastCol);
+
166  const SizeType ncols = isColPartial ? partialSize : mb;
+
167  panelB.setTile(kj, (isKPartial || isColPartial)
+
168  ? splitTile(mat_b.read(kj), {{0, 0}, {kSize, ncols}})
+
169  : mat_b.read(kj));
+
170  }
+
171  }
+
172 
+
173  // Broadcast both column and row panel from root to others (row-wise and col-wise, respectively)
+
174  broadcast(rank_k.col(), panelA, row_task_chain);
+
175  broadcast(rank_k.row(), panelB, col_task_chain);
+
176 
+
177  // This is the core loop where the k step performs the update over the entire local matrix using
+
178  // the col and row workspaces.
+
179  // Everything needed for the update is available locally thanks to previous broadcasts.
+
180  for (SizeType i = i_beg; i < i_end; ++i) {
+
181  const bool isRowPartial = (i == i_end - 1 && isEndRangePartial && rankHasLastRow);
+
182  const SizeType nrows = isRowPartial ? partialSize : mb;
+
183 
+
184  for (SizeType j = j_beg; j < j_end; ++j) {
+
185  const LocalTileIndex ij(i, j);
+
186 
+
187  const bool isColPartial = (j == j_end - 1 && isEndRangePartial && rankHasLastCol);
+
188  const SizeType ncols = isColPartial ? partialSize : mb;
+
189 
+
190  ex::start_detached(
+
191  dlaf::internal::whenAllLift(blas::Op::NoTrans, blas::Op::NoTrans, alpha, panelA.read(ij),
+
192  panelB.read(ij), k == idx_begin ? beta : T(1),
+
193  (isRowPartial || isColPartial)
+
194  ? splitTile(mat_c.readwrite(ij), {{0, 0}, {nrows, ncols}})
+
195  : mat_c.readwrite(ij)) |
+
196  tile::gemm(dlaf::internal::Policy<B>()));
+
197  }
+
198  }
+
199 
+
200  panelA.reset();
+
201  panelB.reset();
+
202  }
+
203 }
+
204 }
+
205 }
tile.h
broadcast_panel.h
dlaf::comm::broadcast
void broadcast(comm::IndexT_MPI rank_root, matrix::Panel< axis, T, D, storage > &panel, common::Pipeline< comm::Communicator > &serial_comm)
Definition: broadcast_panel.h:58
@@ -249,9 +283,13 @@
index2d.h
index.h
dlaf::matrix::splitTile
ReadOnlyTileSender< T, D > splitTile(ReadOnlyTileSender< T, D > tile, const SubTileSpec &spec)
Definition: tile.h:507
+
matrix.h
+
matrix_ref.h
panel.h
pipeline.h
round_robin.h
+
tile_extensions.h
+
util_matrix.h