diff --git a/master/annotated.html b/master/annotated.html
index 810d95cfce..6ea3f91223 100644
--- a/master/annotated.html
+++ b/master/annotated.html
@@ -257,9 +257,10 @@
CMemoryView | |
►Nmultiplication | |
►Ninternal | |
- CGeneralSub | |
- CHermitian | |
- CTriangular | |
+ CGeneral | |
+ CGeneralSub | |
+ CHermitian | |
+ CTriangular | |
►Npermutations | |
►Ninternal | |
CPermutations | |
diff --git a/master/classes.html b/master/classes.html
index f606daa7b5..819bed646d 100644
--- a/master/classes.html
+++ b/master/classes.html
@@ -87,7 +87,7 @@
FormatShort (dlaf::internal)
- G
-- gemmSizes (dlaf::tile::internal)
- GenEigensolver (dlaf::eigensolver::internal)
- GeneralSub (dlaf::multiplication::internal)
- GenToStd (dlaf::eigensolver::internal)
- getter_random (dlaf::matrix::util::internal)
- getter_random< std::complex< T > > (dlaf::matrix::util::internal)
- GivensRotation (dlaf::eigensolver::internal)
+gemmSizes (dlaf::tile::internal)GenEigensolver (dlaf::eigensolver::internal)General (dlaf::multiplication::internal)GeneralSub (dlaf::multiplication::internal)GenToStd (dlaf::eigensolver::internal)getter_random (dlaf::matrix::util::internal)getter_random< std::complex< T > > (dlaf::matrix::util::internal)GivensRotation (dlaf::eigensolver::internal)
- H
- Helpers (dlaf::eigensolver::internal::bt_red_band)
- Helpers (dlaf::factorization::internal::tfactor_l)
- Helpers< Backend::GPU > (dlaf::eigensolver::internal::bt_red_band)
- Helpers< Backend::GPU, Device::GPU, T > (dlaf::factorization::internal::tfactor_l)
- Helpers< Backend::MC > (dlaf::eigensolver::internal::bt_red_band)
- Helpers< Backend::MC, Device::CPU, T > (dlaf::factorization::internal::tfactor_l)
- hemmSizes (dlaf::tile::internal)
- her2kSizes (dlaf::tile::internal)
- herkSizes (dlaf::tile::internal)
- Hermitian (dlaf::multiplication::internal)
- HHManager (dlaf::eigensolver::internal::bt_tridiag)
- HHManager< Backend::GPU, Device::GPU, T > (dlaf::eigensolver::internal::bt_tridiag)
- HHManager< Backend::MC, Device::CPU, T > (dlaf::eigensolver::internal::bt_tridiag)
diff --git a/master/hierarchy.html b/master/hierarchy.html
index 69b23f0270..dc1419f580 100644
--- a/master/hierarchy.html
+++ b/master/hierarchy.html
@@ -153,148 +153,149 @@
Cdlaf::internal::FormatShort< T > | |
Cdlaf::tile::internal::gemmSizes | |
Cdlaf::eigensolver::internal::GenEigensolver< backend, device, T > | |
- Cdlaf::multiplication::internal::GeneralSub< B, D, T > | |
- Cdlaf::eigensolver::internal::GenToStd< backend, device, T > | |
- ►Cdlaf::matrix::util::internal::getter_random< T > | Callable that returns random values in the range [-1, 1] |
- Cdlaf::matrix::util::internal::getter_random< std::complex< T > > | Callable that returns random complex numbers whose absolute values are less than 1 |
- Cdlaf::eigensolver::internal::GivensRotation< T > | |
- Cdlaf::eigensolver::internal::bt_red_band::Helpers< B > | |
- Cdlaf::factorization::internal::tfactor_l::Helpers< backend, device, T > | |
- Cdlaf::eigensolver::internal::bt_red_band::Helpers< Backend::GPU > | |
- Cdlaf::factorization::internal::tfactor_l::Helpers< Backend::GPU, Device::GPU, T > | |
- Cdlaf::eigensolver::internal::bt_red_band::Helpers< Backend::MC > | |
- Cdlaf::factorization::internal::tfactor_l::Helpers< Backend::MC, Device::CPU, T > | |
- Cdlaf::tile::internal::hemmSizes | |
- Cdlaf::tile::internal::her2kSizes | |
- Cdlaf::tile::internal::herkSizes | |
- Cdlaf::multiplication::internal::Hermitian< B, D, T > | |
- Cdlaf::eigensolver::internal::bt_tridiag::HHManager< B, D, T > | |
- Cdlaf::eigensolver::internal::bt_tridiag::HHManager< Backend::GPU, Device::GPU, T > | |
- Cdlaf::eigensolver::internal::bt_tridiag::HHManager< Backend::MC, Device::CPU, T > | |
- ►Cstd::integral_constant | |
- Cdlaf::internal::IsFalse< T > | |
- Cdlaf::common::is_data< Data > | |
- ►Cstd::is_floating_point | |
- ►Cdlaf::internal::IsFloatingPointOrComplex< T > | |
- Cdlaf::internal::IsFloatingPointOrComplex< std::complex< T > > | |
- Cdlaf::common::IterableRange2D< IndexT, Tag > | An Iterable representing a 2D range |
- Cdlaf::common::IteratorRange2D< IndexT, Tag > | An Iterator returning indices in column-major order |
- Cdlaf::matrix::LayoutInfo | |
- Cdlaf::matrix::Matrix< T, D > | |
- Cdlaf::matrix::Matrix< const T, Source > | |
- Cdlaf::matrix::Matrix< const T, SourceTarget > | |
- Cdlaf::matrix::Matrix< T, Source > | |
- Cdlaf::matrix::Matrix< T, SourceTarget > | |
- Cdlaf::matrix::Matrix< T, Target > | |
- ►Cdlaf::matrix::internal::MatrixBase | |
- Cdlaf::matrix::Matrix< const T, D > | |
- Cdlaf::matrix::internal::MatrixRef< const T, D > | |
- Cdlaf::matrix::MatrixMirror< T, Target, Source > | |
- ►Cdlaf::matrix::MatrixMirror< const T, SourceTarget, SourceTarget > | |
- Cdlaf::matrix::MatrixMirror< T, SourceTarget, SourceTarget > | |
- Cdlaf::matrix::MatrixMirror< const T, Target, Source > | |
- Cdlaf::matrix::internal::MatrixRef< T, D > | |
- Cdlaf::memory::MemoryChunk< T, D > | The class MemoryChunk represents a layer of abstraction over the underlying device memory |
- Cdlaf::memory::MemoryView< T, D > | |
- Cdlaf::memory::MemoryView< T, Device::CPU > | |
- Cdlaf::comm::Message< Data > | |
- Cdlaf::internal::moveNonConstTile< T > | |
- ►Cdlaf::comm::mpi_datatype< T > | Mapper between language types and basic MPI_Datatype |
- Cdlaf::comm::mpi_datatype< const T > | Helper for mapping also const types |
- Cdlaf::comm::mpi_init | |
- Cdlaf::comm::internal::MPICallHelper< F > | |
- Cdlaf::auxiliary::internal::Norm< backend, device, T > | |
- Cdlaf::auxiliary::internal::Norm< Backend::MC, Device::CPU, T > | |
- Cdlaf::format::numpy | |
- Cdlaf::matrix::internal::numpy_datatype< T > | |
- Cdlaf::matrix::internal::numpy_datatype< std::complex< T > > | |
- Cdlaf::matrix::Panel< axis, T, D, Storage > | |
- Cdlaf::matrix::Panel< axis, const T, D, StoreTransposed::No > | |
- ►Cdlaf::matrix::Panel< orthogonal(axis), const T, D, StoreTransposed::No > | |
- Cdlaf::matrix::Panel< axis, const T, D, StoreTransposed::Yes > | |
- ►Cdlaf::internal::PartialTransformBase< Tag, B, F > | |
- Cdlaf::internal::PartialTransform< Tag, B, F > | |
- Cdlaf::internal::PartialTransformDetach< Tag, B, F > | |
- ►Cdlaf::comm::internal::PartialTransformMPIBase< F > | |
- Cdlaf::comm::internal::PartialTransformMPI< F > | |
- Cdlaf::comm::internal::PartialTransformMPIDetach< F > | |
- Cdlaf::permutations::internal::Permutations< B, D, T, coord > | |
- Cdlaf::common::Pipeline< T > | |
- Cdlaf::internal::Policy< B > | |
- Cdlaf::internal::PrependPack< Pack, T > | |
- Cdlaf::internal::PrependPack< Pack< Ts... >, T > | |
- Cdlaf::factorization::internal::QR< backend, device, T > | |
- Cdlaf::factorization::internal::QR_Tfactor< backend, device, T > | |
- Cdlaf::eigensolver::internal::ReductionToBand< B, D, T > | |
- Cdlaf::common::RoundRobin< T > | |
- Cdlaf::common::RoundRobin< dlaf::matrix::Panel< Coord::Col, T, Device::CPU > > | |
- Cdlaf::ScopedInitializer | |
- Cdlaf::eigensolver::internal::ScopedSenderWait | |
- Cdlaf::internal::SenderSingleValueTypeImpl< ValueTypes > | |
- Cdlaf::internal::SenderSingleValueTypeImpl< TypeList< TypeList< pika::execution::experimental::async_rw_mutex_access_wrapper< RWType, RType, pika::execution::experimental::async_rw_mutex_access_type::read > > > > | |
- Cdlaf::internal::SenderSingleValueTypeImpl< TypeList< TypeList< pika::execution::experimental::async_rw_mutex_access_wrapper< RWType, RType, pika::execution::experimental::async_rw_mutex_access_type::readwrite > > > > | |
- Cdlaf::internal::SenderSingleValueTypeImpl< TypeList< TypeList< std::reference_wrapper< T > > > > | |
- Cdlaf::internal::SenderSingleValueTypeImpl< TypeList< TypeList< T > > > | |
- Cdlaf::common::internal::SingleThreadedBlasScope | |
- Cdlaf::common::internal::source_location | |
- Cdlaf::matrix::SubDistributionSpec | Contains information to create a sub-distribution |
- Cdlaf::matrix::Matrix< const T, D >::SubPipelineTag | |
- Cdlaf::matrix::SubTileSpec | Contains the information to create a subtile |
- ►Cdlaf::eigensolver::internal::SweepWorker< T > | |
- Cdlaf::eigensolver::internal::SweepWorkerDist< T > | |
- Cdlaf::matrix::Tile< T, D > | |
- Cdlaf::matrix::Tile< const T, D > | |
- Cdlaf::eigensolver::internal::bt_tridiag::TileAccessHelper | |
- Cdlaf::eigensolver::internal::TileCollector | |
- Cdlaf::matrix::internal::TileData< T, D > | |
- Cdlaf::matrix::internal::TilePipeline< T, D > | |
- Cdlaf::common::Timer< clock > | |
- Cdlaf::internal::TransformPack< Pack, Transformer > | |
- Cdlaf::internal::TransformPack< Pack< Ts... >, Transformer > | |
- Cdlaf::multiplication::internal::Triangular< backend, device, T > | |
- Cdlaf::solver::internal::Triangular< backend, device, T > | |
- Cdlaf::eigensolver::internal::TridiagResult< T, D > | |
- Cdlaf::eigensolver::internal::TridiagSolver< backend, device, T > | |
- Cdlaf::tile::internal::trmmSizes | |
- Cdlaf::tile::internal::trsmSizes | |
- ►Cstd::true_type | |
- Cdlaf::common::internal::is_coord< Index2D< T, Tag > > | |
- Cdlaf::common::internal::is_coord< Size2D< T, Tag > > | |
- Cdlaf::internal::Contains< T, T, Ts... > | |
- Cdlaf::TuneParameters | |
- ►Cstd::conditional::type | |
- Cdlaf::internal::UniqueHelper< Pack< Ts... >, Pack< U, Us... > > | |
+ Cdlaf::multiplication::internal::General< B, D, T > | |
+ Cdlaf::multiplication::internal::GeneralSub< B, D, T > | |
+ Cdlaf::eigensolver::internal::GenToStd< backend, device, T > | |
+ ►Cdlaf::matrix::util::internal::getter_random< T > | Callable that returns random values in the range [-1, 1] |
+ Cdlaf::matrix::util::internal::getter_random< std::complex< T > > | Callable that returns random complex numbers whose absolute values are less than 1 |
+ Cdlaf::eigensolver::internal::GivensRotation< T > | |
+ Cdlaf::eigensolver::internal::bt_red_band::Helpers< B > | |
+ Cdlaf::factorization::internal::tfactor_l::Helpers< backend, device, T > | |
+ Cdlaf::eigensolver::internal::bt_red_band::Helpers< Backend::GPU > | |
+ Cdlaf::factorization::internal::tfactor_l::Helpers< Backend::GPU, Device::GPU, T > | |
+ Cdlaf::eigensolver::internal::bt_red_band::Helpers< Backend::MC > | |
+ Cdlaf::factorization::internal::tfactor_l::Helpers< Backend::MC, Device::CPU, T > | |
+ Cdlaf::tile::internal::hemmSizes | |
+ Cdlaf::tile::internal::her2kSizes | |
+ Cdlaf::tile::internal::herkSizes | |
+ Cdlaf::multiplication::internal::Hermitian< B, D, T > | |
+ Cdlaf::eigensolver::internal::bt_tridiag::HHManager< B, D, T > | |
+ Cdlaf::eigensolver::internal::bt_tridiag::HHManager< Backend::GPU, Device::GPU, T > | |
+ Cdlaf::eigensolver::internal::bt_tridiag::HHManager< Backend::MC, Device::CPU, T > | |
+ ►Cstd::integral_constant | |
+ Cdlaf::internal::IsFalse< T > | |
+ Cdlaf::common::is_data< Data > | |
+ ►Cstd::is_floating_point | |
+ ►Cdlaf::internal::IsFloatingPointOrComplex< T > | |
+ Cdlaf::internal::IsFloatingPointOrComplex< std::complex< T > > | |
+ Cdlaf::common::IterableRange2D< IndexT, Tag > | An Iterable representing a 2D range |
+ Cdlaf::common::IteratorRange2D< IndexT, Tag > | An Iterator returning indices in column-major order |
+ Cdlaf::matrix::LayoutInfo | |
+ Cdlaf::matrix::Matrix< T, D > | |
+ Cdlaf::matrix::Matrix< const T, Source > | |
+ Cdlaf::matrix::Matrix< const T, SourceTarget > | |
+ Cdlaf::matrix::Matrix< T, Source > | |
+ Cdlaf::matrix::Matrix< T, SourceTarget > | |
+ Cdlaf::matrix::Matrix< T, Target > | |
+ ►Cdlaf::matrix::internal::MatrixBase | |
+ Cdlaf::matrix::Matrix< const T, D > | |
+ Cdlaf::matrix::internal::MatrixRef< const T, D > | |
+ Cdlaf::matrix::MatrixMirror< T, Target, Source > | |
+ ►Cdlaf::matrix::MatrixMirror< const T, SourceTarget, SourceTarget > | |
+ Cdlaf::matrix::MatrixMirror< T, SourceTarget, SourceTarget > | |
+ Cdlaf::matrix::MatrixMirror< const T, Target, Source > | |
+ Cdlaf::matrix::internal::MatrixRef< T, D > | |
+ Cdlaf::memory::MemoryChunk< T, D > | The class MemoryChunk represents a layer of abstraction over the underlying device memory |
+ Cdlaf::memory::MemoryView< T, D > | |
+ Cdlaf::memory::MemoryView< T, Device::CPU > | |
+ Cdlaf::comm::Message< Data > | |
+ Cdlaf::internal::moveNonConstTile< T > | |
+ ►Cdlaf::comm::mpi_datatype< T > | Mapper between language types and basic MPI_Datatype |
+ Cdlaf::comm::mpi_datatype< const T > | Helper for mapping also const types |
+ Cdlaf::comm::mpi_init | |
+ Cdlaf::comm::internal::MPICallHelper< F > | |
+ Cdlaf::auxiliary::internal::Norm< backend, device, T > | |
+ Cdlaf::auxiliary::internal::Norm< Backend::MC, Device::CPU, T > | |
+ Cdlaf::format::numpy | |
+ Cdlaf::matrix::internal::numpy_datatype< T > | |
+ Cdlaf::matrix::internal::numpy_datatype< std::complex< T > > | |
+ Cdlaf::matrix::Panel< axis, T, D, Storage > | |
+ Cdlaf::matrix::Panel< axis, const T, D, StoreTransposed::No > | |
+ ►Cdlaf::matrix::Panel< orthogonal(axis), const T, D, StoreTransposed::No > | |
+ Cdlaf::matrix::Panel< axis, const T, D, StoreTransposed::Yes > | |
+ ►Cdlaf::internal::PartialTransformBase< Tag, B, F > | |
+ Cdlaf::internal::PartialTransform< Tag, B, F > | |
+ Cdlaf::internal::PartialTransformDetach< Tag, B, F > | |
+ ►Cdlaf::comm::internal::PartialTransformMPIBase< F > | |
+ Cdlaf::comm::internal::PartialTransformMPI< F > | |
+ Cdlaf::comm::internal::PartialTransformMPIDetach< F > | |
+ Cdlaf::permutations::internal::Permutations< B, D, T, coord > | |
+ Cdlaf::common::Pipeline< T > | |
+ Cdlaf::internal::Policy< B > | |
+ Cdlaf::internal::PrependPack< Pack, T > | |
+ Cdlaf::internal::PrependPack< Pack< Ts... >, T > | |
+ Cdlaf::factorization::internal::QR< backend, device, T > | |
+ Cdlaf::factorization::internal::QR_Tfactor< backend, device, T > | |
+ Cdlaf::eigensolver::internal::ReductionToBand< B, D, T > | |
+ Cdlaf::common::RoundRobin< T > | |
+ Cdlaf::common::RoundRobin< dlaf::matrix::Panel< Coord::Col, T, Device::CPU > > | |
+ Cdlaf::ScopedInitializer | |
+ Cdlaf::eigensolver::internal::ScopedSenderWait | |
+ Cdlaf::internal::SenderSingleValueTypeImpl< ValueTypes > | |
+ Cdlaf::internal::SenderSingleValueTypeImpl< TypeList< TypeList< pika::execution::experimental::async_rw_mutex_access_wrapper< RWType, RType, pika::execution::experimental::async_rw_mutex_access_type::read > > > > | |
+ Cdlaf::internal::SenderSingleValueTypeImpl< TypeList< TypeList< pika::execution::experimental::async_rw_mutex_access_wrapper< RWType, RType, pika::execution::experimental::async_rw_mutex_access_type::readwrite > > > > | |
+ Cdlaf::internal::SenderSingleValueTypeImpl< TypeList< TypeList< std::reference_wrapper< T > > > > | |
+ Cdlaf::internal::SenderSingleValueTypeImpl< TypeList< TypeList< T > > > | |
+ Cdlaf::common::internal::SingleThreadedBlasScope | |
+ Cdlaf::common::internal::source_location | |
+ Cdlaf::matrix::SubDistributionSpec | Contains information to create a sub-distribution |
+ Cdlaf::matrix::Matrix< const T, D >::SubPipelineTag | |
+ Cdlaf::matrix::SubTileSpec | Contains the information to create a subtile |
+ ►Cdlaf::eigensolver::internal::SweepWorker< T > | |
+ Cdlaf::eigensolver::internal::SweepWorkerDist< T > | |
+ Cdlaf::matrix::Tile< T, D > | |
+ Cdlaf::matrix::Tile< const T, D > | |
+ Cdlaf::eigensolver::internal::bt_tridiag::TileAccessHelper | |
+ Cdlaf::eigensolver::internal::TileCollector | |
+ Cdlaf::matrix::internal::TileData< T, D > | |
+ Cdlaf::matrix::internal::TilePipeline< T, D > | |
+ Cdlaf::common::Timer< clock > | |
+ Cdlaf::internal::TransformPack< Pack, Transformer > | |
+ Cdlaf::internal::TransformPack< Pack< Ts... >, Transformer > | |
+ Cdlaf::multiplication::internal::Triangular< backend, device, T > | |
+ Cdlaf::solver::internal::Triangular< backend, device, T > | |
+ Cdlaf::eigensolver::internal::TridiagResult< T, D > | |
+ Cdlaf::eigensolver::internal::TridiagSolver< backend, device, T > | |
+ Cdlaf::tile::internal::trmmSizes | |
+ Cdlaf::tile::internal::trsmSizes | |
+ ►Cstd::true_type | |
+ Cdlaf::common::internal::is_coord< Index2D< T, Tag > > | |
+ Cdlaf::common::internal::is_coord< Size2D< T, Tag > > | |
+ Cdlaf::internal::Contains< T, T, Ts... > | |
+ Cdlaf::TuneParameters | |
►Cstd::conditional::type | |
- Cdlaf::internal::UniquePackHelper< Pack< Ts... >, Pack< U, Us... > > | |
- Cdlaf::comm::internal::type_handler< T > | |
- ►Cdlaf::TypeInfo< T > | |
- Cdlaf::TypeInfo< const T > | |
- Cdlaf::TypeInfo< std::complex< T > > | |
- Cdlaf::internal::TypeList<... > | |
- Cdlaf::internal::Unique< Pack > | |
- Cdlaf::internal::UniqueHelper< PackUnique, PackRest > | |
- Cdlaf::internal::UniqueHelper< Pack< Ts... >, Pack<> > | |
- ►Cdlaf::internal::UniqueHelper< Pack<>, Pack< Ts... > > | |
- Cdlaf::internal::Unique< Pack< Ts... > > | |
- Cdlaf::internal::UniquePack< Pack > | |
- Cdlaf::internal::UniquePackHelper< PackUnique, PackRest > | |
- Cdlaf::internal::UniquePackHelper< Pack< Ts... >, Pack<> > | |
- ►Cdlaf::internal::UniquePackHelper< Pack<>, Pack< Ts... > > | |
- Cdlaf::internal::UniquePack< Pack< Ts... > > | |
- Cdlaf::common::internal::Unwrapper< T > | |
- Cdlaf::common::internal::Unwrapper< pika::execution::experimental::async_rw_mutex_access_wrapper< T1, T2, at > > | |
- Cdlaf::common::internal::Unwrapper< std::reference_wrapper< T > > | |
- Cdlaf::common::internal::Unwrapping< F > | |
- Cdlaf::eigensolver::internal::VAccessHelper | |
- ►Cstd::vector | |
- Cdlaf::common::internal::vector< ReadOnlySenderType > | |
- Cdlaf::common::internal::vector< T > | |
- ►Cdlaf::matrix::internal::View | |
- Cdlaf::matrix::SubMatrixView | |
- Cdlaf::matrix::SubPanelView | |
- Cdlaf::eigensolver::internal::WorkSpace< T, D > | |
- Cdlaf::eigensolver::internal::WorkSpaceHost< T > | |
- Cdlaf::eigensolver::internal::WorkSpaceHostMirror< T, D > | |
+ Cdlaf::internal::UniqueHelper< Pack< Ts... >, Pack< U, Us... > > | |
+ ►Cstd::conditional::type | |
+ Cdlaf::internal::UniquePackHelper< Pack< Ts... >, Pack< U, Us... > > | |
+ Cdlaf::comm::internal::type_handler< T > | |
+ ►Cdlaf::TypeInfo< T > | |
+ Cdlaf::TypeInfo< const T > | |
+ Cdlaf::TypeInfo< std::complex< T > > | |
+ Cdlaf::internal::TypeList<... > | |
+ Cdlaf::internal::Unique< Pack > | |
+ Cdlaf::internal::UniqueHelper< PackUnique, PackRest > | |
+ Cdlaf::internal::UniqueHelper< Pack< Ts... >, Pack<> > | |
+ ►Cdlaf::internal::UniqueHelper< Pack<>, Pack< Ts... > > | |
+ Cdlaf::internal::Unique< Pack< Ts... > > | |
+ Cdlaf::internal::UniquePack< Pack > | |
+ Cdlaf::internal::UniquePackHelper< PackUnique, PackRest > | |
+ Cdlaf::internal::UniquePackHelper< Pack< Ts... >, Pack<> > | |
+ ►Cdlaf::internal::UniquePackHelper< Pack<>, Pack< Ts... > > | |
+ Cdlaf::internal::UniquePack< Pack< Ts... > > | |
+ Cdlaf::common::internal::Unwrapper< T > | |
+ Cdlaf::common::internal::Unwrapper< pika::execution::experimental::async_rw_mutex_access_wrapper< T1, T2, at > > | |
+ Cdlaf::common::internal::Unwrapper< std::reference_wrapper< T > > | |
+ Cdlaf::common::internal::Unwrapping< F > | |
+ Cdlaf::eigensolver::internal::VAccessHelper | |
+ ►Cstd::vector | |
+ Cdlaf::common::internal::vector< ReadOnlySenderType > | |
+ Cdlaf::common::internal::vector< T > | |
+ ►Cdlaf::matrix::internal::View | |
+ Cdlaf::matrix::SubMatrixView | |
+ Cdlaf::matrix::SubPanelView | |
+ Cdlaf::eigensolver::internal::WorkSpace< T, D > | |
+ Cdlaf::eigensolver::internal::WorkSpaceHost< T > | |
+ Cdlaf::eigensolver::internal::WorkSpaceHostMirror< T, D > | |
diff --git a/master/matrix__base_8h_source.html b/master/matrix__base_8h_source.html
index 8b1de2b19c..3f394467a3 100644
--- a/master/matrix__base_8h_source.html
+++ b/master/matrix__base_8h_source.html
@@ -223,17 +223,19 @@
169 <<
", tiles_grid=" << matrix.
nr_tiles()
-
-
-
-
-
-
-
-
-
-
+
+ 172 <<
", src_rank=" << matrix.
distribution().source_rank_index()
+
+
+
+
+
+
+
+
+
+
Definition: distribution.h:111
diff --git a/master/multiplication_2general_2api_8h_source.html b/master/multiplication_2general_2api_8h_source.html
index dbbf98b32e..9340329d79 100644
--- a/master/multiplication_2general_2api_8h_source.html
+++ b/master/multiplication_2general_2api_8h_source.html
@@ -85,44 +85,55 @@
-
-
- 19 namespace dlaf::multiplication {
-
-
- 22 template <Backend B, Device D,
class T>
-
- 24 static void callNN(
const SizeType i_tile_from,
const SizeType i_tile_to,
const blas::Op opA,
- 25 const blas::Op opB,
const T alpha, Matrix<const T, D>& mat_a,
- 26 Matrix<const T, D>& mat_b,
const T beta, Matrix<T, D>& mat_c);
-
-
- 29 const SizeType i_tile_to,
const T alpha, Matrix<const T, D>& mat_a,
- 30 Matrix<const T, D>& mat_b,
const T beta, Matrix<T, D>& mat_c);
-
-
-
- 34 #define DLAF_MULTIPLICATION_GENERAL_ETI(KWORD, BACKEND, DEVICE, DATATYPE) \
- 35 KWORD template struct GeneralSub<BACKEND, DEVICE, DATATYPE>;
-
- 37 DLAF_MULTIPLICATION_GENERAL_ETI(
extern, Backend::MC, Device::CPU,
float)
- 38 DLAF_MULTIPLICATION_GENERAL_ETI(
extern, Backend::MC, Device::CPU,
double)
- 39 DLAF_MULTIPLICATION_GENERAL_ETI(
extern, Backend::MC, Device::CPU, std::complex<float>)
- 40 DLAF_MULTIPLICATION_GENERAL_ETI(
extern, Backend::MC, Device::CPU, std::complex<double>)
-
-
- 43 DLAF_MULTIPLICATION_GENERAL_ETI(
extern, Backend::GPU, Device::GPU,
float)
- 44 DLAF_MULTIPLICATION_GENERAL_ETI(
extern, Backend::GPU, Device::GPU,
double)
- 45 DLAF_MULTIPLICATION_GENERAL_ETI(
extern, Backend::GPU, Device::GPU, std::complex<float>)
- 46 DLAF_MULTIPLICATION_GENERAL_ETI(
extern, Backend::GPU, Device::GPU, std::complex<double>)
-
-
-
-
+
+
+
+ 20 namespace dlaf::multiplication {
+
+
+
+ 24 template <Backend B, Device D,
class T>
+
+
+
+
+
+ 30 template <Backend B, Device D,
class T>
+
+ 32 static void callNN(
const SizeType i_tile_from,
const SizeType i_tile_to,
const blas::Op opA,
+ 33 const blas::Op opB,
const T alpha, Matrix<const T, D>& mat_a,
+ 34 Matrix<const T, D>& mat_b,
const T beta, Matrix<T, D>& mat_c);
+
+
+ 37 const SizeType i_tile_to,
const T alpha, Matrix<const T, D>& mat_a,
+ 38 Matrix<const T, D>& mat_b,
const T beta, Matrix<T, D>& mat_c);
+
+
+ 41 #define DLAF_MULTIPLICATION_GENERAL_ETI(KWORD, BACKEND, DEVICE, DATATYPE) \
+ 42 KWORD template struct General<BACKEND, DEVICE, DATATYPE>; \
+ 43 KWORD template struct GeneralSub<BACKEND, DEVICE, DATATYPE>;
+
+ 45 DLAF_MULTIPLICATION_GENERAL_ETI(
extern, Backend::MC, Device::CPU,
float)
+ 46 DLAF_MULTIPLICATION_GENERAL_ETI(
extern, Backend::MC, Device::CPU,
double)
+ 47 DLAF_MULTIPLICATION_GENERAL_ETI(
extern, Backend::MC, Device::CPU, std::complex<float>)
+ 48 DLAF_MULTIPLICATION_GENERAL_ETI(
extern, Backend::MC, Device::CPU, std::complex<double>)
+
+
+ 51 DLAF_MULTIPLICATION_GENERAL_ETI(
extern, Backend::GPU, Device::GPU,
float)
+ 52 DLAF_MULTIPLICATION_GENERAL_ETI(
extern, Backend::GPU, Device::GPU,
double)
+ 53 DLAF_MULTIPLICATION_GENERAL_ETI(
extern, Backend::GPU, Device::GPU, std::complex<float>)
+ 54 DLAF_MULTIPLICATION_GENERAL_ETI(
extern, Backend::GPU, Device::GPU, std::complex<double>)
+
+
+
+
Definition: pipeline.h:31
+Definition: matrix_ref.h:108
+
-
+
+
diff --git a/master/multiplication_2general_2impl_8h_source.html b/master/multiplication_2general_2impl_8h_source.html
index 6589daf26c..ae55ee4c7f 100644
--- a/master/multiplication_2general_2impl_8h_source.html
+++ b/master/multiplication_2general_2impl_8h_source.html
@@ -82,164 +82,198 @@
- 14 #include <dlaf/common/assert.h>
-
-
-
-
-
-
-
-
- 23 #include <dlaf/multiplication/general/api.h>
- 24 #include <dlaf/sender/when_all_lift.h>
-
- 26 namespace dlaf::multiplication {
-
-
- 29 template <Backend B, Device D,
class T>
- 30 void GeneralSub<B, D, T>::callNN(
const SizeType idx_begin,
const SizeType idx_end,
const blas::Op opA,
- 31 const blas::Op opB,
const T alpha, Matrix<const T, D>& mat_a,
- 32 Matrix<const T, D>& mat_b,
const T beta, Matrix<T, D>& mat_c) {
- 33 namespace ex = pika::execution::experimental;
-
- 35 for (SizeType j = idx_begin; j < idx_end; ++j) {
- 36 for (SizeType i = idx_begin; i < idx_end; ++i) {
- 37 for (SizeType k = idx_begin; k < idx_end; ++k) {
-
- 39 dlaf::internal::whenAllLift(opA, opB, alpha, mat_a.read(GlobalTileIndex(i, k)),
- 40 mat_b.read(GlobalTileIndex(k, j)), k == idx_begin ? beta : T(1),
- 41 mat_c.readwrite(GlobalTileIndex(i, j))) |
-
-
-
-
-
-
-
-
-
-
-
- 53 template <Backend B, Device D,
class T>
- 54 void GeneralSub<B, D, T>::callNN(common::Pipeline<comm::Communicator>& row_task_chain,
- 55 common::Pipeline<comm::Communicator>& col_task_chain,
- 56 const SizeType idx_begin,
const SizeType idx_end,
const T alpha,
- 57 Matrix<const T, D>& mat_a, Matrix<const T, D>& mat_b,
const T beta,
- 58 Matrix<T, D>& mat_c) {
- 59 namespace ex = pika::execution::experimental;
-
- 61 if (idx_begin == idx_end)
-
-
- 64 const auto& dist_a = mat_a.distribution();
- 65 const auto rank = dist_a.rankIndex();
-
-
- 68 const bool rankHasLastRow = rank.row() == dist_a.template rankGlobalTile<Coord::Row>(idx_end - 1);
- 69 const bool rankHasLastCol = rank.col() == dist_a.template rankGlobalTile<Coord::Col>(idx_end - 1);
-
-
- 72 const SizeType i_beg = dist_a.template nextLocalTileFromGlobalTile<Coord::Row>(idx_begin);
- 73 const SizeType i_end = dist_a.template nextLocalTileFromGlobalTile<Coord::Row>(idx_end);
-
- 75 const SizeType j_beg = dist_a.template nextLocalTileFromGlobalTile<Coord::Col>(idx_begin);
- 76 const SizeType j_end = dist_a.template nextLocalTileFromGlobalTile<Coord::Col>(idx_end);
-
- 78 const SizeType mb = dist_a.blockSize().rows();
- 79 const SizeType lastTileElement = std::min(idx_end * mb - 1, dist_a.size().rows() - 1);
- 80 const SizeType nrefls = lastTileElement - idx_begin * mb + 1;
+
+ 15 #include <dlaf/common/assert.h>
+
+
+
+
+
+
+
+
+
+
+ 26 #include <dlaf/multiplication/general/api.h>
+ 27 #include <dlaf/sender/when_all_lift.h>
+
+
+ 30 namespace dlaf::multiplication {
+
+
+ 33 template <Backend B, Device D,
class T>
+ 34 void General<B, D, T>::callNN(
const T alpha, MatrixRef<const T, D>& mat_a, MatrixRef<const T, D>& mat_b,
+ 35 const T beta, MatrixRef<T, D>& mat_c) {
+ 36 namespace ex = pika::execution::experimental;
+
+ 38 if (mat_a.nr_tiles().cols() == 0) {
+
+
+ 41 for (SizeType j = 0; j < mat_c.distribution().local_nr_tiles().cols(); ++j)
+ 42 for (SizeType i = 0; i < mat_c.distribution().local_nr_tiles().rows(); ++i)
+ 43 ex::start_detached(dlaf::internal::whenAllLift(beta, mat_c.readwrite(LocalTileIndex(i, j))) |
+
+
+
+
+
+ 49 for (SizeType j = 0; j < mat_c.distribution().local_nr_tiles().cols(); ++j) {
+ 50 for (SizeType i = 0; i < mat_c.distribution().local_nr_tiles().rows(); ++i) {
+ 51 for (SizeType k = 0; k < mat_a.distribution().local_nr_tiles().cols(); ++k) {
+ 52 ex::start_detached(dlaf::internal::whenAllLift(blas::Op::NoTrans, blas::Op::NoTrans, alpha,
+ 53 mat_a.read(LocalTileIndex(i, k)),
+ 54 mat_b.read(LocalTileIndex(k, j)),
+
+ 56 mat_c.readwrite(LocalTileIndex(i, j))) |
+
+
+
+
+
+
+ 63 template <Backend B, Device D,
class T>
+ 64 void GeneralSub<B, D, T>::callNN(
const SizeType idx_begin,
const SizeType idx_end,
const blas::Op opA,
+ 65 const blas::Op opB,
const T alpha, Matrix<const T, D>& mat_a,
+ 66 Matrix<const T, D>& mat_b,
const T beta, Matrix<T, D>& mat_c) {
+ 67 namespace ex = pika::execution::experimental;
+
+ 69 for (SizeType j = idx_begin; j < idx_end; ++j) {
+ 70 for (SizeType i = idx_begin; i < idx_end; ++i) {
+ 71 for (SizeType k = idx_begin; k < idx_end; ++k) {
+
+ 73 dlaf::internal::whenAllLift(opA, opB, alpha, mat_a.read(GlobalTileIndex(i, k)),
+ 74 mat_b.read(GlobalTileIndex(k, j)), k == idx_begin ? beta : T(1),
+ 75 mat_c.readwrite(GlobalTileIndex(i, j))) |
+
+
+
+
+
-
- 83 const bool isEndRangePartial = nrefls % mb != 0;
- 84 const SizeType partialSize = (nrefls % mb);
-
-
-
-
-
-
- 91 const GlobalTileIndex panel_offset(idx_begin, idx_begin);
- 92 const matrix::Distribution dist_panel({lastTileElement + 1, lastTileElement + 1}, dist_a.blockSize(),
- 93 dist_a.commGridSize(), dist_a.rankIndex(),
- 94 dist_a.sourceRankIndex());
-
- 96 constexpr std::size_t n_workspaces = 2;
- 97 common::RoundRobin<matrix::Panel<Coord::Col, T, D>> panelsA(n_workspaces, dist_panel, panel_offset);
- 98 common::RoundRobin<matrix::Panel<Coord::Row, T, D>> panelsB(n_workspaces, dist_panel, panel_offset);
-
-
- 101 for (SizeType k = idx_begin; k < idx_end; ++k) {
- 102 auto& panelA = panelsA.nextResource();
- 103 auto& panelB = panelsB.nextResource();
+
+
+
+
+
+ 87 template <Backend B, Device D,
class T>
+ 88 void GeneralSub<B, D, T>::callNN(common::Pipeline<comm::Communicator>& row_task_chain,
+ 89 common::Pipeline<comm::Communicator>& col_task_chain,
+ 90 const SizeType idx_begin,
const SizeType idx_end,
const T alpha,
+ 91 Matrix<const T, D>& mat_a, Matrix<const T, D>& mat_b,
const T beta,
+ 92 Matrix<T, D>& mat_c) {
+ 93 namespace ex = pika::execution::experimental;
+
+ 95 if (idx_begin == idx_end)
+
+
+ 98 const auto& dist_a = mat_a.distribution();
+ 99 const auto rank = dist_a.rankIndex();
+
+
+ 102 const bool rankHasLastRow = rank.row() == dist_a.template rankGlobalTile<Coord::Row>(idx_end - 1);
+ 103 const bool rankHasLastCol = rank.col() == dist_a.template rankGlobalTile<Coord::Col>(idx_end - 1);
- 105 const bool isKPartial = k == idx_end - 1 && isEndRangePartial;
- 106 const SizeType kSize = isKPartial ? partialSize : mb;
-
- 108 panelA.setWidth(kSize);
- 109 panelB.setHeight(kSize);
-
+
+ 106 const SizeType i_beg = dist_a.template nextLocalTileFromGlobalTile<Coord::Row>(idx_begin);
+ 107 const SizeType i_end = dist_a.template nextLocalTileFromGlobalTile<Coord::Row>(idx_end);
+
+ 109 const SizeType j_beg = dist_a.template nextLocalTileFromGlobalTile<Coord::Col>(idx_begin);
+ 110 const SizeType j_end = dist_a.template nextLocalTileFromGlobalTile<Coord::Col>(idx_end);
- 112 const auto rank_k = dist_a.rankGlobalTile({k, k});
-
-
- 115 if (rank_k.col() == rank.col()) {
- 116 const auto k_local = dist_a.template localTileFromGlobalTile<Coord::Col>(k);
- 117 for (SizeType i = i_beg; i < i_end; ++i) {
- 118 const LocalTileIndex ik(i, k_local);
- 119 const bool isRowPartial = (i == i_end - 1 && isEndRangePartial && rankHasLastRow);
- 120 const SizeType nrows = isRowPartial ? partialSize : mb;
- 121 panelA.setTile(ik, (isRowPartial || isKPartial)
- 122 ?
splitTile(mat_a.read(ik), {{0, 0}, {nrows, kSize}})
-
-
-
-
- 127 if (rank_k.row() == rank.row()) {
- 128 const auto k_local = dist_a.template localTileFromGlobalTile<Coord::Row>(k);
- 129 for (SizeType j = j_beg; j < j_end; ++j) {
- 130 const LocalTileIndex kj(k_local, j);
- 131 const bool isColPartial = (j == j_end - 1 && isEndRangePartial && rankHasLastCol);
- 132 const SizeType ncols = isColPartial ? partialSize : mb;
- 133 panelB.setTile(kj, (isKPartial || isColPartial)
- 134 ?
splitTile(mat_b.read(kj), {{0, 0}, {kSize, ncols}})
-
-
-
+ 112 const SizeType mb = dist_a.blockSize().rows();
+ 113 const SizeType lastTileElement = std::min(idx_end * mb - 1, dist_a.size().rows() - 1);
+ 114 const SizeType nrefls = lastTileElement - idx_begin * mb + 1;
+
+
+ 117 const bool isEndRangePartial = nrefls % mb != 0;
+ 118 const SizeType partialSize = (nrefls % mb);
+
+
+
+
+
+
+ 125 const GlobalTileIndex panel_offset(idx_begin, idx_begin);
+ 126 const matrix::Distribution dist_panel({lastTileElement + 1, lastTileElement + 1}, dist_a.blockSize(),
+ 127 dist_a.commGridSize(), dist_a.rankIndex(),
+ 128 dist_a.sourceRankIndex());
+
+ 130 constexpr std::size_t n_workspaces = 2;
+ 131 common::RoundRobin<matrix::Panel<Coord::Col, T, D>> panelsA(n_workspaces, dist_panel, panel_offset);
+ 132 common::RoundRobin<matrix::Panel<Coord::Row, T, D>> panelsB(n_workspaces, dist_panel, panel_offset);
+
+
+ 135 for (SizeType k = idx_begin; k < idx_end; ++k) {
+ 136 auto& panelA = panelsA.nextResource();
+ 137 auto& panelB = panelsB.nextResource();
-
- 140 broadcast(rank_k.col(), panelA, row_task_chain);
- 141 broadcast(rank_k.row(), panelB, col_task_chain);
-
-
-
-
- 146 for (SizeType i = i_beg; i < i_end; ++i) {
- 147 const bool isRowPartial = (i == i_end - 1 && isEndRangePartial && rankHasLastRow);
- 148 const SizeType nrows = isRowPartial ? partialSize : mb;
-
- 150 for (SizeType j = j_beg; j < j_end; ++j) {
- 151 const LocalTileIndex ij(i, j);
-
- 153 const bool isColPartial = (j == j_end - 1 && isEndRangePartial && rankHasLastCol);
- 154 const SizeType ncols = isColPartial ? partialSize : mb;
-
-
- 157 dlaf::internal::whenAllLift(blas::Op::NoTrans, blas::Op::NoTrans, alpha, panelA.read(ij),
- 158 panelB.read(ij), k == idx_begin ? beta : T(1),
- 159 (isRowPartial || isColPartial)
- 160 ?
splitTile(mat_c.readwrite(ij), {{0, 0}, {nrows, ncols}})
- 161 : mat_c.readwrite(ij)) |
-
-
-
-
-
-
-
-
-
-
+ 139 const bool isKPartial = k == idx_end - 1 && isEndRangePartial;
+ 140 const SizeType kSize = isKPartial ? partialSize : mb;
+
+ 142 panelA.setWidth(kSize);
+ 143 panelB.setHeight(kSize);
+
+
+ 146 const auto rank_k = dist_a.rankGlobalTile({k, k});
+
+
+ 149 if (rank_k.col() == rank.col()) {
+ 150 const auto k_local = dist_a.template localTileFromGlobalTile<Coord::Col>(k);
+ 151 for (SizeType i = i_beg; i < i_end; ++i) {
+ 152 const LocalTileIndex ik(i, k_local);
+ 153 const bool isRowPartial = (i == i_end - 1 && isEndRangePartial && rankHasLastRow);
+ 154 const SizeType nrows = isRowPartial ? partialSize : mb;
+ 155 panelA.setTile(ik, (isRowPartial || isKPartial)
+ 156 ?
splitTile(mat_a.read(ik), {{0, 0}, {nrows, kSize}})
+
+
+
+
+ 161 if (rank_k.row() == rank.row()) {
+ 162 const auto k_local = dist_a.template localTileFromGlobalTile<Coord::Row>(k);
+ 163 for (SizeType j = j_beg; j < j_end; ++j) {
+ 164 const LocalTileIndex kj(k_local, j);
+ 165 const bool isColPartial = (j == j_end - 1 && isEndRangePartial && rankHasLastCol);
+ 166 const SizeType ncols = isColPartial ? partialSize : mb;
+ 167 panelB.setTile(kj, (isKPartial || isColPartial)
+ 168 ?
splitTile(mat_b.read(kj), {{0, 0}, {kSize, ncols}})
+
+
+
+
+
+ 174 broadcast(rank_k.col(), panelA, row_task_chain);
+ 175 broadcast(rank_k.row(), panelB, col_task_chain);
+
+
+
+
+ 180 for (SizeType i = i_beg; i < i_end; ++i) {
+ 181 const bool isRowPartial = (i == i_end - 1 && isEndRangePartial && rankHasLastRow);
+ 182 const SizeType nrows = isRowPartial ? partialSize : mb;
+
+ 184 for (SizeType j = j_beg; j < j_end; ++j) {
+ 185 const LocalTileIndex ij(i, j);
+
+ 187 const bool isColPartial = (j == j_end - 1 && isEndRangePartial && rankHasLastCol);
+ 188 const SizeType ncols = isColPartial ? partialSize : mb;
+
+
+ 191 dlaf::internal::whenAllLift(blas::Op::NoTrans, blas::Op::NoTrans, alpha, panelA.read(ij),
+ 192 panelB.read(ij), k == idx_begin ? beta : T(1),
+ 193 (isRowPartial || isColPartial)
+ 194 ?
splitTile(mat_c.readwrite(ij), {{0, 0}, {nrows, ncols}})
+ 195 : mat_c.readwrite(ij)) |
+
+
+
+
+
+
+
+
+
+
void broadcast(comm::IndexT_MPI rank_root, matrix::Panel< axis, T, D, storage > &panel, common::Pipeline< comm::Communicator > &serial_comm)
Definition: broadcast_panel.h:58
@@ -249,9 +283,13 @@
ReadOnlyTileSender< T, D > splitTile(ReadOnlyTileSender< T, D > tile, const SubTileSpec &spec)
Definition: tile.h:507
+
+
+
+