From c306bae149b9c8aafbe4f565d2095edd5a25d9de Mon Sep 17 00:00:00 2001 From: "Peter Doak (epd)" Date: Fri, 13 Sep 2024 17:10:07 -0400 Subject: [PATCH 01/11] adding some data classes ported from DCA, fixing up the includes --- CMake/mrpapp_cuda.cmake | 19 +- CMake/mrpapp_hip.cmake | 13 +- CMakeLists.txt | 17 +- gaps3D.h | 4 +- linalg/linalg.hpp | 63 ++ linalg/matrix.hpp | 619 +++++++++++++++ linalg/matrixop.hpp | 1367 ++++++++++++++++++++++++++++++++++ linalg/reshapable_matrix.hpp | 453 +++++++++++ linalg/vector.hpp | 435 +++++++++++ pairing.h | 296 ++++---- platform/dca_gpu.h | 37 + utilities.h | 456 +++++------- 12 files changed, 3333 insertions(+), 446 deletions(-) create mode 100644 linalg/linalg.hpp create mode 100644 linalg/matrix.hpp create mode 100644 linalg/matrixop.hpp create mode 100644 linalg/reshapable_matrix.hpp create mode 100644 linalg/vector.hpp create mode 100644 platform/dca_gpu.h diff --git a/CMake/mrpapp_cuda.cmake b/CMake/mrpapp_cuda.cmake index 4c4c4ffe6..bc19a0ddc 100644 --- a/CMake/mrpapp_cuda.cmake +++ b/CMake/mrpapp_cuda.cmake @@ -1,4 +1,4 @@ -# // Copyright (C) 2023 UT-Battelle, LLC +# // Copyright (C) 2024 UT-Battelle, LLC # // All rights reserved. # // # // See LICENSE for terms of usage. @@ -6,22 +6,33 @@ # Checks for CUDA and accordingly sets MRPAPP_HAVE_CUDA # In addition, set MRPAPP_GPU_LIBS. +message("checking CUDA environment") set(CMAKE_CUDA_ARCHITECTURES "70" CACHE STRING "Name of the real architecture to build for.") set(MRPAPP_HAVE_CUDA FALSE CACHE INTERNAL "") set(MRPAPP_GPU_LIBS "" CACHE INTERNAL "") -# Find CUDA. +include(mrpapp_defines) include(CheckLanguage) +if(NOT CMAKE_CUDA_FLAGS MATCHES "allow-unsupported-compiler") + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --allow-unsupported-compiler") +endif() + +set(CMAKE_CUDA_HOST_COMPILER + ${CMAKE_CXX_COMPILER} + CACHE STRING "nvcc host compiler passed via -ccbin") + +# Find CUDA. 
find_package(CUDAToolkit REQUIRED) check_language(CUDA) if (CMAKE_CUDA_COMPILER) + message("Found CUDA compiler!") enable_language(CUDA) set(MRPAPP_HAVE_CUDA TRUE CACHE INTERNAL "") set(MRPAPP_HAVE_GPU TRUE CACHE INTERNAL "") - dca_add_haves_define(MRPAPP_HAVE_CUDA) - dca_add_haves_define(MRPAPP_HAVE_GPU) + mrpapp_add_define(MRPAPP_HAVE_CUDA) + mrpapp_add_define(MRPAPP_HAVE_GPU) list(APPEND MRPAPP_GPU_LIBS CUDA::cudart CUDA::cublas) set(MRPAPP_CUDA_PROPERTIES "CMAKE_CUDA_ARCHITECTURES 70") diff --git a/CMake/mrpapp_hip.cmake b/CMake/mrpapp_hip.cmake index 1d9c73d20..9b88faff7 100644 --- a/CMake/mrpapp_hip.cmake +++ b/CMake/mrpapp_hip.cmake @@ -1,6 +1,12 @@ ################################################################################ # Author: Peter Doak, doakpw@ornl.gov, Oak Ridge National Lab # +# // Copyright (C) 2024 UT-Battelle, LLC +# // All rights reserved. +# // +# // See LICENSE for terms of usage. +# // +# # Checks for HIP and and accordingly sets MRPAPP_HAVE_HIP set(ROCM_ROOT @@ -53,6 +59,7 @@ set(MRPAPP_HAVE_HIP FALSE CACHE INTERNAL "") set(MRPAPP_HAVE_MAGMA FALSE CACHE INTERNAL "") set(MRPAPP_GPU_LIBS "" CACHE INTERNAL "") +include(mrpapp_defines) include(CheckLanguage) check_language(HIP) if (CMAKE_HIP_COMPILER) @@ -61,9 +68,9 @@ if (CMAKE_HIP_COMPILER) set(MRPAPP_HAVE_HIP TRUE CACHE INTERNAL "") set(MRPAPP_HAVE_GPU TRUE CACHE INTERNAL "") # Probably probably these should be public properties of the hip targets - dca_add_haves_define(MRPAPP_HAVE_HIP) - dca_add_haves_define(MRPAPP_HAVE_GPU) - dca_add_haves_define(__HIP_PLATFORM_AMD__) + mrpapp_add_define(MRPAPP_HAVE_HIP) + mrpapp_add_define(MRPAPP_HAVE_GPU) + mrpapp_add_define(__HIP_PLATFORM_AMD__) list(APPEND MRPAPP_GPU_LIBS hip::host roc::hipblas) set(MRPAPP_HIP_PROPERTIES "CMAKE_HIP_ARCHITECTURES gfx906,gfx908") set(CMAKE_HIP_STANDARD 17) diff --git a/CMakeLists.txt b/CMakeLists.txt index aa2c27c0c..2511a9b2e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,3 +1,9 @@ +# // Copyright (C) 2024 
UT-Battelle, LLC +# // All rights reserved. +# // +# // See LICENSE for terms of usage. +# // + ###################################################################### # CMake version and policies ###################################################################### @@ -35,7 +41,7 @@ endif(USE_MPI) option(ENABLE_CUDA "Build with GPU support through CUDA" OFF) option(ENABLE_HIP "Build with GPU support through HIP" OFF) set(ENABLE_GPU "$,$>,ON,OFF>") - +message ("enable gpu: ${ENABLE_GPU}") set(MRPAPP_GPU_LIBS "" CACHE INTERNAL "") if(ENABLE_GPU) @@ -45,10 +51,9 @@ if(ENABLE_GPU) if(ENABLE_HIP) include(mrapp_hip) endif(ENABLE_HIP) - if(MRPAPP_HAVE_CUDA OR MRPAPP_HAVE_HIP) include(DetermineDeviceArchitectures) - message(STATUS "GPU device architectures: ${QMC_GPU_ARCHS}") + message(STATUS "GPU device architectures: ${MRPAPP_GPU_ARCHS}") endif() endif(ENABLE_GPU) @@ -61,6 +66,9 @@ set(MRPAPP_SRC set(MRPAPP_MODEL "1BAND" CACHE STRING "RPA model options") set_property(CACHE MRPAPP_MODEL PROPERTY STRINGS SRRUO SRRUO3D SRRUO3DSUH 1BAND 1BANDWSPIN BILAYER_FESC BILAYER_1BAND ORTHOIIBILAYER BSCCOBILAYER BILAYER_FESC BAFEAS KFE2SE2 FOURORBITAL TBFILE COUPLEDLADDERS NDNIO2 MODELFROMFILESO KAGOME 1BANDABWSPIN 1BANDALTERMAGNET 1BANDAB) +include(mrpapp_defines) +mrpapp_write_definitions_file() + add_executable(mrpapp ${MRPAPP_SRC}) target_include_directories(mrpapp PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}" ${PROJECT_SOURCE_DIR}/PartialPsimag @@ -74,4 +82,5 @@ set(mpi_targs MPI::MPI_C MPI::MPI_CXX) set(MPI_TARGETS "$<$:${mpi_targs}>") target_link_libraries(mrpapp ${MRPAPP_GPU_LIBS} LAPACK::LAPACK BLAS::BLAS ${MPI_TARGETS} ) -add_custom_target(genexdebug COMMAND ${CMAKE_COMMAND} -E echo "$<$:USE_MPI>") +add_custom_target(genexdebug COMMAND ${CMAKE_COMMAND} -E echo "$,$>,ON,OFF>") +#$<$:USE_MPI>") diff --git a/gaps3D.h b/gaps3D.h index e59d72183..642913fe0 100644 --- a/gaps3D.h +++ b/gaps3D.h @@ -98,9 +98,9 @@ class gap3D { // simple s+- for 5-orbital 1111 model 2D // description 
param.parity = 1.0; // even parity (d-wave) gap if (band == 0) { // bonding band - return param.Delta0 * (abs(cos(k[0])) - cos(k[1])); + return param.Delta0 * (std::abs(cos(k[0])) - cos(k[1])); } else { // antibonding band (shift kx by pi) - return param.Delta0 * (-abs(cos(k[0])) - cos(k[1])); + return param.Delta0 * (-std::abs(cos(k[0])) - cos(k[1])); } } else if (param.gAmpl == "SrRuO_helical" || param.gAmpl == "SrRuO_chiral" || diff --git a/linalg/linalg.hpp b/linalg/linalg.hpp new file mode 100644 index 000000000..0652e9cdd --- /dev/null +++ b/linalg/linalg.hpp @@ -0,0 +1,63 @@ +// based on linalg.hpp from DCA++ +// distributed under BSD-3-clause license +// Copyright (C) 2018 ETH Zurich +// Copyright (C) 2024 UT-Battelle, LLC +// All rights reserved. +// +// +// See LICENSE for terms of usage. +// See CITATION.md for citation guidelines, if MRPAPP is used for scientific publications. +// +// Author: Raffaele Solca' (rasolca@itp.phys.ethz.ch) +// +// This file includes all the header files in include/dca/linalg. +// TODO: This file is temporary and will be removed or updated. 
+ +#include "dca/linalg/vector.hpp" +#include "dca/linalg/matrix.hpp" +#include "dca/linalg/matrixop.hpp" + +#ifdef DCA_HAVE_GPU +#include "dca/platform/dca_gpu.h" +#include "dca/platform/dca_gpu_blas.h" +#include "dca/linalg/util/handle_functions.hpp" +#include "dca/linalg/util/info_gpu.hpp" +#include "dca/linalg/util/stream_container.hpp" +#include "dca/linalg/util/stream_functions.hpp" +#include "dca/linalg/util/util_gpublas.hpp" +#endif // DCA_HAVE_GPU + +// BLAS +#include "dca/linalg/blas/blas1.hpp" +#include "dca/linalg/blas/blas2.hpp" +#include "dca/linalg/blas/blas3.hpp" + +#include "dca/linalg/lapack/bennet_update.hpp" +#include "dca/linalg/lapack/inverse.hpp" +#include "dca/linalg/lapack/lapack.hpp" +#include "dca/linalg/lapack/silence_lapack.hpp" +#include "dca/linalg/lapack/solve.hpp" + +#ifdef DCA_HAVE_GPU +// CUBLAS +#include "dca/linalg/blas/cublas1.hpp" +#include "dca/linalg/blas/cublas3.hpp" +#include "dca/linalg/blas/cublas_conversion_char_types.hpp" +#include "dca/linalg/blas/kernels_gpu.hpp" + +#include "dca/linalg/lapack/laset_gpu.hpp" +#include "dca/linalg/lapack/magma.hpp" +#include "dca/linalg/lapack/multiply_diagonal_gpu.hpp" +#endif // DCA_HAVE_GPU + +// Device selector struct +#include "dca/linalg/device_type.hpp" +#include "dca/linalg/blas/use_device.hpp" +#include "dca/linalg/lapack/use_device.hpp" + +// Utils +#include "dca/linalg/util/allocators/allocators.hpp" +#include "dca/linalg/util/copy.hpp" +#include "dca/linalg/util/lapack_exception.hpp" +#include "dca/linalg/util/util_lapack.hpp" +#include "dca/linalg/util/util_matrixop.hpp" diff --git a/linalg/matrix.hpp b/linalg/matrix.hpp new file mode 100644 index 000000000..e55181138 --- /dev/null +++ b/linalg/matrix.hpp @@ -0,0 +1,619 @@ +// Copyright (C) 2023 ETH Zurich +// Copyright (C) 2023 UT-Battelle, LLC +// All rights reserved. +// +// See LICENSE for terms of usage. +// See CITATION.md for citation guidelines, if DCA++ is used for scientific publications. 
+// +// Author: Peter Staar (taa@zurich.ibm.com) +// Raffaele Solca' (rasolca@itp.phys.ethz.ch) +// Peter W. Doak (doakpw@ornl.gov +// +/// \file provides the Matrix object for different device types and allocators. + +#ifndef DCA_LINALG_MATRIX_HPP +#define DCA_LINALG_MATRIX_HPP + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "dca/linalg/vector.hpp" +#include "dca/linalg/util/allocators/allocators.hpp" +#include "dca/linalg/device_type.hpp" +#include "dca/linalg/util/copy.hpp" +#include "dca/linalg/util/memory.hpp" +#include "dca/linalg/util/stream_functions.hpp" +#include "dca/util/type_help.hpp" + +namespace dca { +namespace linalg { +// dca::linalg:: + +/** Matrix class for interfacing with Blas, Cublas, Rocblas + * its row major i.e, row is fast. + */ +template > +class Matrix : public ALLOC { +public: + using ThisType = Matrix; + using ValueType = ScalarType; + using Allocator = ALLOC; + constexpr static DeviceType device = device_name; + + Matrix(const std::string& name = default_name_); + + Matrix(int size); + Matrix(const std::string& name, int size); + + // Preconditions: capacity >= size. + Matrix(int size, int capacity); + Matrix(const std::string& name, int size, int capacity); + + Matrix(std::pair size); + Matrix(const std::string& name, std::pair size); + + // Preconditions: capacity.first >= size.first, capacity.second >= size.second. + Matrix(std::pair size, std::pair capacity); + Matrix(const std::string& name, std::pair size, std::pair capacity); + + // Copy and move constructor: + // Constructs a matrix with name name, size rhs.size() and a copy of the elements of rhs. + Matrix(const Matrix& rhs, const std::string& name = default_name_); + // Constructs a matrix with name name, size rhs.size(). The elements of rhs are moved. + // Postcondition: rhs is a (0 x 0) matrix. 
+ Matrix(Matrix&& rhs, const std::string& = default_name_); + + // Contructs a matrix with name name, size rhs.size() and a copy of the elements of rhs, where rhs + // elements are stored on a different device. + template + Matrix(const Matrix& rhs, const std::string& = default_name_); + + // Contructs a matrix with name name, size rhs.size() and a copy of the elements of rhs, where rhs + // elements are stored on a different device. + template + Matrix(const Matrix& rhs, const std::string& = default_name_); + + ~Matrix(); + + // Assignment operators: + // Resizes the matrix to rhs.size() and copy the elements of rhs. + // Postcondition: The name of the matrix is unchanged. + Matrix& operator=(const Matrix& rhs); + // Resizes the matrix to rhs.size() and move the elements of rhs. + // Postcondition: The name of the matrix is unchanged; rhs is a (0 x 0) matrix. + Matrix& operator=(Matrix&& rhs); + + // Resizes the matrix to rhs.size() and copy the elements, stored on a different device, of rhs. + // Postcondition: The name of the matrix is unchanged. + template + Matrix& operator=(const Matrix& rhs); + + template + Matrix& operator=(const Matrix& rhs); + + // Returns true if this is equal to other, false otherwise. + // Two matrices are equal, if they have the same size and contain the same elements. Name and + // capacity are ignored. + // Special case: two matrices without elements are equal. + bool operator==(const Matrix& other) const; + + // Returns true if this is not equal to other, false otherwise. + // See description of operator== for the definition of equality. + bool operator!=(const Matrix& other) const; + + // Returns the (i,j)-th element of the matrix. + // Preconditions: 0 <= i < size().first, 0 <= j < size().second. + // This method is available only if device_name == CPU. 
+ template > + ScalarType& operator()(int i, int j) { + assert(i >= 0 && i < size_.first); + assert(j >= 0 && j < size_.second); + return data_[i + j * leadingDimension()]; + } + template > + const ScalarType& operator()(int i, int j) const { + assert(i >= 0 && i < size_.first); + assert(j >= 0 && j < size_.second); + return data_[i + j * leadingDimension()]; + } + + const std::string& get_name() const { + return name_; + } + void set_name(const std::string& new_name) { + name_ = new_name; + } + + // Returns the pointer to the (0,0)-th element. + ValueType* ptr() { + return data_; + } + const ValueType* ptr() const { + return data_; + } + + // Returns the pointer to the (i,j)-th element i < size().first and 0 < j < size().second, or + // a pointer past the end of the range if i == size().first or j == size().second. + // Preconditions: 0 <= i <= size().first, 0 <= j <= size().second. + ValueType* ptr(int i, int j) { + assert(i >= 0 && i <= size_.first); + assert(j >= 0 && j <= size_.second); + return data_ + i + j * leadingDimension(); + } + const ValueType* ptr(int i, int j) const { + assert(i >= 0 && i <= size_.first); + assert(j >= 0 && j <= size_.second); + return data_ + i + j * leadingDimension(); + } + + bool is_square() const { + return (size_.first == size_.second); + } + + const std::pair size() const { + return size_; + } + const std::pair& capacity() const { + return capacity_; + } + int nrRows() const { + return size_.first; + } + int nrCols() const { + return size_.second; + } + + int getActualSize() { + return nrElements(capacity_); + } + + int leadingDimension() const { + return capacity_.first; + } + + // Resizes *this to a (new_size * new_size) matrix. + // Elements added may have any value. + // Remark: The capacity of the matrix and element pointers do not change + // if new_size <= capacity().first and new_size <= capacity().second. 
+ void resize(int new_size) { + resize(std::make_pair(new_size, new_size)); + } + // Resizes *this to a (new_size.first * new_size.second) matrix. + // Elements added may have any value. + // Remark: The capacity of the matrix and element pointers do not change + // if new_size.first <= capacity().first and new_size.second <= capacity().second. + void resize(std::pair new_size); + + // Resizes *this to a (new_size * new_size) matrix. + // The previous elements are not copied, therefore all the elements + // may have any value after the call to this method. + // Remark: The capacity of the matrix and element pointers do not change + // if new_size <= capacity().first and new_size <= capacity().second. + void resizeNoCopy(int new_size) { + resizeNoCopy(std::make_pair(new_size, new_size)); + } + // Resizes *this to a (new_size.first * new_size.second) matrix. + // The previous elements are not copied, therefore all the elements + // may have any value after the call to this method. + // Remark: The capacity of the matrix and element pointers do not change + // if new_size.first <= capacity().first and new_size.second <= capacity().second. + void resizeNoCopy(std::pair new_size); + + // Releases the memory allocated by *this and sets size and capacity to zero. + void clear(); + + // Swaps the contents of the matrix except the name with those of rhs. + void swap(Matrix& rhs); + // Swaps the contents of the matrix, included the name, with those of rhs. + void swapWithName(Matrix& rhs); + + // Asynchronous assignment (copy with stream = getStream(thread_id, stream_id)) + // + synchronization of stream + template + void set(const Matrix& rhs, int thread_id, int stream_id); + + template + void set(const Matrix& rhs, const util::GpuStream& stream); + + // Asynchronous assignment. 
+ template + void setAsync(const Matrix& rhs, const util::GpuStream& stream); + + // Asynchronous assignment (copy with stream = getStream(thread_id, stream_id)) + template + void setAsync(const Matrix& rhs, int thread_id, int stream_id); + + void setToZero(const util::GpuStream& stream); + + // Prints the values of the matrix elements. + void print() const; + // Prints the properties of *this. + void printFingerprint() const; + // Returns the allocated device memory in bytes. + std::size_t deviceFingerprint() const; + + std::string toStr() const; +private: + static std::pair capacityMultipleOfBlockSize(std::pair size); + inline static size_t nrElements(std::pair size) { + return static_cast(size.first) * static_cast(size.second); + } + static constexpr int block_size_ = 32; + static const std::string default_name_; + + std::string name_; + + std::pair size_; + std::pair capacity_; + + ValueType* data_ = nullptr; + + template + friend class dca::linalg::Matrix; +}; + +template +const std::string Matrix::default_name_ = "no-name"; + +template +Matrix::Matrix(const std::string& name) : Matrix(name, 0) {} + +template +Matrix::Matrix(int size) : Matrix(std::make_pair(size, size)) {} + +template +Matrix::Matrix(const std::string& name, int size) + : Matrix(name, std::make_pair(size, size)) {} + +template +Matrix::Matrix(int size, int capacity) + : Matrix(std::make_pair(size, size), std::make_pair(capacity, capacity)) {} + +template +Matrix::Matrix(const std::string& name, int size, int capacity) + : Matrix(name, std::make_pair(size, size), std::make_pair(capacity, capacity)) {} + +template +Matrix::Matrix(std::pair size) : Matrix(size, size) {} + +template +Matrix::Matrix(const std::string& name, std::pair size) + : Matrix(name, size, size) {} + +template +Matrix::Matrix(std::pair size, std::pair capacity) + : Matrix(default_name_, size, capacity) {} + +template +template +Matrix::Matrix(const Matrix& rhs, + const std::string& name) + : name_(name), size_(rhs.size_), 
capacity_(rhs.capacity_) { + data_ = Allocator::allocate(nrElements(capacity_)); + util::memoryCopy(data_, leadingDimension(), rhs.data_, rhs.leadingDimension(), size_); +} + +template +template +Matrix::Matrix(const Matrix& rhs, + const std::string& name) + : name_(name), size_(rhs.size_), capacity_(rhs.capacity_) { + if (sizeof(ScalarType) != sizeof(ScalarRhs)) + throw std::runtime_error("conversion of both type and location of Matrix not currently possible!"); + data_ = ALLOC::allocate(nrElements(capacity_)); + util::memoryCopy(data_, leadingDimension(), rhs.data_, rhs.leadingDimension(), size_); +} + +template +Matrix::Matrix(const std::string& name, std::pair size, + std::pair capacity) + : name_(name), size_(size), capacity_(capacityMultipleOfBlockSize(capacity)) { + assert(size_.first >= 0 && size_.second >= 0); + assert(capacity.first >= 0 && capacity.second >= 0); + assert(capacity.first >= size_.first && capacity.second >= size_.second); + assert(capacity_.first >= capacity.first && capacity_.second >= capacity.second); + + data_ = ALLOC::allocate(nrElements(capacity_)); + util::Memory::setToZero(data_, nrElements(capacity_)); +} + +template +Matrix::Matrix(const Matrix& rhs, + const std::string& name) + : name_(name) { + *this = rhs; +} + +template +Matrix::Matrix(Matrix&& rhs, const std::string& name) + : name_(name), size_(rhs.size_), capacity_(rhs.capacity_), data_(rhs.data_) { + rhs.capacity_ = std::make_pair(0, 0); + rhs.size_ = std::make_pair(0, 0); + rhs.data_ = nullptr; +} + +template +Matrix::~Matrix() { + Allocator::deallocate(data_); +} + +template +void Matrix::resize(std::pair new_size) { + if (new_size.first == 0 || new_size.second ==0) { + size_ = new_size; + return; + } else if (new_size.first > capacity_.first || new_size.second > capacity_.second) { + std::pair new_capacity = capacityMultipleOfBlockSize(new_size); + + ValueType* new_data = nullptr; + new_data = Allocator::allocate(nrElements(new_capacity)); + // hip memorycpy2D routines 
don't tolerate leadingDimension = 0 + const std::pair copy_size(std::min(new_size.first, size_.first), + std::min(new_size.second, size_.second)); + util::memoryCopy(new_data, new_capacity.first, data_, leadingDimension(), copy_size); + Allocator::deallocate(data_); + data_ = new_data; + capacity_ = new_capacity; + size_ = new_size; + } + else { + size_ = new_size; + } +} + +template +Matrix& Matrix::operator=( + const Matrix& rhs) { + resizeNoCopy(rhs.size_); + if (device_name == CPU) + util::memoryCopyCpu(data_, leadingDimension(), rhs.data_, rhs.leadingDimension(), size_); + else + util::memoryCopy(data_, leadingDimension(), rhs.data_, rhs.leadingDimension(), size_); + return *this; +} + +template +Matrix& Matrix::operator=( + Matrix&& rhs) { + swap(rhs); + return *this; +} + +template +template +Matrix& Matrix::operator=( + const Matrix& rhs) { + resizeNoCopy(rhs.size_); + util::memoryCopy(data_, leadingDimension(), rhs.data_, rhs.leadingDimension(), size_); + return *this; +} + +#ifdef DCA_HAVE_GPU +template +template +Matrix& Matrix::operator=( + const Matrix& rhs) { + static_assert(sizeof(ScalarType) == sizeof(ScalarRhs), + "sizeof ScalarType and ScalarRhs are not equal"); + resizeNoCopy(rhs.size_); + util::memoryCopy(data_, leadingDimension(), rhs.data_, rhs.leadingDimension(), size_); + return *this; +} + +#endif + +template +bool Matrix::operator==(const Matrix& other) const { + if (device_name == GPU) + return Matrix(*this) == Matrix(other); + + if (size() != other.size()) + return nrRows() * nrCols() == 0 and other.nrRows() * other.nrCols() == 0; + + for (int j = 0; j < nrCols(); ++j) + for (int i = 0; i < nrRows(); ++i) + if ((*this)(i, j) != other(i, j)) + return false; + + return true; +} + +template +bool Matrix::operator!=(const Matrix& other) const { + return not(*this == other); +} + +template +void Matrix::resizeNoCopy(std::pair new_size) { + if (new_size.first > capacity_.first || new_size.second > capacity_.second) { + size_ = new_size; + 
capacity_ = capacityMultipleOfBlockSize(new_size); + + Allocator::deallocate(data_); + data_ = Allocator::allocate(nrElements(capacity_)); + } + else { + size_ = new_size; + } +} + +template +void Matrix::clear() { + Allocator::deallocate(data_); + size_ = capacity_ = std::make_pair(0, 0); +} + +template +void Matrix::swap(Matrix& rhs) { + std::swap(size_, rhs.size_); + std::swap(capacity_, rhs.capacity_); + std::swap(data_, rhs.data_); +} + +template +void Matrix::swapWithName(Matrix& rhs) { + std::swap(name_, rhs.name_); + swap(rhs); +} + +template +template +void Matrix::set(const Matrix& rhs, + int thread_id, int stream_id) { + resize(rhs.size_); + // This specialization is required since without unified memory CUDA doesn't known which memory locality the pointer has. + if constexpr (device_name == DeviceType::GPU && rhs_device_name == DeviceType::CPU) + util::memoryCopyH2D(data_, leadingDimension(), rhs.data_, rhs.leadingDimension(), size_, + thread_id, stream_id); + else if constexpr (device_name == DeviceType::CPU && rhs_device_name == DeviceType::GPU) + util::memoryCopyD2H(data_, leadingDimension(), rhs.data_, rhs.leadingDimension(), size_, + thread_id, stream_id); + else if constexpr (device_name == DeviceType::CPU && rhs_device_name == DeviceType::CPU) + util::memoryCopyCpu(data_, leadingDimension(), rhs.data_, rhs.leadingDimension(), size_, + thread_id, stream_id); +} + +template +template +void Matrix::set(const Matrix& rhs, + const util::GpuStream& stream [[maybe_unused]]) { + resize(rhs.size_); + if constexpr (device_name == DeviceType::GPU && rhs_device_name == DeviceType::CPU) + util::memoryCopyH2D(data_, leadingDimension(), rhs.data_, rhs.leadingDimension(), size_); + else if constexpr (device_name == DeviceType::CPU && rhs_device_name == DeviceType::GPU) + util::memoryCopyD2H(data_, leadingDimension(), rhs.data_, rhs.leadingDimension(), size_); +} + +template +template +void Matrix::setAsync(const Matrix& rhs, + const util::GpuStream& stream) { + 
resizeNoCopy(rhs.size_); + util::memoryCopyAsync(data_, leadingDimension(), rhs.data_, rhs.leadingDimension(), size_, stream); +} + +template +template +void Matrix::setAsync(const Matrix& rhs, + const int thread_id, const int stream_id) { + setAsync(rhs, util::getStream(thread_id, stream_id)); +} + +template +void Matrix::setToZero(const util::GpuStream& stream) { + util::Memory::setToZeroAsync(data_, leadingDimension() * nrCols(), stream); +} + +template +void Matrix::print() const { + if (device_name == GPU) + return Matrix(*this).print(); + + printFingerprint(); + + std::stringstream ss; + ss.precision(6); + ss << std::scientific; + + ss << "\n"; + for (int i = 0; i < nrRows(); ++i) { + for (int j = 0; j < nrCols(); ++j) + ss << "\t" << operator()(i, j); + ss << "\n"; + } + + std::cout << ss.str() << std::endl; +} + +template +std::string Matrix::toStr() const { + if (device_name == GPU) + return Matrix(*this).toStr(); + + std::stringstream ss; + ss.precision(16); + ss << std::scientific; + + ss << "\n"; + for (int i = 0; i < nrRows(); ++i) { + for (int j = 0; j < nrCols(); ++j) + ss << "\t" << operator()(i, j); + ss << "\n"; + } + + return ss.str(); +} + +template +void Matrix::printFingerprint() const { + std::stringstream ss; + + ss << "\n"; + ss << " name: " << name_ << "\n"; + ss << " size: " << size_.first << ", " << size_.second << "\n"; + ss << " capacity: " << capacity_.first << ", " << capacity_.second << "\n"; + ss << " memory-size: " << nrElements(capacity_) * sizeof(ScalarType) * 1.e-6 << "(Mbytes)\n"; + + std::cout << ss.str() << std::endl; +} + +template +std::pair Matrix::capacityMultipleOfBlockSize( + std::pair size) { + assert(size.first >= 0); + assert(size.second >= 0); + + auto get_new_size = [=](const int size) { + return size <= 16 ? 
size : (size + block_size_ - 1) / block_size_ * block_size_; + }; + + size.first = get_new_size(size.first); + size.second = get_new_size(size.second); + + return size; +} + +template +std::size_t Matrix::deviceFingerprint() const { + if (device_name == GPU) + return capacity_.first * capacity_.second * sizeof(ScalarType); + else + return 0; +} + +/// Factory function for diangonal matrices, type is inferred from the type of Vector. +template +auto makeDiagonalMatrix(Vector& diag) { + int dsize = diag.size(); + Matrix matrix("diag_matrix", dsize); + for (int i = 0; i < dsize; ++i) { + matrix(i, i) = diag[i]; + } + return matrix; +} + +/// Factory function for diangonal matrices, type is inferred from the type of Vector. +template +auto makeDiagonalMatrixInv(Vector& diag) { + int dsize = diag.size(); + Matrix matrix("diag_matrix", dsize); + // insure that if ScalarType is complex the 1 is as well. + // then std::complex will give us a proper complex multiplicative inverse + ScalarType the_one{}; + the_one += 1.0; + for (int i = 0; i < dsize; ++i) { + matrix(i, i) = the_one / diag[i]; + } + return matrix; +} + +} // namespace linalg +} // namespace dca + +#endif // DCA_LINALG_MATRIX_HPP diff --git a/linalg/matrixop.hpp b/linalg/matrixop.hpp new file mode 100644 index 000000000..87c650d25 --- /dev/null +++ b/linalg/matrixop.hpp @@ -0,0 +1,1367 @@ +// Copyright (C) 2023 ETH Zurich +// Copyright (C) 2023 UT-Battelle, LLC +// All rights reserved. +// +// See LICENSE for terms of usage. +// See CITATION.md for citation guidelines, if DCA++ is used for scientific publications. +// +// Author: Peter Staar (taa@zurich.ibm.com) +// Raffaele Solca' (rasolca@itp.phys.ethz.ch) +// Giovanni Balduzzi (gbalduzz@itp.phys.ethz.ch) +// Peter W. 
Doak (doakpw@ornl.gov) +// +/** \file provides the matrix interface for the following matrix operations: + * - copyCol, copyRow, copyCols, copyRows + * - difference + * - real + * - insertCol, insertRow (for CPU matrices only) + * - inverse + * - inverseAndDeterminant + * - removeCol, removeCols, removeRow, removeRows, removeRowAndCol, removeRowAndCols + * - scaleCol, scaleRow, scaleRows + * - swapCol, swapRow, swapRowAndCol + * - swapCols, swapRows (for GPU matrices only) + * - gemm + * - multiply + * - trsm + * - determinant + * - logDeterminant + * - eigensolver (non-symmetric / symmetric / Hermitian) + * - pseudoInverse + * + * CPU Matrix has choice of allocator, although currently all must match + * GPU or mixed CPU GPU must use default allocator. + */ + +#ifndef DCA_LINALG_MATRIXOP_HPP +#define DCA_LINALG_MATRIXOP_HPP + +#include +#include +#include + +#include "dca/linalg/blas/use_device.hpp" +#include "dca/linalg/lapack/use_device.hpp" +#include "dca/linalg/matrix.hpp" +#include "dca/linalg/util/util_lapack.hpp" +#include "dca/linalg/util/util_matrixop.hpp" +#include "dca/linalg/vector.hpp" +#include "dca/math/util/phase.hpp" + +#ifdef DCA_HAVE_GPU +#include "dca/linalg/blas/kernels_gpu.hpp" +#endif + +namespace dca { +namespace linalg { +namespace matrixop { +// dca::linalg::matrixop:: + +// Copies the matrix mat in a. +// Preconditions: lda >= mat.nrRows(). +template +inline void copyMatrixToArray(const Matrix& mat, Scalar* a, int lda) { + assert(lda >= mat.nrRows()); + lapack::lacpy("A", mat.nrRows(), mat.nrCols(), mat.ptr(), mat.leadingDimension(), a, lda); +} + +// Copies the m by n matrix stored in a to the matrix mat. +// Preconditions: lda >= m. 
+template +inline void copyArrayToMatrix(int m, int n, const Scalar* a, int lda, + Matrix& mat) { + assert(lda >= m); + mat.resizeNoCopy(std::make_pair(m, n)); + lapack::lacpy("A", mat.nrRows(), mat.nrCols(), a, lda, mat.ptr(), mat.leadingDimension()); +} + +// Copies the jx-th column of mat_x into the jy-th column of mat_y. +// In/Out: mat_y +// Preconditions: mat_x.nrRows() == mat_y.nrRows(), +// 0 <= jx < mat_x.nrCols(), 0 <= jy < mat_y.nrCols(). +template +inline void copyCol(const Matrix& mat_x, int jx, + Matrix& mat_y, int jy, int thread_id = 0, int stream_id = 0) { + assert(jx >= 0 && jx < mat_x.nrCols()); + assert(jy >= 0 && jy < mat_y.nrCols()); + assert(mat_x.nrRows() == mat_y.nrRows()); + + blas::UseDevice::copy(mat_x.nrRows(), mat_x.ptr(0, jx), 1, mat_y.ptr(0, jy), 1, + thread_id, stream_id); +} + +// Copies the j_x[i]-th column of mat_x into the j_y[i]-th column of mat_y, for 0 <= i < j_x.size(). +// In/Out: mat_y +// Preconditions: j_x.size() <= j_y.size(), mat_x.nrRows() == mat_y.nrRows() +// 0 <= j_x[i] < mat_x.nrCols() for 0 <= i < j_x.size(), +// 0 <= j_y[i] < mat_y.nrCols() for 0 <= i < j_x.size(). +template +inline void copyCols(const Matrix& mat_x, const Vec& j_x, + Matrix& mat_y, const Vec& j_y, int /*thread_id*/ = 0, + int /*stream_id*/ = 0) { + assert(j_x.size() <= j_y.size()); + + for (int ind_j = 0; ind_j < j_x.size(); ++ind_j) + copyCol(mat_x, j_x[ind_j], mat_y, j_y[ind_j]); +} +#ifdef DCA_HAVE_GPU +template +inline void copyCols(const Matrix& mat_x, const Vector& j_x, + Matrix& mat_y, const Vector& j_y, int thread_id = 0, + int stream_id = 0) { + assert(j_x.size() <= j_y.size()); + assert(mat_x.nrRows() == mat_y.nrRows()); + + blas::copyCols(mat_x.nrRows(), j_x.size(), j_x.ptr(), mat_x.ptr(), mat_x.leadingDimension(), + j_y.ptr(), mat_y.ptr(), mat_y.leadingDimension(), thread_id, stream_id); + checkErrorsCudaDebug(); +} + +// Copies the j_x columns of mat_x into the mat_y, for 0 <= i < j_x.size(). 
+// In/Out: mat_y +// Preconditions: mat_x.nrRows() == mat_y.nrRows() +// 0 <= j_x[i] < mat_x.nrCols() for 0 <= i < j_x.size(), +template +inline void copyCols(const Matrix& mat_x, const Vector& j_x, + Matrix& mat_y, int thread_id = 0, int stream_id = 0) { + assert(mat_x.nrRows() == mat_y.nrRows()); + + blas::copyCols(mat_x.nrRows(), j_x.size(), j_x.ptr(), mat_x.ptr(), mat_x.leadingDimension(), + mat_y.ptr(), mat_y.leadingDimension(), thread_id, stream_id); + checkErrorsCudaDebug(); +} +#endif // DCA_HAVE_GPU + +// Copies the ix-th row of mat_x into the iy-th row of mat_y. +// In/Out: mat_y +// Preconditions: mat_x.nrCols() == mat_y.nrCols(), +// 0 <= ix < mat_x.nrRows(), 0 <= iy < mat_y.nrRows(). +template +inline void copyRow(const Matrix& mat_x, int ix, + Matrix& mat_y, int iy, int thread_id = 0, int stream_id = 0) { + assert(ix >= 0 && ix < mat_x.nrRows()); + assert(iy >= 0 && iy < mat_y.nrRows()); + assert(mat_x.nrCols() == mat_y.nrCols()); + + blas::UseDevice::copy(mat_x.nrCols(), mat_x.ptr(ix, 0), mat_x.leadingDimension(), + mat_y.ptr(iy, 0), mat_y.leadingDimension(), thread_id, + stream_id); +} + +// Copies the i_x[i]-th row of mat_x into the i_y[i]-th row of mat_y, for 0 <= i < i_x.size(). +// In/Out: mat_y +// Preconditions: i_x.size() <= i_y.size(), mat_x.nrCols() == mat_y.nrCols() +// 0 <= i_x[i] < mat_x.nrRows() for 0 <= i < i_x.size(), +// 0 <= i_y[i] < mat_y.nrRows() for 0 <= i < i_x.size(). 
+template +inline void copyRows(const Matrix& mat_x, const Vec& i_x, + Matrix& mat_y, const Vec& i_y, const int /*thread_id*/ = 0, + const int /*stream_id*/ = 0) { + assert(i_x.size() <= i_y.size()); + assert(mat_x.nrCols() == mat_y.nrCols()); + + for (int j = 0; j < mat_x.nrCols(); ++j) + for (int ind_i = 0; ind_i < i_x.size(); ++ind_i) + mat_y(i_y[ind_i], j) = mat_x(i_x[ind_i], j); +} +#ifdef DCA_HAVE_GPU +template +inline void copyRows(const Matrix& mat_x, const Vector& i_x, + Matrix& mat_y, const Vector& i_y, const int thread_id, + const int stream_id) { + assert(i_x.size() <= i_y.size()); + assert(mat_x.nrCols() == mat_y.nrCols()); + + blas::copyRows(mat_x.nrCols(), i_x.size(), i_x.ptr(), mat_x.ptr(), mat_x.leadingDimension(), + i_y.ptr(), mat_y.ptr(), mat_y.leadingDimension(), thread_id, stream_id); + checkErrorsCudaDebug(); +} + +// Copies the i_x rows of mat_x into mat_y, for 0 <= i < i_x.size(). +// In/Out: mat_y +// Preconditions: mat_x.nrCols() == mat_y.nrCols() +// 0 <= i_x[i] < mat_x.nrRows() for 0 <= i < i_x.size(). +template +inline void copyRows(const Matrix& mat_x, const Vector& i_x, + Matrix& mat_y, const int thread_id, const int stream_id) { + assert(mat_x.nrCols() == mat_y.nrCols()); + + blas::copyRows(mat_x.nrCols(), i_x.size(), i_x.ptr(), mat_x.ptr(), mat_x.leadingDimension(), + mat_y.ptr(), mat_y.leadingDimension(), thread_id, stream_id); + checkErrorsCudaDebug(); +} +#endif // DCA_HAVE_GPU + +// Returns the difference of two matrices in terms of max_i,j(|a(i, j) - b(i, j)|). +// If the difference is larger than the threshold a std::logic_error exception is thrown, +// and if NDEBUG is not defined each difference which exceeds the threshold is printed. +// Preconditions: a.size() == b.size(). 
+template +auto difference(const Matrix& a, const Matrix& b, + double diff_threshold = 1e-3) { + auto max_diff = std::abs(Scalar(0)); + assert(a.size() == b.size()); + + for (int j = 0; j < a.nrCols(); ++j) { + for (int i = 0; i < a.nrRows(); ++i) { + max_diff = std::max(max_diff, std::abs(a(i, j) - b(i, j))); + } + } + + if (max_diff > diff_threshold) { +#ifndef NDEBUG + std::stringstream s; + for (int i = 0; i < a.nrRows(); ++i) { + for (int j = 0; j < a.nrCols(); ++j) { + if (std::abs(a(i, j) - b(i, j)) <= diff_threshold) + s << 0. << "\t"; + else + s << a(i, j) - b(i, j) << "\t"; + } + s << "\n"; + } + s << std::endl; + std::cout << s.str(); +#endif // NDEBUG + std::cerr << "matrix difference in excess of threshold!\n"; + throw std::logic_error(__FUNCTION__); + } + + return max_diff; +} +template +auto difference(const Matrix& a, const Matrix& b, + double diff_threshold = 1e-3) { + Matrix cp_a(a); + return difference(cp_a, b, diff_threshold); +} +template +auto difference(const Matrix& a, const Matrix& b, + double diff_threshold = 1e-3) { + Matrix cp_b(b); + return difference(a, cp_b, diff_threshold); +} + +// Returns the real part of a matrix. +// In: a +// TODO test. +template +Matrix real(const Matrix, CPU, ALLOC>& a) { + Matrix a_re(a.size()); + for (int j = 0; j < a.nrCols(); ++j) + for (int i = 0; i < a.nrRows(); ++i) + a_re(i, j) = std::real(a(i, j)); + return a_re; +} + +// Insert a column at position j. The data is moved accordingly. +// In/Out: mat +// Preconditions: 0 <= j < mat.nrCols() + 1. +// Postconditions: The elements of the inserted column are set to 0. 
+template +void insertCol(Matrix& mat, int j) { + assert(j >= 0 && j < mat.nrCols() + 1); + + mat.resize(std::make_pair(mat.nrRows(), mat.nrCols() + 1)); + + if (mat.nrRows() > 0 && j < mat.nrCols() - 1) + memmove(mat.ptr(0, j + 1), mat.ptr(0, j), + sizeof(Scalar) * (mat.nrCols() - 1 - j) * mat.leadingDimension()); + + for (int i = 0; i < mat.nrRows(); ++i) + mat(i, j) = 0; +} + +// Insert a row at position i. The data is moved accordingly. +// In/Out: mat +// Preconditions: 0 <= i < mat.nrRows() + 1. +// Postconditions: The elements of the inserted row are set to 0. +template +void insertRow(Matrix& mat, int i) { + assert(i >= 0 && i < mat.nrRows() + 1); + + mat.resize(std::make_pair(mat.nrRows() + 1, mat.nrCols())); + + if (i < mat.nrRows() - 1) + for (int j = 0; j < mat.nrCols(); ++j) + memmove(mat.ptr(i + 1, j), mat.ptr(i, j), sizeof(Scalar) * (mat.nrRows() - 1 - i)); + + for (int j = 0; j < mat.nrCols(); ++j) + mat(i, j) = 0; +} + +// Computes the inverse of the matrix using the LU factorization. +// In/Out: mat +// Out: ipiv, work +// Preconditions: mat is a square matrix. +// Postconditions: ipiv and work are resized to the needed dimension. +// \todo consider doing inverse at full precision reguardless of incoming Scalar precision +template class ALLOC, + template class MatrixType> +void inverse(MatrixType>& mat, Vector& ipiv, + Vector& work) { + assert(mat.is_square()); + + ipiv.resizeNoCopy(mat.nrRows()); + + lapack::UseDevice::getrf(mat.nrRows(), mat.nrCols(), mat.ptr(), + mat.leadingDimension(), ipiv.ptr()); + // Get optimal worksize. 
+ int lwork = util::getInverseWorkSize(mat); + work.resizeNoCopy(lwork); + + lapack::UseDevice::getri(mat.nrRows(), mat.ptr(), mat.leadingDimension(), ipiv.ptr(), + work.ptr(), lwork); +} + + template class ALLOC, + template class MatrixType> +void inverse(MatrixType>& mat) { + Vector ipiv; + Vector work; + inverse(mat, ipiv, work); +} + +template +void smallInverse(Matrix& m_inv, Vector& ipiv, + Vector& work) { + assert(m_inv.is_square()); + switch (m_inv.nrCols()) { + case 1: + m_inv(0, 0) = Scalar(1.) / m_inv(0, 0); + break; + case 2: { + const Scalar det = m_inv(0, 0) * m_inv(1, 1) - m_inv(0, 1) * m_inv(1, 0); + + std::swap(m_inv(0, 0), m_inv(1, 1)); + m_inv(0, 0) /= det; + m_inv(1, 1) /= det; + // Thomas this looks like your bug fix was this it? + // std::swap(m_inv(1, 0), m_inv(0, 1)); + m_inv(1, 0) /= -det; + m_inv(0, 1) /= -det; + break; + } + case 3: { + const Matrix m(m_inv); + const Scalar det = m(0, 0) * (m(1, 1) * m(2, 2) - m(2, 1) * m(1, 2)) - + m(1, 0) * (m(0, 1) * m(2, 2) - m(0, 2) * m(2, 1)) + + m(2, 0) * (m(0, 1) * m(1, 2) - m(0, 2) * m(1, 1)); + m_inv(0, 0) = (m(1, 1) * m(2, 2) - m(2, 1) * m(1, 2)) / det; + m_inv(0, 1) = -(m(0, 1) * m(2, 2) - m(0, 2) * m(2, 1)) / det; + m_inv(0, 2) = (m(0, 1) * m(1, 2) - m(0, 2) * m(1, 1)) / det; + m_inv(1, 0) = -(m(1, 0) * m(2, 2) - m(1, 2) * m(2, 0)) / det; + m_inv(1, 1) = (m(0, 0) * m(2, 2) - m(0, 2) * m(2, 0)) / det; + m_inv(1, 2) = -(m(0, 0) * m(1, 2) - m(1, 0) * m(0, 2)) / det; + m_inv(2, 0) = (m(1, 0) * m(2, 1) - m(1, 1) * m(2, 0)) / det; + m_inv(2, 1) = -(m(0, 0) * m(2, 1) - m(0, 1) * m(2, 0)) / det; + m_inv(2, 2) = (m(0, 0) * m(1, 1) - m(0, 1) * m(1, 0)) / det; + break; + } + default: + inverse(m_inv, ipiv, work); + } +} + +template +void smallInverse(Matrix& m_inv) { + Vector ipiv; + Vector work; + smallInverse(m_inv, ipiv, work); +} + +// Computes in place the inverse of mat and the determinant of the inverse. 
+// In/Out: mat +// Returns: the determinant of mat^-1 +// Precondition: mat is a non-singular real matrix. +template class MatrixType> +Scalar inverseAndDeterminant(MatrixType& mat) { + assert(mat.is_square()); + std::vector ipiv(mat.nrRows()); + + lapack::UseDevice::getrf(mat.nrRows(), mat.nrCols(), mat.ptr(), mat.leadingDimension(), + ipiv.data()); + + Scalar det = 1; + for (int i = 0; i < mat.nrCols(); ++i) { + det *= mat(i, i); + if (ipiv[i] != i + 1) + det *= -1; + } + + const int lwork = util::getInverseWorkSize(mat); + std::vector work(lwork); + lapack::UseDevice::getri(mat.nrRows(), mat.ptr(), mat.leadingDimension(), ipiv.data(), + work.data(), lwork); + + return 1. / det; +} + +// Remove the j-th column. The data is moved accordingly. +// In/Out: mat +// Preconditions: 0 <= j < mat.nrCols(). +template +void removeCol(Matrix& mat, int j) { + assert(j >= 0 && j < mat.nrCols()); + + if (mat.nrRows() > 0 && j < mat.nrCols() - 1) + memmove(mat.ptr(0, j), mat.ptr(0, j + 1), + sizeof(Scalar) * (mat.nrCols() - j - 1) * mat.leadingDimension()); + + mat.resize(std::make_pair(mat.nrRows(), mat.nrCols() - 1)); +} + +#ifdef DCA_HAVE_GPU +template +void removeCol(Matrix& mat, int j) { + assert(j >= 0 && j < mat.nrCols()); + + if (mat.nrRows() > 0 && j < mat.nrCols() - 1) + blas::moveLeft(mat.nrRows(), mat.nrCols() - j, mat.ptr(0, j), mat.leadingDimension()); + + mat.resize(std::make_pair(mat.nrRows(), mat.nrCols() - 1)); + checkErrorsCudaDebug(); +} +#endif // DCA_HAVE_GPU + +// Remove columns in range [first, last]. The data is moved accordingly. +// In/Out: mat +// Preconditions: 0 <= first, last < mat.nrCols(). 
+template +void removeCols(Matrix& mat, int first, int last) { + const int n_removed = last - first + 1; + const int n = mat.nrRows(); + const int m = mat.nrCols(); + assert(last < m and last >= first and first >= 0); + + if (n > 0 and last < m - 1) + std::memmove(mat.ptr(0, first), mat.ptr(0, last + 1), + mat.leadingDimension() * (m - last - 1) * sizeof(Scalar)); + + mat.resize(std::make_pair(n, m - n_removed)); +} + +// Remove the i-th row. The data is moved accordingly. +// In/Out: mat +// Preconditions: 0 <= i < mat.nrRows(). +template +void removeRow(Matrix& mat, int i) { + assert(i >= 0 && i < mat.nrRows()); + + if (i < mat.nrRows() - 1) + for (int j = 0; j < mat.nrCols(); ++j) + memmove(mat.ptr(i, j), mat.ptr(i + 1, j), sizeof(Scalar) * (mat.nrRows() - i - 1)); + + mat.resize(std::make_pair(mat.nrRows() - 1, mat.nrCols())); +} + +#ifdef DCA_HAVE_GPU +template +void removeRow(Matrix& mat, int i) { + assert(i >= 0 && i < mat.nrRows()); + + if (mat.nrCols() > 0 && i < mat.nrRows() - 1) + blas::moveUp(mat.nrRows() - i, mat.nrCols(), mat.ptr(i, 0), mat.leadingDimension()); + + mat.resize(std::make_pair(mat.nrRows() - 1, mat.nrCols())); + checkErrorsCudaDebug(); +} +#endif // DCA_HAVE_GPU + +// Remove rows in range [first, last]. The data is moved accordingly. +// In/Out: mat +// Preconditions: 0 <= first, last < mat.nrRows(). +template +void removeRows(Matrix& mat, int first, int last) { + const int n_removed = last - first + 1; + const int n = mat.nrRows(); + const int m = mat.nrCols(); + assert(last < n and last >= first and first >= 0); + + if (last < n - 1) + for (int j = 0; j < m; ++j) + std::memmove(mat.ptr(first, j), mat.ptr(last + 1, j), (n - last - 1) * sizeof(Scalar)); + + mat.resize(std::make_pair(n - n_removed, m)); +} + +// Remove the i-th row and the j-th column. The data is moved accordingly. +// In/Out: mat +// Preconditions: 0 <= i < mat.nrRows(), 0 <= j < mat.nrCols(). 
+template +inline void removeRowAndCol(Matrix& mat, int i, int j) { + removeRow(mat, i); + removeCol(mat, j); +} + +// Remove the i-th row and the i-th column. The data is moved accordingly. +// In/Out: mat +// Preconditions: 0 <= i < mat.nrRows(), i < mat.nrCols(). +template +inline void removeRowAndCol(Matrix& mat, int i) { + removeRowAndCol(mat, i, i); +} + +// Remove rows and columns in range [first, last]. The data is moved accordingly. +// In/Out: mat +// Preconditions: 0 <= first, last < min(mat.nrRows(), mat.nrCols()). +template +void removeRowsAndCols(Matrix& mat, int first, int last) { + removeCols(mat, first, last); + removeRows(mat, first, last); +} + +// Scales the j-th column of mat by val. +// In/Out: mat +// Preconditions: 0 <= j < mat.nrCols(). +template +inline void scaleCol(Matrix& mat, int j, Scalar val, int thread_id = 0, + int stream_id = 0) { + assert(j >= 0 && j < mat.nrCols()); + blas::UseDevice::scal(mat.nrRows(), val, mat.ptr(0, j), 1, thread_id, stream_id); +} + +// Scales the i-th row of mat by val. +// In/Out: mat +// Preconditions: 0 <= i < mat.nrRow(). +template +inline void scaleRow(Matrix& mat, int i, Scalar val, int thread_id = 0, + int stream_id = 0) { + assert(i >= 0 && i < mat.nrRows()); + blas::UseDevice::scal(mat.nrCols(), val, mat.ptr(i, 0), mat.leadingDimension(), + thread_id, stream_id); +} + +// Scales the i[k]-th row of mat by val[k] for 0 <= k < i.size(). +// In/Out: mat +// Preconditions: i.size() == val.size(), 0 <= i[k] < mat.nrRow() for 0 <= k < i.size(). 
+template +inline void scaleRows(Matrix& mat, const Vector& i, + const Vector& val, int /*thread_id*/ = 0, int /*stream_id*/ = 0) { + assert(i.size() == val.size()); + + for (int j = 0; j < mat.nrCols(); ++j) + for (int ind = 0; ind < i.size(); ++ind) + mat(i[ind], j) *= val[ind]; +} +#ifdef DCA_HAVE_GPU +template +inline void scaleRows(Matrix& mat, const Vector& i, + const Vector& val, int thread_id = 0, int stream_id = 0) { + assert(i.size() == val.size()); + + blas::scaleRows(mat.nrCols(), i.size(), i.ptr(), val.ptr(), mat.ptr(), mat.leadingDimension(), + thread_id, stream_id); + checkErrorsCudaDebug(); +} +#endif // DCA_HAVE_GPU + +// Swaps the j1-th column with the j2-th column of mat. +// In/Out: mat +// Preconditions: 0 <= j1 < mat.nrCols(), 0 <= j2 < mat_y.nrCols(). +template +inline void swapCol(Matrix& mat, int j1, int j2, int thread_id = 0, + int stream_id = 0) { + assert(j1 >= 0 && j1 < mat.nrCols()); + assert(j2 >= 0 && j2 < mat.nrCols()); + blas::UseDevice::swap(mat.nrRows(), mat.ptr(0, j1), 1, mat.ptr(0, j2), 1, thread_id, + stream_id); +} + +// Swaps the j_1[i]-th column with the j_2[i]-th column of mat, for 0 <= i < j_1.size(). +// In/Out: mat +// Preconditions: j_1.size() <= j_2.size() +// 0 <= j_1[i] < mat.nrCols() for 0 <= i < j_1.size(), +// 0 <= j_2[i] < mat.nrCols() for 0 <= i < j_1.size(). +// j_1[i] != j_1[j] for i != j, j_2[i] != j_2[j] for i != j, +// j_1[i] != j_2[j] for all i, j. +#ifdef DCA_HAVE_GPU +template +inline void swapCols(Matrix& mat, const Vector& j_1, + const Vector& j_2, int thread_id = 0, int stream_id = 0) { + assert(j_1.size() <= j_2.size()); + blas::swapCols(mat.nrRows(), j_1.size(), j_1.ptr(), j_2.ptr(), mat.ptr(), mat.leadingDimension(), + thread_id, stream_id); + checkErrorsCudaDebug(); +} +#endif // DCA_HAVE_GPU + +// Swaps the i1-th row with the i2-th row of mat. +// In/Out: mat +// Preconditions: 0 <= i1 < mat.nrRows(), 0 <= i2 < mat_y.nrRows(). 
+template +inline void swapRow(Matrix& mat, int i1, int i2, int thread_id = 0, + int stream_id = 0) { + assert(i1 >= 0 && i1 < mat.nrRows()); + assert(i2 >= 0 && i2 < mat.nrRows()); + blas::UseDevice::swap(mat.nrCols(), mat.ptr(i1, 0), mat.leadingDimension(), + mat.ptr(i2, 0), mat.leadingDimension(), thread_id, stream_id); +} + +// Swaps the i_1[i]-th row with the i_2[i]-th row of mat, for 0 <= i < i_1.size(). +// In/Out: mat +// Preconditions: i_2.size() == i_2.size() +// 0 <= i_1[i] < mat.nrRows() for 0 <= i < i_1.size(), +// 0 <= i_2[i] < mat.nrRows() for 0 <= i < i_1.size(). +// i_1[i] != i_1[j] for i != j, i_2[i] != i_2[j] for i != j, +// i_1[i] != i_2[j] for all i, j. +#ifdef DCA_HAVE_GPU +template +inline void swapRows(Matrix& mat, const Vector& i_1, + const Vector& i_2, int thread_id = 0, int stream_id = 0) { + assert(i_1.size() == i_2.size()); + blas::swapRows(mat.nrCols(), i_1.size(), i_1.ptr(), i_2.ptr(), mat.ptr(), mat.leadingDimension(), + thread_id, stream_id); + checkErrorsCudaDebug(); +} +#endif // DCA_HAVE_GPU + +// Swaps the i1-th row with the i2-th row and the i1-th column with the i2-th column of mat. +// In/Out: mat +// Preconditions: 0 <= i1 < mat.nrRows(), i1 < mat.nrCols(), +// 0 <= i2 < mat.nrRows(), i2 < mat.nrCols(). +template +inline void swapRowAndCol(Matrix& mat, int i1, int i2, int thread_id = 0, + int stream_id = 0) { + swapRow(mat, i1, i2, thread_id, stream_id); + swapCol(mat, i1, i2, thread_id, stream_id); +} + +// Performs the matrix-vector multiplication y <- alpha * op(a) * x + beta * y, +// where op(X) = X if transX == 'N', op(X) = transposed(X) if transX == 'T', and +// op(X) == conjugate_transposed(X) if transX == 'C' (X = a). +// In/Out: y ('In' only if beta != 0) +// Preconditions: transa should be one of the following: 'N', 'T' or 'C', +// a.nrRows() == y.size() if transa == 'N', a.nrCols() == y.size() otherwise, +// a.nrCols() == x.size() if transa == 'N', a.nrRows() == x.size() otherwise. 
+template +void gemv(char transa, Scalar alpha, const Matrix& a, + const Vector& x, Scalar beta, Vector& y) { + if (transa == 'N') { + assert(a.nrRows() == y.size()); + assert(a.nrCols() == x.size()); + } + else { + assert(a.nrRows() == x.size()); + assert(a.nrCols() == y.size()); + } + + int lda = a.leadingDimension(); + + blas::gemv(&transa, a.nrRows(), a.nrCols(), alpha, a.ptr(), lda, x.ptr(), 1, beta, y.ptr(), 1); +} + +// Performs the matrix-vector multiplication y <- op(a) * x, +// where op(X) = X if transX == 'N', op(X) = transposed(X) if transX == 'T', and +// op(X) == conjugate_transposed(X) if transX == 'C' (X = a). +// Out: y +// Preconditions: transa should be one of the following: 'N', 'T' or 'C', +// a.nrRows() == y.size() if transa == 'N', a.nrCols() == y.size() otherwise, +// a.nrCols() == x.size() if transa == 'N', a.nrRows() == x.size() otherwise. +template +void gemv(char transa, const Matrix& a, const Vector& x, + Vector& y) { + gemv(transa, 1., a, x, 0., y); +} + +// Performs the matrix-matrix multiplication c <- alpha * op(a) * op(b) + beta * c, +// where op(X) = X if transX == 'N', op(X) = transposed(X) if transX == 'T', and +// op(X) == conjugate_transposed(X) if transX == 'C' (X = a, b). +// In/Out: c ('In' only if beta != 0) +// Preconditions: transa and transb should be one of the following: 'N', 'T' or 'C', +// a.nrRows() == c.nrRows() if transa == 'N', a.nrCols() == c.nrRows() otherwise, +// b.nrCols() == c.nrCols() if transb == 'N', b.nrRows() == c.nrCols() otherwise, +// ka == kb, where ka = a.nrCols() if transa == 'N', ka = a.nrRows() otherwise and +// kb = b.nrRows() if transb == 'N', kb = b.nrCols() otherwise. 
+template class MatrixA, + template class MatrixB, + template class MatrixC, class ALLOC1, class ALLOC2, class ALLOC3> +void gemm(char transa, char transb, Scalar alpha, const MatrixA& a, + const MatrixB& b, Scalar beta, + MatrixC& c, int thread_id = 0, int stream_id = 0) { + int m = c.nrRows(); + int n = c.nrCols(); + int k; + + if (transa == 'N') { + assert(a.nrRows() == m); + k = a.nrCols(); + } + else { + assert(a.nrCols() == m); + k = a.nrRows(); + } + + if (transb == 'N') { + assert(b.nrRows() == k); + assert(b.nrCols() == n); + } + else { + assert(b.nrCols() == k); + assert(b.nrRows() == n); + } + + int lda = a.leadingDimension(); + int ldb = b.leadingDimension(); + int ldc = c.leadingDimension(); + + blas::UseDevice::gemm(&transa, &transb, m, n, k, alpha, a.ptr(), lda, b.ptr(), ldb, + beta, c.ptr(), ldc, thread_id, stream_id); +} + +// Performs the matrix-matrix multiplication c <- a * b +// Out: c +// Preconditions: a.nrRows() == c.nrRows(), b.nrCols() == c.nrCols() and a.nrCols() == b.nrRows() +template class MatrixA, + template class MatrixB, + template class MatrixC> +inline void gemm(const MatrixA& a, + const MatrixB& b, + MatrixC& c, int thread_id = 0, int stream_id = 0) { + gemm('N', 'N', 1., a, b, 0., c, thread_id, stream_id); +} + +// Performs the matrix-matrix multiplication c <- alpha * a * b + beta * c, +// In/Out: c ('In' only if beta != 0) +// Preconditions: a.nrRows() == c.nrRows(), b.nrCols() == c.nrCols() and a.nrCols() == b.nrRows() +template class MatrixA, + template class MatrixB, + template class MatrixC> +inline void gemm(Scalar alpha, const MatrixA& a, + const MatrixB& b, Scalar beta, + MatrixC& c, int thread_id = 0, int stream_id = 0) { + gemm('N', 'N', alpha, a, b, beta, c, thread_id, stream_id); +} + +// Performs the matrix-matrix multiplication c <- op(a) * op(b), +// where op(X) = X if transX == 'N', op(X) = transposed(X) if transX == 'T', and +// op(X) == conjugate_transposed(X) if transX == 'C' (X = a, b). 
+// Out: c +// Preconditions: transa and transb should be one of the following: 'N', 'T' or 'C', +// a.nrRows() == c.nrRows() if transa == 'N', a.nrCols() == c.nrRows() otherwise, +// b.nrCols() == c.nrCols() if transb == 'N', b.nrRows() == c.nrCols() otherwise, +// ka == kb, where ka = a.nrCols() if transa == 'N', ka = a.nrRows() otherwise and +// kb = b.nrRows() if transb == 'N', kb = b.nrCols() otherwise. +template +inline void gemm(char transa, char transb, const Matrix& a, + const Matrix& b, Matrix& c, + int thread_id = 0, int stream_id = 0) { + gemm(transa, transb, 1., a, b, 0., c, thread_id, stream_id); +} + +// Performs the triangular solve b <- a^-1 * b, +// where a is a lower triangular matrix (uplo = 'L') or an upper triangular matrix (uplo = 'U'), +// with unit diagonal (diag = "U") or with general diagonal (diag = "N") +// In/Out: b +// Preconditions: a.nrRows() == a.nrCols() , a.nrCols() == b.nrRows() +template +void trsm(char uplo, char diag, const Matrix& a, + Matrix& b, int thread_id = 0, int stream_id = 0) { + assert(uplo == 'U' or uplo == 'L'); + assert(diag == 'U' or diag == 'N'); + assert(a.nrRows() == a.nrCols()); + assert(b.nrRows() == a.nrCols()); + + blas::UseDevice::trsm("L", &uplo, "N", &diag, b.nrRows(), b.nrCols(), Scalar(1), + a.ptr(), a.leadingDimension(), b.ptr(), b.leadingDimension(), + thread_id, stream_id); +} + +// Mixed real and complex matrix-matrix multiply. +// TODO: Not sure if this are needed. +// The only file in which it may be used is basis_transformation_cd_to_ed.h. + +// Performs the matrix-matrix multiplication c <- op(a) * op(b), +// where op(X) = X if transX == 'N', op(X) = transposed(X) if transX == 'T', and +// op(X) == conjugate_transposed(X) if transX == 'C' (X = a, b). 
+// Out: c +// Preconditions: transa should be one of the following: 'N', 'T', +// transb should be one of 'N', 'T', 'C', +// a.nrRows() == c.nrRows() if transa == 'N', a.nrCols() == c.nrRows() otherwise, +// b.nrCols() == c.nrCols() if transb == 'N', b.nrRows() == c.nrCols() otherwise, +// ka == kb, where ka = a.nrCols() if transa == 'N', ka = a.nrRows() otherwise and +// kb = b.nrRows() if transb == 'N', kb = b.nrCols() otherwise. +template +void gemm(char transa, char transb, Matrix& a, + Matrix, CPU, ALLOC>& b, Matrix, CPU, ALLOC>& c) { + Matrix b_part(b.size()); + Matrix c_re(c.size()); + Matrix c_im(c.size()); + + Scalar sign = 1; + if (transb == 'C') { + sign = -1; + transb = 'T'; + } + + for (int j = 0; j < b.nrCols(); ++j) + for (int i = 0; i < b.nrRows(); ++i) + b_part(i, j) = b(i, j).real(); + + gemm(transa, transb, a, b_part, c_re); + + for (int j = 0; j < b.nrCols(); ++j) + for (int i = 0; i < b.nrRows(); ++i) + b_part(i, j) = b(i, j).imag(); + + gemm(transa, transb, sign, a, b_part, Scalar(0), c_im); + + for (int j = 0; j < c.nrCols(); ++j) + for (int i = 0; i < c.nrRows(); ++i) + c(i, j) = std::complex(c_re(i, j), c_im(i, j)); +} + +// Performs the matrix-matrix multiplication c <- op(a) * op(b), +// where op(X) = X if transX == 'N', op(X) = transposed(X) if transX == 'T', and +// op(X) == conjugate_transposed(X) if transX == 'C' (X = a, b). +// Out: c +// Preconditions: transa should be one of the following: 'N', 'T', 'C', +// transb should be one of 'N', 'T', +// a.nrRows() == c.nrRows() if transa == 'N', a.nrCols() == c.nrRows() otherwise, +// b.nrCols() == c.nrCols() if transb == 'N', b.nrRows() == c.nrCols() otherwise, +// ka == kb, where ka = a.nrCols() if transa == 'N', ka = a.nrRows() otherwise and +// kb = b.nrRows() if transb == 'N', kb = b.nrCols() otherwise. 
+template +static void gemm(char transa, char transb, Matrix, CPU, ALLOC>& a, + Matrix& b, Matrix, CPU, ALLOC>& c) { + Matrix a_part(a.size()); + Matrix c_re(c.size()); + Matrix c_im(c.size()); + + Scalar sign = 1; + if (transa == 'C') { + sign = -1; + transa = 'T'; + } + + for (int j = 0; j < a.nrCols(); ++j) + for (int i = 0; i < a.nrRows(); ++i) + a_part(i, j) = a(i, j).real(); + + gemm(transa, transb, a_part, b, c_re); + + for (int j = 0; j < a.nrCols(); ++j) + for (int i = 0; i < a.nrRows(); ++i) + a_part(i, j) = a(i, j).imag(); + + gemm(transa, transb, sign, a_part, b, Scalar(0), c_im); + + for (int j = 0; j < c.nrCols(); ++j) + for (int i = 0; i < c.nrRows(); ++i) + c(i, j) = std::complex(c_re(i, j), c_im(i, j)); +} + +// Performs the matrix-matrix multiplication c = op(a) * op(b), where each matrix is split in real +// and imaginary part. This is implemented with 3 real matrix-matrix multiplications. +// Out: c +// Preconditions: transa and transb should be one of the following: 'N', 'T', 'C'. +// a[0].size == a[1].size() +// b[0].size == b[1].size() +// c[0].size == c[1].size() +// a[0].nrRows() == c[0].nrRows() if transa == 'N', a[0].nrCols() == c[0].nrRows() +// otherwise, +// b[0].nrCols() == c[0].nrCols() if transb == 'N', b[0].nrRows() == c[0].nrCols() +// otherwise, +// ka == kb, where ka = a[0].nrCols() if transa == 'N', ka = a[0].nrRows() otherwise +// and kb = b[0].nrRows() if transb == 'N', kb = b[0].nrCols() otherwise. 
+template +void multiply(char transa, char transb, const std::array, 2>& a, + const std::array, 2>& b, + std::array, 2>& c, + std::array, 5>& work) { + assert(a[0].size() == a[1].size()); + assert(b[0].size() == b[1].size()); + assert(c[0].size() == c[1].size()); + + work[0].resizeNoCopy(c[0].size()); + work[1].resizeNoCopy(c[0].size()); + work[2].resizeNoCopy(c[0].size()); + auto& a_sum = work[3]; + auto& b_sum = work[4]; + a_sum.resizeNoCopy(a[0].size()); + b_sum.resizeNoCopy(b[0].size()); + + const Scalar signa = transa == 'C' ? transa = 'T', -1 : 1; + const Scalar signb = transb == 'C' ? transb = 'T', -1 : 1; + + for (int j = 0; j < a[0].nrCols(); ++j) + for (int i = 0; i < a[0].nrRows(); ++i) + a_sum(i, j) = a[0](i, j) + signa * a[1](i, j); + for (int j = 0; j < b[0].nrCols(); ++j) + for (int i = 0; i < b[0].nrRows(); ++i) + b_sum(i, j) = b[0](i, j) + signb * b[1](i, j); + + gemm(transa, transb, a[0], b[0], work[0]); + gemm(transa, transb, signa * signb, a[1], b[1], Scalar(0), work[1]); + gemm(transa, transb, a_sum, b_sum, work[2]); + + for (int j = 0; j < c[0].nrCols(); ++j) + for (int i = 0; i < c[0].nrRows(); ++i) { + c[0](i, j) = work[0](i, j) - work[1](i, j); + c[1](i, j) = work[2](i, j) - work[0](i, j) - work[1](i, j); + } +} + +template +void multiply(const std::array, 2>& a, + const std::array, 2>& b, + std::array, 2>& c, + std::array, 5>& work) { + multiply('N', 'N', a, b, c, work); +} + +// Performs the matrix-matrix multiplication c = op(a) * op(b), where a and c are split in real +// and imaginary part, while b is real. 
+// Out: c +// Preconditions: transa and transb should be one of the following: 'N', 'T', +// a[0].size == a[1].size() +// c[0].size == c[1].size() +// a[0].nrRows() == c[0].nrRows() if transa == 'N', a.nrCols() == c.nrRows() +// otherwise, +// b.nrCols() == c[0].nrCols() if transb == 'N', b.nrRows() == c.nrCols() +// otherwise, +// ka == kb, where ka = a[0].nrCols() if transa == 'N', ka = a[0].nrRows() otherwise +// and kb = b.nrRows() if transb == 'N', kb = b.nrCols() otherwise. +template +void multiply(char transa, char transb, const std::array, 2>& a, + const Matrix& b, std::array, 2>& c) { + assert(transa == 'N' || transa == 'T' || transa == 'C'); + assert(transb == 'N' || transb == 'T'); + assert(a[0].size() == a[1].size()); + assert(c[0].size() == c[1].size()); + + gemm(transa, transb, a[0], b, c[0]); + const Scalar sign = transa == 'C' ? transa = 'T', -1 : 1; + gemm(transa, transb, sign, a[1], b, Scalar(0), c[1]); +} + +template +void multiply(const std::array, 2>& a, + const Matrix& b, std::array, 2>& c) { + multiply('N', 'N', a, b, c); +} + +// Performs the matrix-matrix multiplication c = op(a) * op(b), where b and c are split in real +// and imaginary part, while a is real. +// Out: c +// Preconditions: transa and transb should be one of the following: 'N', 'T', +// b[0].size == b[1].size() +// c[0].size == c[1].size() +// a.nrRows() == c[0].nrRows() if transa == 'N', a.nrCols() == c.nrRows() +// otherwise, +// b.[0]nrCols() == c[0].nrCols() if transb == 'N', b.[0]nrRows() == c.nrCols() +// otherwise, +// ka == kb, where ka = a.nrCols() if transa == 'N', ka = a.nrRows() otherwise +// and kb = b.[0]nrRows() if transb == 'N', kb = b.[0]nrCols() otherwise. 
+template +void multiply(char transa, char transb, const Matrix& a, + const std::array, 2>& b, + std::array, 2>& c) { + assert(transa == 'N' || transa == 'T'); + assert(transb == 'N' || transb == 'T' || transb == 'C'); + assert(b[0].size() == b[1].size()); + assert(c[0].size() == c[1].size()); + + gemm(transa, transb, a, b[0], c[0]); + const Scalar sign = transb == 'C' ? transb = 'T', -1 : 1; + gemm(transa, transb, sign, a, b[1], Scalar(0), c[1]); +} + +template +void multiply(const Matrix& a, + const std::array, 2>& b, + std::array, 2>& c) { + multiply('N', 'N', a, b, c); +} + +// Performs the matrix-matrix multiplication b <- D * a, +// where d is a vector containing the diagonal elements of the matrix D. +// Out: b +// Preconditions: a.size() == b.size(), d.size() == a.nrRows(). +template +inline void multiplyDiagonalLeft(const Vector& d, + const Matrix& a, + Matrix& b, int thread_id = 0, + int stream_id = 0) { + lapack::UseDevice::multiplyDiagonalLeft(a.nrRows(), a.nrCols(), d.ptr(), 1, a.ptr(), + a.leadingDimension(), b.ptr(), + b.leadingDimension(), thread_id, stream_id); +} +template +inline void multiplyDiagonalLeft(const Vector& d, const Matrix& a, + Matrix& b, int thread_id = 0, int stream_id = 0) { + Vector d_gpu(d); + multiplyDiagonalLeft(d_gpu, a, b, thread_id, stream_id); +} + +// Performs the matrix-matrix multiplication b <- a * D, +// where d is a vector containing the diagonal elements of the matrix D. +// Out: b +// Preconditions: a.size() == b.size(), d.size() == a.nrCols(). 
+// NOTE(review): throughout this hunk every template parameter list and every
+// template-argument bracket pair appears to have been stripped by an extraction
+// pass (bare "template", "Matrix&" with no element/device arguments, dangling
+// ">" in "Matrix, CPU, ALLOC>&"). The declarations must be restored from the
+// DCA source this was ported from before this header can compile; the comments
+// below document only the logic that is visible here.
+
+// Computes b = a * diag(d), i.e. scales the j-th column of a by d[j],
+// delegating to the lapack::UseDevice backend.
+// Out: b (presumably pre-sized to a's dimensions by the caller -- TODO confirm).
+template
+inline void multiplyDiagonalRight(const Matrix& a,
+                                  const Vector& d,
+                                  Matrix& b, int thread_id = 0,
+                                  int stream_id = 0) {
+  lapack::UseDevice::multiplyDiagonalRight(a.nrRows(), a.nrCols(), a.ptr(),
+                                           a.leadingDimension(), d.ptr(), 1, b.ptr(),
+                                           b.leadingDimension(), thread_id, stream_id);
+}
+// Overload that first stages the diagonal vector into d_gpu (presumably a
+// host-to-device copy -- the stripped template arguments hide the device tags)
+// and then forwards to the overload above.
+template
+inline void multiplyDiagonalRight(const Matrix& a, const Vector& d,
+                                  Matrix& b, int thread_id = 0, int stream_id = 0) {
+  Vector d_gpu(d);
+  multiplyDiagonalRight(a, d_gpu, b, thread_id, stream_id);
+}
+
+// Computes the eigenvalues, the left eigenvectors (if jobvl == 'V')
+// and the right eigenvectors (if jobvr == 'V') of the real matrix a.
+// The real parts of the eigenvalues are stored in lambda_re, while the imaginary parts in
+// lambda_im.
+// If computed the left eigenvectors are stored in vl and the right eigenvectors in vr.
+// See sgeev, dgeev Lapack documentation for information about how the
+// eigenvectors are stored.
+// Out: lambda_re, lambda_im, vl, vr.
+// Precondition: jobvl == 'N' or jobvl == 'V',
+//               jobvr == 'N' or jobvr == 'V',
+//               a is a square matrix.
+// Postcondition: lambda_re, lambda_im, are resized, vl if jobvl == 'V', vr if jobvr == 'V' are
+//                resized.
+template
+void eigensolver(char jobvl, char jobvr, const Matrix& a,
+                 Vector& lambda_re, Vector& lambda_im,
+                 Matrix& vl, Matrix& vr) {
+  assert(a.is_square());
+
+  // a is copied because geev overwrites its input matrix.
+  Matrix a_copy(a);
+  lambda_re.resizeNoCopy(a_copy.nrRows());
+  lambda_im.resizeNoCopy(a_copy.nrRows());
+  // LAPACK requires ldvl/ldvr >= 1 even when the corresponding vectors are not computed.
+  int ldvl = 1;
+  int ldvr = 1;
+  if (jobvl == 'V' || jobvl == 'v') {
+    vl.resizeNoCopy(a_copy.size());
+    ldvl = vl.leadingDimension();
+  }
+  if (jobvr == 'V' || jobvr == 'v') {
+    vr.resizeNoCopy(a_copy.size());
+    ldvr = vr.leadingDimension();
+  }
+
+  // Get optimal worksize.
+  int lwork = util::getEigensolverWorkSize(jobvl, jobvr, a_copy);
+  dca::linalg::Vector work(lwork);
+
+  lapack::geev(&jobvl, &jobvr, a_copy.nrRows(), a_copy.ptr(), a_copy.leadingDimension(),
+               lambda_re.ptr(), lambda_im.ptr(), vl.ptr(), ldvl, vr.ptr(), ldvr, work.ptr(),
+               work.size());
+}
+
+// Computes the eigenvalues, the left eigenvectors (if jobvl == 'V')
+// and the right eigenvectors (if jobvr == 'V') of the complex matrix a.
+// The eigenvalues are stored in lambda.
+// If computed the left eigenvectors are stored in vl and the right eigenvectors in vr.
+// Out: lambda, vl, vr.
+// Precondition: jobvl == 'N' or jobvl == 'V',
+//               jobvr == 'N' or jobvr == 'V',
+//               a is a square matrix.
+// Postcondition: lambda, is resized, vl if jobvl == 'V', vr if jobvr == 'V' are resized.
+template
+void eigensolver(char jobvl, char jobvr, const Matrix, CPU, ALLOC>& a,
+                 Vector, CPU>& lambda,
+                 Matrix, CPU, ALLOC>& vl,
+                 Matrix, CPU, ALLOC>& vr) {
+  assert(a.is_square());
+
+  // a is copied because geev overwrites its input matrix.
+  Matrix, CPU> a_copy(a);
+  lambda.resizeNoCopy(a_copy.nrRows());
+  int ldvl = 1;
+  int ldvr = 1;
+  if (jobvl == 'V' || jobvl == 'v') {
+    vl.resizeNoCopy(a_copy.size());
+    ldvl = vl.leadingDimension();
+  }
+  if (jobvr == 'V' || jobvr == 'v') {
+    vr.resizeNoCopy(a_copy.size());
+    ldvr = vr.leadingDimension();
+  }
+
+  // Get optimal worksize.
+  int lwork = util::getEigensolverWorkSize(jobvl, jobvr, a_copy);
+  dca::linalg::Vector, CPU> work(lwork);
+  // Real workspace of size 2*n, as required by cgeev/zgeev.
+  dca::linalg::Vector rwork(2 * a_copy.nrRows());
+
+  lapack::geev(&jobvl, &jobvr, a_copy.nrRows(), a_copy.ptr(), a_copy.leadingDimension(),
+               lambda.ptr(), vl.ptr(), ldvl, vr.ptr(), ldvr, work.ptr(), work.size(), rwork.ptr());
+}
+
+// Computes the eigenvalues, and the eigenvectors (if jobv == 'V') of the real symmetric matrix a.
+// if uplo == 'U' the upper triangular part of a is referenced, whereas
+// if uplo == 'L' the lower triangular part of a is referenced.
+// The eigenvalues are stored in lambda.
+// If computed the eigenvectors are stored in v.
+// Out: lambda, v
+// Precondition: jobv == 'N' or jobv == 'V',
+//               uplo == 'U' or uplo == 'L',
+//               a is a square matrix.
+// Postcondition: lambda, and v are resized.
+template
+void eigensolverSymmetric(char jobv, char uplo, const Matrix& a,
+                          Vector& lambda, Matrix& v) {
+  assert(a.is_square());
+
+  lambda.resizeNoCopy(a.nrRows());
+  // syevd works in place: v receives a copy of a and is overwritten with the eigenvectors.
+  v = a;
+
+  // Get optimal worksize (a tuple: <0> = scalar workspace size, <1> = integer workspace size).
+  auto lwork = util::getEigensolverSymmetricWorkSize(jobv, uplo, v);
+  dca::linalg::Vector work(std::get<0>(lwork));
+  dca::linalg::Vector iwork(std::get<1>(lwork));
+
+  lapack::syevd(&jobv, &uplo, v.nrRows(), v.ptr(), v.leadingDimension(), lambda.ptr(), work.ptr(),
+                work.size(), iwork.ptr(), iwork.size());
+}
+// For real types Hermitian and symmetric is the same.
+template
+inline void eigensolverHermitian(char jobv, char uplo, const Matrix& a,
+                                 Vector& lambda, Matrix& v) {
+  eigensolverSymmetric(jobv, uplo, a, lambda, v);
+}
+
+// Computes the eigenvalues, and the eigenvectors (if jobv == 'V')
+// of the complex Hermitian matrix a.
+// if uplo == 'U' the upper triangular part of a is referenced, whereas
+// if uplo == 'L' the lower triangular part of a is referenced.
+// The eigenvalues are stored in lambda.
+// If computed the eigenvectors are stored in v.
+// Out: lambda, v
+// Precondition: jobv == 'N' or jobv == 'V',
+//               uplo == 'U' or uplo == 'L',
+//               a is a square matrix.
+// Postcondition: lambda, and v are resized.
+template
+void eigensolverHermitian(char jobv, char uplo, const Matrix, CPU, ALLOC>& a,
+                          Vector& lambda, Matrix, CPU, ALLOC>& v) {
+  assert(a.is_square());
+
+  lambda.resizeNoCopy(a.nrRows());
+  // heevd works in place: v receives a copy of a and is overwritten with the eigenvectors.
+  v = a;
+
+  // Get optimal worksize (complex, real and integer workspace sizes for heevd).
+  auto [wsize, rsize, isize] = util::getEigensolverHermitianWorkSize(jobv, uplo, v);
+  dca::linalg::Vector, CPU> work(wsize);
+  dca::linalg::Vector rwork(rsize);
+  dca::linalg::Vector iwork(isize);
+
+  lapack::heevd(&jobv, &uplo, v.nrRows(), v.ptr(), v.leadingDimension(), lambda.ptr(), work.ptr(),
+                work.size(), rwork.ptr(), rwork.size(), iwork.ptr(), iwork.size());
+}
+
+// Diagonalizes a Hermitian matrix of even order n (presumably a Green's-function
+// matrix with a 2x2 spin structure -- TODO confirm against callers).
+// For n == 2 the matrix is taken to be diagonal in spin space: the (numerically
+// real) diagonal entries become the eigenvalues and v is set to the identity,
+// skipping the LAPACK call. Otherwise falls back to eigensolverHermitian.
+// Out: lambda, v
+// Precondition: a is square, a.nrRows() is even.
+template
+void eigensolverGreensFunctionMatrix(char jobv, char uplo,
+                                     const Matrix, CPU, ALLOC>& a,
+                                     Vector& lambda,
+                                     Matrix, CPU, ALLOC>& v) {
+  assert(a.is_square());
+  int n = a.nrRows();
+  assert(n % 2 == 0);
+
+  if (n == 2) { // must be diagonal in spin space.
+    lambda.resize(2);
+    v.resize(2);
+
+    // Debug-only check that the imaginary parts of the diagonal are negligible.
+    assert(std::abs(std::imag(a(0, 0))) < 1.e-6);
+    assert(std::abs(std::imag(a(1, 1))) < 1.e-6);
+
+    lambda[0] = std::real(a(0, 0));
+    lambda[1] = std::real(a(1, 1));
+    v(0, 0) = 1.;
+    v(1, 0) = 0.;
+    v(0, 1) = 0.;
+    v(1, 1) = 1.;
+    return;
+  }
+  eigensolverHermitian(jobv, uplo, a, lambda, v);
+}
+
+// Computes the pseudo inverse of the matrix.
+// Eigenvalues of a*a' (resp. a'*a) not exceeding eps times the largest
+// eigenvalue are treated as zero (lambda[m - 1] / lambda[n - 1] is the largest:
+// LAPACK *evd returns eigenvalues in ascending order).
+// Out: a_inv
+// Postconditions: a_inv is resized to the needed dimension.
+template
+void pseudoInverse(const Matrix& a, Matrix& a_inv,
+                   double eps = 1.e-6) {
+  int m = a.nrRows();
+  int n = a.nrCols();
+  // The pseudo inverse of an m x n matrix is n x m.
+  a_inv.resizeNoCopy(std::make_pair(n, m));
+
+  using RealType = decltype(std::real(*a.ptr()));
+
+  if (m <= n) {
+    // a_inv = a'*inv(a*a')
+    // inv(a*a') = v*inv(lambda)*v', [lambda, v] = eig(a*a')
+
+    Matrix a_at("A_At", m);
+    dca::linalg::matrixop::gemm('N', 'C', a, a, a_at);
+
+    dca::linalg::Vector lambda("Lambda", m);
+    Matrix v("V", m);
+
+    eigensolverHermitian('V', 'U', a_at, lambda, v);
+    // vt keeps the unscaled eigenvectors; v's columns are scaled by 1/lambda below,
+    // so gemm(v, vt') yields v * diag(lambda_inv) * v'.
+    Matrix vt(v);
+
+    for (int j = 0; j < m; j++) {
+      Scalar lambda_inv = 0;
+
+      if (lambda[j] > eps * lambda[m - 1])
+        lambda_inv = 1. / lambda[j];
+
+      scaleCol(v, j, lambda_inv);
+    }
+
+    gemm('N', 'C', v, vt, a_at);
+    gemm('C', 'N', a, a_at, a_inv);
+  }
+  else {
+    // a_inv = inv(a'*a)*a'
+    // inv(a'*a) = v*inv(lambda)*v', [lambda, v] = eig(a'*a)
+
+    Matrix at_a("at_a", n);
+    dca::linalg::matrixop::gemm('C', 'N', a, a, at_a);
+
+    dca::linalg::Vector lambda("Lambda", n);
+    Matrix v("V", n);
+
+    eigensolverHermitian('V', 'U', at_a, lambda, v);
+    // As above: vt holds the unscaled eigenvectors.
+    Matrix vt(v);
+
+    for (int j = 0; j < n; j++) {
+      Scalar lambda_inv = 0;
+
+      if (lambda[j] > eps * lambda[n - 1])
+        lambda_inv = 1. / lambda[j];
+
+      scaleCol(v, j, lambda_inv);
+    }
+
+    gemm('N', 'C', v, vt, at_a);
+    gemm('N', 'C', at_a, a, a_inv);
+  }
+}
+
+// Computes (in place) the determinant of the matrix.
+// Returns: determinant.
+// Postcondition: M is its LU decomposition.
+template