From c306bae149b9c8aafbe4f565d2095edd5a25d9de Mon Sep 17 00:00:00 2001 From: "Peter Doak (epd)" Date: Fri, 13 Sep 2024 17:10:07 -0400 Subject: [PATCH 01/11] adding some data classes ported from DCA, fixing up the includes --- CMake/mrpapp_cuda.cmake | 19 +- CMake/mrpapp_hip.cmake | 13 +- CMakeLists.txt | 17 +- gaps3D.h | 4 +- linalg/linalg.hpp | 63 ++ linalg/matrix.hpp | 619 +++++++++++++++ linalg/matrixop.hpp | 1367 ++++++++++++++++++++++++++++++++++ linalg/reshapable_matrix.hpp | 453 +++++++++++ linalg/vector.hpp | 435 +++++++++++ pairing.h | 296 ++++---- platform/dca_gpu.h | 37 + utilities.h | 456 +++++------- 12 files changed, 3333 insertions(+), 446 deletions(-) create mode 100644 linalg/linalg.hpp create mode 100644 linalg/matrix.hpp create mode 100644 linalg/matrixop.hpp create mode 100644 linalg/reshapable_matrix.hpp create mode 100644 linalg/vector.hpp create mode 100644 platform/dca_gpu.h diff --git a/CMake/mrpapp_cuda.cmake b/CMake/mrpapp_cuda.cmake index 4c4c4ffe6..bc19a0ddc 100644 --- a/CMake/mrpapp_cuda.cmake +++ b/CMake/mrpapp_cuda.cmake @@ -1,4 +1,4 @@ -# // Copyright (C) 2023 UT-Battelle, LLC +# // Copyright (C) 2024 UT-Battelle, LLC # // All rights reserved. # // # // See LICENSE for terms of usage. @@ -6,22 +6,33 @@ # Checks for CUDA and accordingly sets MRPAPP_HAVE_CUDA # In addition, set MRPAPP_GPU_LIBS. +message("checking CUDA environment") set(CMAKE_CUDA_ARCHITECTURES "70" CACHE STRING "Name of the real architecture to build for.") set(MRPAPP_HAVE_CUDA FALSE CACHE INTERNAL "") set(MRPAPP_GPU_LIBS "" CACHE INTERNAL "") -# Find CUDA. +include(mrpapp_defines) include(CheckLanguage) +if(NOT CMAKE_CUDA_FLAGS MATCHES "allow-unsupported-compiler") + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --allow-unsupported-compiler") +endif() + +set(CMAKE_CUDA_HOST_COMPILER + ${CMAKE_CXX_COMPILER} + CACHE STRING "nvcc host compiler passed via -ccbin") + +# Find CUDA. 
find_package(CUDAToolkit REQUIRED) check_language(CUDA) if (CMAKE_CUDA_COMPILER) + message("Found CUDA compiler!") enable_language(CUDA) set(MRPAPP_HAVE_CUDA TRUE CACHE INTERNAL "") set(MRPAPP_HAVE_GPU TRUE CACHE INTERNAL "") - dca_add_haves_define(MRPAPP_HAVE_CUDA) - dca_add_haves_define(MRPAPP_HAVE_GPU) + mrpapp_add_define(MRPAPP_HAVE_CUDA) + mrpapp_add_define(MRPAPP_HAVE_GPU) list(APPEND MRPAPP_GPU_LIBS CUDA::cudart CUDA::cublas) set(MRPAPP_CUDA_PROPERTIES "CMAKE_CUDA_ARCHITECTURES 70") diff --git a/CMake/mrpapp_hip.cmake b/CMake/mrpapp_hip.cmake index 1d9c73d20..9b88faff7 100644 --- a/CMake/mrpapp_hip.cmake +++ b/CMake/mrpapp_hip.cmake @@ -1,6 +1,12 @@ ################################################################################ # Author: Peter Doak, doakpw@ornl.gov, Oak Ridge National Lab # +# // Copyright (C) 2024 UT-Battelle, LLC +# // All rights reserved. +# // +# // See LICENSE for terms of usage. +# // +# # Checks for HIP and and accordingly sets MRPAPP_HAVE_HIP set(ROCM_ROOT @@ -53,6 +59,7 @@ set(MRPAPP_HAVE_HIP FALSE CACHE INTERNAL "") set(MRPAPP_HAVE_MAGMA FALSE CACHE INTERNAL "") set(MRPAPP_GPU_LIBS "" CACHE INTERNAL "") +include(mrpapp_defines) include(CheckLanguage) check_language(HIP) if (CMAKE_HIP_COMPILER) @@ -61,9 +68,9 @@ if (CMAKE_HIP_COMPILER) set(MRPAPP_HAVE_HIP TRUE CACHE INTERNAL "") set(MRPAPP_HAVE_GPU TRUE CACHE INTERNAL "") # Probably probably these should be public properties of the hip targets - dca_add_haves_define(MRPAPP_HAVE_HIP) - dca_add_haves_define(MRPAPP_HAVE_GPU) - dca_add_haves_define(__HIP_PLATFORM_AMD__) + mrpapp_add_define(MRPAPP_HAVE_HIP) + mrpapp_add_define(MRPAPP_HAVE_GPU) + mrpapp_add_define(__HIP_PLATFORM_AMD__) list(APPEND MRPAPP_GPU_LIBS hip::host roc::hipblas) set(MRPAPP_HIP_PROPERTIES "CMAKE_HIP_ARCHITECTURES gfx906,gfx908") set(CMAKE_HIP_STANDARD 17) diff --git a/CMakeLists.txt b/CMakeLists.txt index aa2c27c0c..2511a9b2e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,3 +1,9 @@ +# // Copyright (C) 2024 
UT-Battelle, LLC +# // All rights reserved. +# // +# // See LICENSE for terms of usage. +# // + ###################################################################### # CMake version and policies ###################################################################### @@ -35,7 +41,7 @@ endif(USE_MPI) option(ENABLE_CUDA "Build with GPU support through CUDA" OFF) option(ENABLE_HIP "Build with GPU support through HIP" OFF) set(ENABLE_GPU "$,$>,ON,OFF>") - +message ("enable gpu: ${ENABLE_GPU}") set(MRPAPP_GPU_LIBS "" CACHE INTERNAL "") if(ENABLE_GPU) @@ -45,10 +51,9 @@ if(ENABLE_GPU) if(ENABLE_HIP) include(mrapp_hip) endif(ENABLE_HIP) - if(MRPAPP_HAVE_CUDA OR MRPAPP_HAVE_HIP) include(DetermineDeviceArchitectures) - message(STATUS "GPU device architectures: ${QMC_GPU_ARCHS}") + message(STATUS "GPU device architectures: ${MRPAPP_GPU_ARCHS}") endif() endif(ENABLE_GPU) @@ -61,6 +66,9 @@ set(MRPAPP_SRC set(MRPAPP_MODEL "1BAND" CACHE STRING "RPA model options") set_property(CACHE MRPAPP_MODEL PROPERTY STRINGS SRRUO SRRUO3D SRRUO3DSUH 1BAND 1BANDWSPIN BILAYER_FESC BILAYER_1BAND ORTHOIIBILAYER BSCCOBILAYER BILAYER_FESC BAFEAS KFE2SE2 FOURORBITAL TBFILE COUPLEDLADDERS NDNIO2 MODELFROMFILESO KAGOME 1BANDABWSPIN 1BANDALTERMAGNET 1BANDAB) +include(mrpapp_defines) +mrpapp_write_definitions_file() + add_executable(mrpapp ${MRPAPP_SRC}) target_include_directories(mrpapp PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}" ${PROJECT_SOURCE_DIR}/PartialPsimag @@ -74,4 +82,5 @@ set(mpi_targs MPI::MPI_C MPI::MPI_CXX) set(MPI_TARGETS "$<$:${mpi_targs}>") target_link_libraries(mrpapp ${MRPAPP_GPU_LIBS} LAPACK::LAPACK BLAS::BLAS ${MPI_TARGETS} ) -add_custom_target(genexdebug COMMAND ${CMAKE_COMMAND} -E echo "$<$:USE_MPI>") +add_custom_target(genexdebug COMMAND ${CMAKE_COMMAND} -E echo "$,$>,ON,OFF>") +#$<$:USE_MPI>") diff --git a/gaps3D.h b/gaps3D.h index e59d72183..642913fe0 100644 --- a/gaps3D.h +++ b/gaps3D.h @@ -98,9 +98,9 @@ class gap3D { // simple s+- for 5-orbital 1111 model 2D // description 
param.parity = 1.0; // even parity (d-wave) gap if (band == 0) { // bonding band - return param.Delta0 * (abs(cos(k[0])) - cos(k[1])); + return param.Delta0 * (std::abs(cos(k[0])) - cos(k[1])); } else { // antibonding band (shift kx by pi) - return param.Delta0 * (-abs(cos(k[0])) - cos(k[1])); + return param.Delta0 * (-std::abs(cos(k[0])) - cos(k[1])); } } else if (param.gAmpl == "SrRuO_helical" || param.gAmpl == "SrRuO_chiral" || diff --git a/linalg/linalg.hpp b/linalg/linalg.hpp new file mode 100644 index 000000000..0652e9cdd --- /dev/null +++ b/linalg/linalg.hpp @@ -0,0 +1,63 @@ +// based on linalg.hpp from DCA++ +// distributed under BSD-3-clause license +// Copyright (C) 2018 ETH Zurich +// Copyright (C) 2024 UT-Battelle, LLC +// All rights reserved. +// +// +// See LICENSE for terms of usage. +// See CITATION.md for citation guidelines, if MRPAPP is used for scientific publications. +// +// Author: Raffaele Solca' (rasolca@itp.phys.ethz.ch) +// +// This file includes all the header files in include/dca/linalg. +// TODO: This file is temporary and will be removed or updated. 
+ +#include "dca/linalg/vector.hpp" +#include "dca/linalg/matrix.hpp" +#include "dca/linalg/matrixop.hpp" + +#ifdef DCA_HAVE_GPU +#include "dca/platform/dca_gpu.h" +#include "dca/platform/dca_gpu_blas.h" +#include "dca/linalg/util/handle_functions.hpp" +#include "dca/linalg/util/info_gpu.hpp" +#include "dca/linalg/util/stream_container.hpp" +#include "dca/linalg/util/stream_functions.hpp" +#include "dca/linalg/util/util_gpublas.hpp" +#endif // DCA_HAVE_GPU + +// BLAS +#include "dca/linalg/blas/blas1.hpp" +#include "dca/linalg/blas/blas2.hpp" +#include "dca/linalg/blas/blas3.hpp" + +#include "dca/linalg/lapack/bennet_update.hpp" +#include "dca/linalg/lapack/inverse.hpp" +#include "dca/linalg/lapack/lapack.hpp" +#include "dca/linalg/lapack/silence_lapack.hpp" +#include "dca/linalg/lapack/solve.hpp" + +#ifdef DCA_HAVE_GPU +// CUBLAS +#include "dca/linalg/blas/cublas1.hpp" +#include "dca/linalg/blas/cublas3.hpp" +#include "dca/linalg/blas/cublas_conversion_char_types.hpp" +#include "dca/linalg/blas/kernels_gpu.hpp" + +#include "dca/linalg/lapack/laset_gpu.hpp" +#include "dca/linalg/lapack/magma.hpp" +#include "dca/linalg/lapack/multiply_diagonal_gpu.hpp" +#endif // DCA_HAVE_GPU + +// Device selector struct +#include "dca/linalg/device_type.hpp" +#include "dca/linalg/blas/use_device.hpp" +#include "dca/linalg/lapack/use_device.hpp" + +// Utils +#include "dca/linalg/util/allocators/allocators.hpp" +#include "dca/linalg/util/copy.hpp" +#include "dca/linalg/util/lapack_exception.hpp" +#include "dca/linalg/util/util_lapack.hpp" +#include "dca/linalg/util/util_matrixop.hpp" diff --git a/linalg/matrix.hpp b/linalg/matrix.hpp new file mode 100644 index 000000000..e55181138 --- /dev/null +++ b/linalg/matrix.hpp @@ -0,0 +1,619 @@ +// Copyright (C) 2023 ETH Zurich +// Copyright (C) 2023 UT-Battelle, LLC +// All rights reserved. +// +// See LICENSE for terms of usage. +// See CITATION.md for citation guidelines, if DCA++ is used for scientific publications. 
+// +// Author: Peter Staar (taa@zurich.ibm.com) +// Raffaele Solca' (rasolca@itp.phys.ethz.ch) +// Peter W. Doak (doakpw@ornl.gov +// +/// \file provides the Matrix object for different device types and allocators. + +#ifndef DCA_LINALG_MATRIX_HPP +#define DCA_LINALG_MATRIX_HPP + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "dca/linalg/vector.hpp" +#include "dca/linalg/util/allocators/allocators.hpp" +#include "dca/linalg/device_type.hpp" +#include "dca/linalg/util/copy.hpp" +#include "dca/linalg/util/memory.hpp" +#include "dca/linalg/util/stream_functions.hpp" +#include "dca/util/type_help.hpp" + +namespace dca { +namespace linalg { +// dca::linalg:: + +/** Matrix class for interfacing with Blas, Cublas, Rocblas + * its row major i.e, row is fast. + */ +template > +class Matrix : public ALLOC { +public: + using ThisType = Matrix; + using ValueType = ScalarType; + using Allocator = ALLOC; + constexpr static DeviceType device = device_name; + + Matrix(const std::string& name = default_name_); + + Matrix(int size); + Matrix(const std::string& name, int size); + + // Preconditions: capacity >= size. + Matrix(int size, int capacity); + Matrix(const std::string& name, int size, int capacity); + + Matrix(std::pair size); + Matrix(const std::string& name, std::pair size); + + // Preconditions: capacity.first >= size.first, capacity.second >= size.second. + Matrix(std::pair size, std::pair capacity); + Matrix(const std::string& name, std::pair size, std::pair capacity); + + // Copy and move constructor: + // Constructs a matrix with name name, size rhs.size() and a copy of the elements of rhs. + Matrix(const Matrix& rhs, const std::string& name = default_name_); + // Constructs a matrix with name name, size rhs.size(). The elements of rhs are moved. + // Postcondition: rhs is a (0 x 0) matrix. 
+ Matrix(Matrix&& rhs, const std::string& = default_name_); + + // Contructs a matrix with name name, size rhs.size() and a copy of the elements of rhs, where rhs + // elements are stored on a different device. + template + Matrix(const Matrix& rhs, const std::string& = default_name_); + + // Contructs a matrix with name name, size rhs.size() and a copy of the elements of rhs, where rhs + // elements are stored on a different device. + template + Matrix(const Matrix& rhs, const std::string& = default_name_); + + ~Matrix(); + + // Assignment operators: + // Resizes the matrix to rhs.size() and copy the elements of rhs. + // Postcondition: The name of the matrix is unchanged. + Matrix& operator=(const Matrix& rhs); + // Resizes the matrix to rhs.size() and move the elements of rhs. + // Postcondition: The name of the matrix is unchanged; rhs is a (0 x 0) matrix. + Matrix& operator=(Matrix&& rhs); + + // Resizes the matrix to rhs.size() and copy the elements, stored on a different device, of rhs. + // Postcondition: The name of the matrix is unchanged. + template + Matrix& operator=(const Matrix& rhs); + + template + Matrix& operator=(const Matrix& rhs); + + // Returns true if this is equal to other, false otherwise. + // Two matrices are equal, if they have the same size and contain the same elements. Name and + // capacity are ignored. + // Special case: two matrices without elements are equal. + bool operator==(const Matrix& other) const; + + // Returns true if this is not equal to other, false otherwise. + // See description of operator== for the definition of equality. + bool operator!=(const Matrix& other) const; + + // Returns the (i,j)-th element of the matrix. + // Preconditions: 0 <= i < size().first, 0 <= j < size().second. + // This method is available only if device_name == CPU. 
+ template > + ScalarType& operator()(int i, int j) { + assert(i >= 0 && i < size_.first); + assert(j >= 0 && j < size_.second); + return data_[i + j * leadingDimension()]; + } + template > + const ScalarType& operator()(int i, int j) const { + assert(i >= 0 && i < size_.first); + assert(j >= 0 && j < size_.second); + return data_[i + j * leadingDimension()]; + } + + const std::string& get_name() const { + return name_; + } + void set_name(const std::string& new_name) { + name_ = new_name; + } + + // Returns the pointer to the (0,0)-th element. + ValueType* ptr() { + return data_; + } + const ValueType* ptr() const { + return data_; + } + + // Returns the pointer to the (i,j)-th element i < size().first and 0 < j < size().second, or + // a pointer past the end of the range if i == size().first or j == size().second. + // Preconditions: 0 <= i <= size().first, 0 <= j <= size().second. + ValueType* ptr(int i, int j) { + assert(i >= 0 && i <= size_.first); + assert(j >= 0 && j <= size_.second); + return data_ + i + j * leadingDimension(); + } + const ValueType* ptr(int i, int j) const { + assert(i >= 0 && i <= size_.first); + assert(j >= 0 && j <= size_.second); + return data_ + i + j * leadingDimension(); + } + + bool is_square() const { + return (size_.first == size_.second); + } + + const std::pair size() const { + return size_; + } + const std::pair& capacity() const { + return capacity_; + } + int nrRows() const { + return size_.first; + } + int nrCols() const { + return size_.second; + } + + int getActualSize() { + return nrElements(capacity_); + } + + int leadingDimension() const { + return capacity_.first; + } + + // Resizes *this to a (new_size * new_size) matrix. + // Elements added may have any value. + // Remark: The capacity of the matrix and element pointers do not change + // if new_size <= capacity().first and new_size <= capacity().second. 
+ void resize(int new_size) { + resize(std::make_pair(new_size, new_size)); + } + // Resizes *this to a (new_size.first * new_size.second) matrix. + // Elements added may have any value. + // Remark: The capacity of the matrix and element pointers do not change + // if new_size.first <= capacity().first and new_size.second <= capacity().second. + void resize(std::pair new_size); + + // Resizes *this to a (new_size * new_size) matrix. + // The previous elements are not copied, therefore all the elements + // may have any value after the call to this method. + // Remark: The capacity of the matrix and element pointers do not change + // if new_size <= capacity().first and new_size <= capacity().second. + void resizeNoCopy(int new_size) { + resizeNoCopy(std::make_pair(new_size, new_size)); + } + // Resizes *this to a (new_size.first * new_size.second) matrix. + // The previous elements are not copied, therefore all the elements + // may have any value after the call to this method. + // Remark: The capacity of the matrix and element pointers do not change + // if new_size.first <= capacity().first and new_size.second <= capacity().second. + void resizeNoCopy(std::pair new_size); + + // Releases the memory allocated by *this and sets size and capacity to zero. + void clear(); + + // Swaps the contents of the matrix except the name with those of rhs. + void swap(Matrix& rhs); + // Swaps the contents of the matrix, included the name, with those of rhs. + void swapWithName(Matrix& rhs); + + // Asynchronous assignment (copy with stream = getStream(thread_id, stream_id)) + // + synchronization of stream + template + void set(const Matrix& rhs, int thread_id, int stream_id); + + template + void set(const Matrix& rhs, const util::GpuStream& stream); + + // Asynchronous assignment. 
+ template + void setAsync(const Matrix& rhs, const util::GpuStream& stream); + + // Asynchronous assignment (copy with stream = getStream(thread_id, stream_id)) + template + void setAsync(const Matrix& rhs, int thread_id, int stream_id); + + void setToZero(const util::GpuStream& stream); + + // Prints the values of the matrix elements. + void print() const; + // Prints the properties of *this. + void printFingerprint() const; + // Returns the allocated device memory in bytes. + std::size_t deviceFingerprint() const; + + std::string toStr() const; +private: + static std::pair capacityMultipleOfBlockSize(std::pair size); + inline static size_t nrElements(std::pair size) { + return static_cast(size.first) * static_cast(size.second); + } + static constexpr int block_size_ = 32; + static const std::string default_name_; + + std::string name_; + + std::pair size_; + std::pair capacity_; + + ValueType* data_ = nullptr; + + template + friend class dca::linalg::Matrix; +}; + +template +const std::string Matrix::default_name_ = "no-name"; + +template +Matrix::Matrix(const std::string& name) : Matrix(name, 0) {} + +template +Matrix::Matrix(int size) : Matrix(std::make_pair(size, size)) {} + +template +Matrix::Matrix(const std::string& name, int size) + : Matrix(name, std::make_pair(size, size)) {} + +template +Matrix::Matrix(int size, int capacity) + : Matrix(std::make_pair(size, size), std::make_pair(capacity, capacity)) {} + +template +Matrix::Matrix(const std::string& name, int size, int capacity) + : Matrix(name, std::make_pair(size, size), std::make_pair(capacity, capacity)) {} + +template +Matrix::Matrix(std::pair size) : Matrix(size, size) {} + +template +Matrix::Matrix(const std::string& name, std::pair size) + : Matrix(name, size, size) {} + +template +Matrix::Matrix(std::pair size, std::pair capacity) + : Matrix(default_name_, size, capacity) {} + +template +template +Matrix::Matrix(const Matrix& rhs, + const std::string& name) + : name_(name), size_(rhs.size_), 
capacity_(rhs.capacity_) { + data_ = Allocator::allocate(nrElements(capacity_)); + util::memoryCopy(data_, leadingDimension(), rhs.data_, rhs.leadingDimension(), size_); +} + +template +template +Matrix::Matrix(const Matrix& rhs, + const std::string& name) + : name_(name), size_(rhs.size_), capacity_(rhs.capacity_) { + if (sizeof(ScalarType) != sizeof(ScalarRhs)) + throw std::runtime_error("conversion of both type and location of Matrix not currently possible!"); + data_ = ALLOC::allocate(nrElements(capacity_)); + util::memoryCopy(data_, leadingDimension(), rhs.data_, rhs.leadingDimension(), size_); +} + +template +Matrix::Matrix(const std::string& name, std::pair size, + std::pair capacity) + : name_(name), size_(size), capacity_(capacityMultipleOfBlockSize(capacity)) { + assert(size_.first >= 0 && size_.second >= 0); + assert(capacity.first >= 0 && capacity.second >= 0); + assert(capacity.first >= size_.first && capacity.second >= size_.second); + assert(capacity_.first >= capacity.first && capacity_.second >= capacity.second); + + data_ = ALLOC::allocate(nrElements(capacity_)); + util::Memory::setToZero(data_, nrElements(capacity_)); +} + +template +Matrix::Matrix(const Matrix& rhs, + const std::string& name) + : name_(name) { + *this = rhs; +} + +template +Matrix::Matrix(Matrix&& rhs, const std::string& name) + : name_(name), size_(rhs.size_), capacity_(rhs.capacity_), data_(rhs.data_) { + rhs.capacity_ = std::make_pair(0, 0); + rhs.size_ = std::make_pair(0, 0); + rhs.data_ = nullptr; +} + +template +Matrix::~Matrix() { + Allocator::deallocate(data_); +} + +template +void Matrix::resize(std::pair new_size) { + if (new_size.first == 0 || new_size.second ==0) { + size_ = new_size; + return; + } else if (new_size.first > capacity_.first || new_size.second > capacity_.second) { + std::pair new_capacity = capacityMultipleOfBlockSize(new_size); + + ValueType* new_data = nullptr; + new_data = Allocator::allocate(nrElements(new_capacity)); + // hip memorycpy2D routines 
don't tolerate leadingDimension = 0 + const std::pair copy_size(std::min(new_size.first, size_.first), + std::min(new_size.second, size_.second)); + util::memoryCopy(new_data, new_capacity.first, data_, leadingDimension(), copy_size); + Allocator::deallocate(data_); + data_ = new_data; + capacity_ = new_capacity; + size_ = new_size; + } + else { + size_ = new_size; + } +} + +template +Matrix& Matrix::operator=( + const Matrix& rhs) { + resizeNoCopy(rhs.size_); + if (device_name == CPU) + util::memoryCopyCpu(data_, leadingDimension(), rhs.data_, rhs.leadingDimension(), size_); + else + util::memoryCopy(data_, leadingDimension(), rhs.data_, rhs.leadingDimension(), size_); + return *this; +} + +template +Matrix& Matrix::operator=( + Matrix&& rhs) { + swap(rhs); + return *this; +} + +template +template +Matrix& Matrix::operator=( + const Matrix& rhs) { + resizeNoCopy(rhs.size_); + util::memoryCopy(data_, leadingDimension(), rhs.data_, rhs.leadingDimension(), size_); + return *this; +} + +#ifdef DCA_HAVE_GPU +template +template +Matrix& Matrix::operator=( + const Matrix& rhs) { + static_assert(sizeof(ScalarType) == sizeof(ScalarRhs), + "sizeof ScalarType and ScalarRhs are not equal"); + resizeNoCopy(rhs.size_); + util::memoryCopy(data_, leadingDimension(), rhs.data_, rhs.leadingDimension(), size_); + return *this; +} + +#endif + +template +bool Matrix::operator==(const Matrix& other) const { + if (device_name == GPU) + return Matrix(*this) == Matrix(other); + + if (size() != other.size()) + return nrRows() * nrCols() == 0 and other.nrRows() * other.nrCols() == 0; + + for (int j = 0; j < nrCols(); ++j) + for (int i = 0; i < nrRows(); ++i) + if ((*this)(i, j) != other(i, j)) + return false; + + return true; +} + +template +bool Matrix::operator!=(const Matrix& other) const { + return not(*this == other); +} + +template +void Matrix::resizeNoCopy(std::pair new_size) { + if (new_size.first > capacity_.first || new_size.second > capacity_.second) { + size_ = new_size; + 
capacity_ = capacityMultipleOfBlockSize(new_size); + + Allocator::deallocate(data_); + data_ = Allocator::allocate(nrElements(capacity_)); + } + else { + size_ = new_size; + } +} + +template +void Matrix::clear() { + Allocator::deallocate(data_); + size_ = capacity_ = std::make_pair(0, 0); +} + +template +void Matrix::swap(Matrix& rhs) { + std::swap(size_, rhs.size_); + std::swap(capacity_, rhs.capacity_); + std::swap(data_, rhs.data_); +} + +template +void Matrix::swapWithName(Matrix& rhs) { + std::swap(name_, rhs.name_); + swap(rhs); +} + +template +template +void Matrix::set(const Matrix& rhs, + int thread_id, int stream_id) { + resize(rhs.size_); + // This specialization is required since without unified memory CUDA doesn't known which memory locality the pointer has. + if constexpr (device_name == DeviceType::GPU && rhs_device_name == DeviceType::CPU) + util::memoryCopyH2D(data_, leadingDimension(), rhs.data_, rhs.leadingDimension(), size_, + thread_id, stream_id); + else if constexpr (device_name == DeviceType::CPU && rhs_device_name == DeviceType::GPU) + util::memoryCopyD2H(data_, leadingDimension(), rhs.data_, rhs.leadingDimension(), size_, + thread_id, stream_id); + else if constexpr (device_name == DeviceType::CPU && rhs_device_name == DeviceType::CPU) + util::memoryCopyCpu(data_, leadingDimension(), rhs.data_, rhs.leadingDimension(), size_, + thread_id, stream_id); +} + +template +template +void Matrix::set(const Matrix& rhs, + const util::GpuStream& stream [[maybe_unused]]) { + resize(rhs.size_); + if constexpr (device_name == DeviceType::GPU && rhs_device_name == DeviceType::CPU) + util::memoryCopyH2D(data_, leadingDimension(), rhs.data_, rhs.leadingDimension(), size_); + else if constexpr (device_name == DeviceType::CPU && rhs_device_name == DeviceType::GPU) + util::memoryCopyD2H(data_, leadingDimension(), rhs.data_, rhs.leadingDimension(), size_); +} + +template +template +void Matrix::setAsync(const Matrix& rhs, + const util::GpuStream& stream) { + 
resizeNoCopy(rhs.size_); + util::memoryCopyAsync(data_, leadingDimension(), rhs.data_, rhs.leadingDimension(), size_, stream); +} + +template +template +void Matrix::setAsync(const Matrix& rhs, + const int thread_id, const int stream_id) { + setAsync(rhs, util::getStream(thread_id, stream_id)); +} + +template +void Matrix::setToZero(const util::GpuStream& stream) { + util::Memory::setToZeroAsync(data_, leadingDimension() * nrCols(), stream); +} + +template +void Matrix::print() const { + if (device_name == GPU) + return Matrix(*this).print(); + + printFingerprint(); + + std::stringstream ss; + ss.precision(6); + ss << std::scientific; + + ss << "\n"; + for (int i = 0; i < nrRows(); ++i) { + for (int j = 0; j < nrCols(); ++j) + ss << "\t" << operator()(i, j); + ss << "\n"; + } + + std::cout << ss.str() << std::endl; +} + +template +std::string Matrix::toStr() const { + if (device_name == GPU) + return Matrix(*this).toStr(); + + std::stringstream ss; + ss.precision(16); + ss << std::scientific; + + ss << "\n"; + for (int i = 0; i < nrRows(); ++i) { + for (int j = 0; j < nrCols(); ++j) + ss << "\t" << operator()(i, j); + ss << "\n"; + } + + return ss.str(); +} + +template +void Matrix::printFingerprint() const { + std::stringstream ss; + + ss << "\n"; + ss << " name: " << name_ << "\n"; + ss << " size: " << size_.first << ", " << size_.second << "\n"; + ss << " capacity: " << capacity_.first << ", " << capacity_.second << "\n"; + ss << " memory-size: " << nrElements(capacity_) * sizeof(ScalarType) * 1.e-6 << "(Mbytes)\n"; + + std::cout << ss.str() << std::endl; +} + +template +std::pair Matrix::capacityMultipleOfBlockSize( + std::pair size) { + assert(size.first >= 0); + assert(size.second >= 0); + + auto get_new_size = [=](const int size) { + return size <= 16 ? 
size : (size + block_size_ - 1) / block_size_ * block_size_; + }; + + size.first = get_new_size(size.first); + size.second = get_new_size(size.second); + + return size; +} + +template +std::size_t Matrix::deviceFingerprint() const { + if (device_name == GPU) + return capacity_.first * capacity_.second * sizeof(ScalarType); + else + return 0; +} + +/// Factory function for diangonal matrices, type is inferred from the type of Vector. +template +auto makeDiagonalMatrix(Vector& diag) { + int dsize = diag.size(); + Matrix matrix("diag_matrix", dsize); + for (int i = 0; i < dsize; ++i) { + matrix(i, i) = diag[i]; + } + return matrix; +} + +/// Factory function for diangonal matrices, type is inferred from the type of Vector. +template +auto makeDiagonalMatrixInv(Vector& diag) { + int dsize = diag.size(); + Matrix matrix("diag_matrix", dsize); + // insure that if ScalarType is complex the 1 is as well. + // then std::complex will give us a proper complex multiplicative inverse + ScalarType the_one{}; + the_one += 1.0; + for (int i = 0; i < dsize; ++i) { + matrix(i, i) = the_one / diag[i]; + } + return matrix; +} + +} // namespace linalg +} // namespace dca + +#endif // DCA_LINALG_MATRIX_HPP diff --git a/linalg/matrixop.hpp b/linalg/matrixop.hpp new file mode 100644 index 000000000..87c650d25 --- /dev/null +++ b/linalg/matrixop.hpp @@ -0,0 +1,1367 @@ +// Copyright (C) 2023 ETH Zurich +// Copyright (C) 2023 UT-Battelle, LLC +// All rights reserved. +// +// See LICENSE for terms of usage. +// See CITATION.md for citation guidelines, if DCA++ is used for scientific publications. +// +// Author: Peter Staar (taa@zurich.ibm.com) +// Raffaele Solca' (rasolca@itp.phys.ethz.ch) +// Giovanni Balduzzi (gbalduzz@itp.phys.ethz.ch) +// Peter W. 
Doak (doakpw@ornl.gov) +// +/** \file provides the matrix interface for the following matrix operations: + * - copyCol, copyRow, copyCols, copyRows + * - difference + * - real + * - insertCol, insertRow (for CPU matrices only) + * - inverse + * - inverseAndDeterminant + * - removeCol, removeCols, removeRow, removeRows, removeRowAndCol, removeRowAndCols + * - scaleCol, scaleRow, scaleRows + * - swapCol, swapRow, swapRowAndCol + * - swapCols, swapRows (for GPU matrices only) + * - gemm + * - multiply + * - trsm + * - determinant + * - logDeterminant + * - eigensolver (non-symmetric / symmetric / Hermitian) + * - pseudoInverse + * + * CPU Matrix has choice of allocator, although currently all must match + * GPU or mixed CPU GPU must use default allocator. + */ + +#ifndef DCA_LINALG_MATRIXOP_HPP +#define DCA_LINALG_MATRIXOP_HPP + +#include +#include +#include + +#include "dca/linalg/blas/use_device.hpp" +#include "dca/linalg/lapack/use_device.hpp" +#include "dca/linalg/matrix.hpp" +#include "dca/linalg/util/util_lapack.hpp" +#include "dca/linalg/util/util_matrixop.hpp" +#include "dca/linalg/vector.hpp" +#include "dca/math/util/phase.hpp" + +#ifdef DCA_HAVE_GPU +#include "dca/linalg/blas/kernels_gpu.hpp" +#endif + +namespace dca { +namespace linalg { +namespace matrixop { +// dca::linalg::matrixop:: + +// Copies the matrix mat in a. +// Preconditions: lda >= mat.nrRows(). +template +inline void copyMatrixToArray(const Matrix& mat, Scalar* a, int lda) { + assert(lda >= mat.nrRows()); + lapack::lacpy("A", mat.nrRows(), mat.nrCols(), mat.ptr(), mat.leadingDimension(), a, lda); +} + +// Copies the m by n matrix stored in a to the matrix mat. +// Preconditions: lda >= m. 
+template +inline void copyArrayToMatrix(int m, int n, const Scalar* a, int lda, + Matrix& mat) { + assert(lda >= m); + mat.resizeNoCopy(std::make_pair(m, n)); + lapack::lacpy("A", mat.nrRows(), mat.nrCols(), a, lda, mat.ptr(), mat.leadingDimension()); +} + +// Copies the jx-th column of mat_x into the jy-th column of mat_y. +// In/Out: mat_y +// Preconditions: mat_x.nrRows() == mat_y.nrRows(), +// 0 <= jx < mat_x.nrCols(), 0 <= jy < mat_y.nrCols(). +template +inline void copyCol(const Matrix& mat_x, int jx, + Matrix& mat_y, int jy, int thread_id = 0, int stream_id = 0) { + assert(jx >= 0 && jx < mat_x.nrCols()); + assert(jy >= 0 && jy < mat_y.nrCols()); + assert(mat_x.nrRows() == mat_y.nrRows()); + + blas::UseDevice::copy(mat_x.nrRows(), mat_x.ptr(0, jx), 1, mat_y.ptr(0, jy), 1, + thread_id, stream_id); +} + +// Copies the j_x[i]-th column of mat_x into the j_y[i]-th column of mat_y, for 0 <= i < j_x.size(). +// In/Out: mat_y +// Preconditions: j_x.size() <= j_y.size(), mat_x.nrRows() == mat_y.nrRows() +// 0 <= j_x[i] < mat_x.nrCols() for 0 <= i < j_x.size(), +// 0 <= j_y[i] < mat_y.nrCols() for 0 <= i < j_x.size(). +template +inline void copyCols(const Matrix& mat_x, const Vec& j_x, + Matrix& mat_y, const Vec& j_y, int /*thread_id*/ = 0, + int /*stream_id*/ = 0) { + assert(j_x.size() <= j_y.size()); + + for (int ind_j = 0; ind_j < j_x.size(); ++ind_j) + copyCol(mat_x, j_x[ind_j], mat_y, j_y[ind_j]); +} +#ifdef DCA_HAVE_GPU +template +inline void copyCols(const Matrix& mat_x, const Vector& j_x, + Matrix& mat_y, const Vector& j_y, int thread_id = 0, + int stream_id = 0) { + assert(j_x.size() <= j_y.size()); + assert(mat_x.nrRows() == mat_y.nrRows()); + + blas::copyCols(mat_x.nrRows(), j_x.size(), j_x.ptr(), mat_x.ptr(), mat_x.leadingDimension(), + j_y.ptr(), mat_y.ptr(), mat_y.leadingDimension(), thread_id, stream_id); + checkErrorsCudaDebug(); +} + +// Copies the j_x columns of mat_x into the mat_y, for 0 <= i < j_x.size(). 
+// In/Out: mat_y +// Preconditions: mat_x.nrRows() == mat_y.nrRows() +// 0 <= j_x[i] < mat_x.nrCols() for 0 <= i < j_x.size(), +template +inline void copyCols(const Matrix& mat_x, const Vector& j_x, + Matrix& mat_y, int thread_id = 0, int stream_id = 0) { + assert(mat_x.nrRows() == mat_y.nrRows()); + + blas::copyCols(mat_x.nrRows(), j_x.size(), j_x.ptr(), mat_x.ptr(), mat_x.leadingDimension(), + mat_y.ptr(), mat_y.leadingDimension(), thread_id, stream_id); + checkErrorsCudaDebug(); +} +#endif // DCA_HAVE_GPU + +// Copies the ix-th row of mat_x into the iy-th row of mat_y. +// In/Out: mat_y +// Preconditions: mat_x.nrCols() == mat_y.nrCols(), +// 0 <= ix < mat_x.nrRows(), 0 <= iy < mat_y.nrRows(). +template +inline void copyRow(const Matrix& mat_x, int ix, + Matrix& mat_y, int iy, int thread_id = 0, int stream_id = 0) { + assert(ix >= 0 && ix < mat_x.nrRows()); + assert(iy >= 0 && iy < mat_y.nrRows()); + assert(mat_x.nrCols() == mat_y.nrCols()); + + blas::UseDevice::copy(mat_x.nrCols(), mat_x.ptr(ix, 0), mat_x.leadingDimension(), + mat_y.ptr(iy, 0), mat_y.leadingDimension(), thread_id, + stream_id); +} + +// Copies the i_x[i]-th row of mat_x into the i_y[i]-th row of mat_y, for 0 <= i < i_x.size(). +// In/Out: mat_y +// Preconditions: i_x.size() <= i_y.size(), mat_x.nrCols() == mat_y.nrCols() +// 0 <= i_x[i] < mat_x.nrRows() for 0 <= i < i_x.size(), +// 0 <= i_y[i] < mat_y.nrRows() for 0 <= i < i_x.size(). 
+template +inline void copyRows(const Matrix& mat_x, const Vec& i_x, + Matrix& mat_y, const Vec& i_y, const int /*thread_id*/ = 0, + const int /*stream_id*/ = 0) { + assert(i_x.size() <= i_y.size()); + assert(mat_x.nrCols() == mat_y.nrCols()); + + for (int j = 0; j < mat_x.nrCols(); ++j) + for (int ind_i = 0; ind_i < i_x.size(); ++ind_i) + mat_y(i_y[ind_i], j) = mat_x(i_x[ind_i], j); +} +#ifdef DCA_HAVE_GPU +template +inline void copyRows(const Matrix& mat_x, const Vector& i_x, + Matrix& mat_y, const Vector& i_y, const int thread_id, + const int stream_id) { + assert(i_x.size() <= i_y.size()); + assert(mat_x.nrCols() == mat_y.nrCols()); + + blas::copyRows(mat_x.nrCols(), i_x.size(), i_x.ptr(), mat_x.ptr(), mat_x.leadingDimension(), + i_y.ptr(), mat_y.ptr(), mat_y.leadingDimension(), thread_id, stream_id); + checkErrorsCudaDebug(); +} + +// Copies the i_x rows of mat_x into mat_y, for 0 <= i < i_x.size(). +// In/Out: mat_y +// Preconditions: mat_x.nrCols() == mat_y.nrCols() +// 0 <= i_x[i] < mat_x.nrRows() for 0 <= i < i_x.size(). +template +inline void copyRows(const Matrix& mat_x, const Vector& i_x, + Matrix& mat_y, const int thread_id, const int stream_id) { + assert(mat_x.nrCols() == mat_y.nrCols()); + + blas::copyRows(mat_x.nrCols(), i_x.size(), i_x.ptr(), mat_x.ptr(), mat_x.leadingDimension(), + mat_y.ptr(), mat_y.leadingDimension(), thread_id, stream_id); + checkErrorsCudaDebug(); +} +#endif // DCA_HAVE_GPU + +// Returns the difference of two matrices in terms of max_i,j(|a(i, j) - b(i, j)|). +// If the difference is larger than the threshold a std::logic_error exception is thrown, +// and if NDEBUG is not defined each difference which exceeds the threshold is printed. +// Preconditions: a.size() == b.size(). 
+template +auto difference(const Matrix& a, const Matrix& b, + double diff_threshold = 1e-3) { + auto max_diff = std::abs(Scalar(0)); + assert(a.size() == b.size()); + + for (int j = 0; j < a.nrCols(); ++j) { + for (int i = 0; i < a.nrRows(); ++i) { + max_diff = std::max(max_diff, std::abs(a(i, j) - b(i, j))); + } + } + + if (max_diff > diff_threshold) { +#ifndef NDEBUG + std::stringstream s; + for (int i = 0; i < a.nrRows(); ++i) { + for (int j = 0; j < a.nrCols(); ++j) { + if (std::abs(a(i, j) - b(i, j)) <= diff_threshold) + s << 0. << "\t"; + else + s << a(i, j) - b(i, j) << "\t"; + } + s << "\n"; + } + s << std::endl; + std::cout << s.str(); +#endif // NDEBUG + std::cerr << "matrix difference in excess of threshold!\n"; + throw std::logic_error(__FUNCTION__); + } + + return max_diff; +} +template +auto difference(const Matrix& a, const Matrix& b, + double diff_threshold = 1e-3) { + Matrix cp_a(a); + return difference(cp_a, b, diff_threshold); +} +template +auto difference(const Matrix& a, const Matrix& b, + double diff_threshold = 1e-3) { + Matrix cp_b(b); + return difference(a, cp_b, diff_threshold); +} + +// Returns the real part of a matrix. +// In: a +// TODO test. +template +Matrix real(const Matrix, CPU, ALLOC>& a) { + Matrix a_re(a.size()); + for (int j = 0; j < a.nrCols(); ++j) + for (int i = 0; i < a.nrRows(); ++i) + a_re(i, j) = std::real(a(i, j)); + return a_re; +} + +// Insert a column at position j. The data is moved accordingly. +// In/Out: mat +// Preconditions: 0 <= j < mat.nrCols() + 1. +// Postconditions: The elements of the inserted column are set to 0. 
+template +void insertCol(Matrix& mat, int j) { + assert(j >= 0 && j < mat.nrCols() + 1); + + mat.resize(std::make_pair(mat.nrRows(), mat.nrCols() + 1)); + + if (mat.nrRows() > 0 && j < mat.nrCols() - 1) + memmove(mat.ptr(0, j + 1), mat.ptr(0, j), + sizeof(Scalar) * (mat.nrCols() - 1 - j) * mat.leadingDimension()); + + for (int i = 0; i < mat.nrRows(); ++i) + mat(i, j) = 0; +} + +// Insert a row at position i. The data is moved accordingly. +// In/Out: mat +// Preconditions: 0 <= i < mat.nrRows() + 1. +// Postconditions: The elements of the inserted row are set to 0. +template +void insertRow(Matrix& mat, int i) { + assert(i >= 0 && i < mat.nrRows() + 1); + + mat.resize(std::make_pair(mat.nrRows() + 1, mat.nrCols())); + + if (i < mat.nrRows() - 1) + for (int j = 0; j < mat.nrCols(); ++j) + memmove(mat.ptr(i + 1, j), mat.ptr(i, j), sizeof(Scalar) * (mat.nrRows() - 1 - i)); + + for (int j = 0; j < mat.nrCols(); ++j) + mat(i, j) = 0; +} + +// Computes the inverse of the matrix using the LU factorization. +// In/Out: mat +// Out: ipiv, work +// Preconditions: mat is a square matrix. +// Postconditions: ipiv and work are resized to the needed dimension. +// \todo consider doing inverse at full precision reguardless of incoming Scalar precision +template class ALLOC, + template class MatrixType> +void inverse(MatrixType>& mat, Vector& ipiv, + Vector& work) { + assert(mat.is_square()); + + ipiv.resizeNoCopy(mat.nrRows()); + + lapack::UseDevice::getrf(mat.nrRows(), mat.nrCols(), mat.ptr(), + mat.leadingDimension(), ipiv.ptr()); + // Get optimal worksize. 
+ int lwork = util::getInverseWorkSize(mat); + work.resizeNoCopy(lwork); + + lapack::UseDevice::getri(mat.nrRows(), mat.ptr(), mat.leadingDimension(), ipiv.ptr(), + work.ptr(), lwork); +} + + template class ALLOC, + template class MatrixType> +void inverse(MatrixType>& mat) { + Vector ipiv; + Vector work; + inverse(mat, ipiv, work); +} + +template +void smallInverse(Matrix& m_inv, Vector& ipiv, + Vector& work) { + assert(m_inv.is_square()); + switch (m_inv.nrCols()) { + case 1: + m_inv(0, 0) = Scalar(1.) / m_inv(0, 0); + break; + case 2: { + const Scalar det = m_inv(0, 0) * m_inv(1, 1) - m_inv(0, 1) * m_inv(1, 0); + + std::swap(m_inv(0, 0), m_inv(1, 1)); + m_inv(0, 0) /= det; + m_inv(1, 1) /= det; + // Thomas this looks like your bug fix was this it? + // std::swap(m_inv(1, 0), m_inv(0, 1)); + m_inv(1, 0) /= -det; + m_inv(0, 1) /= -det; + break; + } + case 3: { + const Matrix m(m_inv); + const Scalar det = m(0, 0) * (m(1, 1) * m(2, 2) - m(2, 1) * m(1, 2)) - + m(1, 0) * (m(0, 1) * m(2, 2) - m(0, 2) * m(2, 1)) + + m(2, 0) * (m(0, 1) * m(1, 2) - m(0, 2) * m(1, 1)); + m_inv(0, 0) = (m(1, 1) * m(2, 2) - m(2, 1) * m(1, 2)) / det; + m_inv(0, 1) = -(m(0, 1) * m(2, 2) - m(0, 2) * m(2, 1)) / det; + m_inv(0, 2) = (m(0, 1) * m(1, 2) - m(0, 2) * m(1, 1)) / det; + m_inv(1, 0) = -(m(1, 0) * m(2, 2) - m(1, 2) * m(2, 0)) / det; + m_inv(1, 1) = (m(0, 0) * m(2, 2) - m(0, 2) * m(2, 0)) / det; + m_inv(1, 2) = -(m(0, 0) * m(1, 2) - m(1, 0) * m(0, 2)) / det; + m_inv(2, 0) = (m(1, 0) * m(2, 1) - m(1, 1) * m(2, 0)) / det; + m_inv(2, 1) = -(m(0, 0) * m(2, 1) - m(0, 1) * m(2, 0)) / det; + m_inv(2, 2) = (m(0, 0) * m(1, 1) - m(0, 1) * m(1, 0)) / det; + break; + } + default: + inverse(m_inv, ipiv, work); + } +} + +template +void smallInverse(Matrix& m_inv) { + Vector ipiv; + Vector work; + smallInverse(m_inv, ipiv, work); +} + +// Computes in place the inverse of mat and the determinant of the inverse. 
+// In/Out: mat +// Returns: the determinant of mat^-1 +// Precondition: mat is a non-singular real matrix. +template class MatrixType> +Scalar inverseAndDeterminant(MatrixType& mat) { + assert(mat.is_square()); + std::vector ipiv(mat.nrRows()); + + lapack::UseDevice::getrf(mat.nrRows(), mat.nrCols(), mat.ptr(), mat.leadingDimension(), + ipiv.data()); + + Scalar det = 1; + for (int i = 0; i < mat.nrCols(); ++i) { + det *= mat(i, i); + if (ipiv[i] != i + 1) + det *= -1; + } + + const int lwork = util::getInverseWorkSize(mat); + std::vector work(lwork); + lapack::UseDevice::getri(mat.nrRows(), mat.ptr(), mat.leadingDimension(), ipiv.data(), + work.data(), lwork); + + return 1. / det; +} + +// Remove the j-th column. The data is moved accordingly. +// In/Out: mat +// Preconditions: 0 <= j < mat.nrCols(). +template +void removeCol(Matrix& mat, int j) { + assert(j >= 0 && j < mat.nrCols()); + + if (mat.nrRows() > 0 && j < mat.nrCols() - 1) + memmove(mat.ptr(0, j), mat.ptr(0, j + 1), + sizeof(Scalar) * (mat.nrCols() - j - 1) * mat.leadingDimension()); + + mat.resize(std::make_pair(mat.nrRows(), mat.nrCols() - 1)); +} + +#ifdef DCA_HAVE_GPU +template +void removeCol(Matrix& mat, int j) { + assert(j >= 0 && j < mat.nrCols()); + + if (mat.nrRows() > 0 && j < mat.nrCols() - 1) + blas::moveLeft(mat.nrRows(), mat.nrCols() - j, mat.ptr(0, j), mat.leadingDimension()); + + mat.resize(std::make_pair(mat.nrRows(), mat.nrCols() - 1)); + checkErrorsCudaDebug(); +} +#endif // DCA_HAVE_GPU + +// Remove columns in range [first, last]. The data is moved accordingly. +// In/Out: mat +// Preconditions: 0 <= first, last < mat.nrCols(). 
+template +void removeCols(Matrix& mat, int first, int last) { + const int n_removed = last - first + 1; + const int n = mat.nrRows(); + const int m = mat.nrCols(); + assert(last < m and last >= first and first >= 0); + + if (n > 0 and last < m - 1) + std::memmove(mat.ptr(0, first), mat.ptr(0, last + 1), + mat.leadingDimension() * (m - last - 1) * sizeof(Scalar)); + + mat.resize(std::make_pair(n, m - n_removed)); +} + +// Remove the i-th row. The data is moved accordingly. +// In/Out: mat +// Preconditions: 0 <= i < mat.nrRows(). +template +void removeRow(Matrix& mat, int i) { + assert(i >= 0 && i < mat.nrRows()); + + if (i < mat.nrRows() - 1) + for (int j = 0; j < mat.nrCols(); ++j) + memmove(mat.ptr(i, j), mat.ptr(i + 1, j), sizeof(Scalar) * (mat.nrRows() - i - 1)); + + mat.resize(std::make_pair(mat.nrRows() - 1, mat.nrCols())); +} + +#ifdef DCA_HAVE_GPU +template +void removeRow(Matrix& mat, int i) { + assert(i >= 0 && i < mat.nrRows()); + + if (mat.nrCols() > 0 && i < mat.nrRows() - 1) + blas::moveUp(mat.nrRows() - i, mat.nrCols(), mat.ptr(i, 0), mat.leadingDimension()); + + mat.resize(std::make_pair(mat.nrRows() - 1, mat.nrCols())); + checkErrorsCudaDebug(); +} +#endif // DCA_HAVE_GPU + +// Remove rows in range [first, last]. The data is moved accordingly. +// In/Out: mat +// Preconditions: 0 <= first, last < mat.nrRows(). +template +void removeRows(Matrix& mat, int first, int last) { + const int n_removed = last - first + 1; + const int n = mat.nrRows(); + const int m = mat.nrCols(); + assert(last < n and last >= first and first >= 0); + + if (last < n - 1) + for (int j = 0; j < m; ++j) + std::memmove(mat.ptr(first, j), mat.ptr(last + 1, j), (n - last - 1) * sizeof(Scalar)); + + mat.resize(std::make_pair(n - n_removed, m)); +} + +// Remove the i-th row and the j-th column. The data is moved accordingly. +// In/Out: mat +// Preconditions: 0 <= i < mat.nrRows(), 0 <= j < mat.nrCols(). 
+template +inline void removeRowAndCol(Matrix& mat, int i, int j) { + removeRow(mat, i); + removeCol(mat, j); +} + +// Remove the i-th row and the i-th column. The data is moved accordingly. +// In/Out: mat +// Preconditions: 0 <= i < mat.nrRows(), i < mat.nrCols(). +template +inline void removeRowAndCol(Matrix& mat, int i) { + removeRowAndCol(mat, i, i); +} + +// Remove rows and columns in range [first, last]. The data is moved accordingly. +// In/Out: mat +// Preconditions: 0 <= first, last < min(mat.nrRows(), mat.nrCols()). +template +void removeRowsAndCols(Matrix& mat, int first, int last) { + removeCols(mat, first, last); + removeRows(mat, first, last); +} + +// Scales the j-th column of mat by val. +// In/Out: mat +// Preconditions: 0 <= j < mat.nrCols(). +template +inline void scaleCol(Matrix& mat, int j, Scalar val, int thread_id = 0, + int stream_id = 0) { + assert(j >= 0 && j < mat.nrCols()); + blas::UseDevice::scal(mat.nrRows(), val, mat.ptr(0, j), 1, thread_id, stream_id); +} + +// Scales the i-th row of mat by val. +// In/Out: mat +// Preconditions: 0 <= i < mat.nrRow(). +template +inline void scaleRow(Matrix& mat, int i, Scalar val, int thread_id = 0, + int stream_id = 0) { + assert(i >= 0 && i < mat.nrRows()); + blas::UseDevice::scal(mat.nrCols(), val, mat.ptr(i, 0), mat.leadingDimension(), + thread_id, stream_id); +} + +// Scales the i[k]-th row of mat by val[k] for 0 <= k < i.size(). +// In/Out: mat +// Preconditions: i.size() == val.size(), 0 <= i[k] < mat.nrRow() for 0 <= k < i.size(). 
+template +inline void scaleRows(Matrix& mat, const Vector& i, + const Vector& val, int /*thread_id*/ = 0, int /*stream_id*/ = 0) { + assert(i.size() == val.size()); + + for (int j = 0; j < mat.nrCols(); ++j) + for (int ind = 0; ind < i.size(); ++ind) + mat(i[ind], j) *= val[ind]; +} +#ifdef DCA_HAVE_GPU +template +inline void scaleRows(Matrix& mat, const Vector& i, + const Vector& val, int thread_id = 0, int stream_id = 0) { + assert(i.size() == val.size()); + + blas::scaleRows(mat.nrCols(), i.size(), i.ptr(), val.ptr(), mat.ptr(), mat.leadingDimension(), + thread_id, stream_id); + checkErrorsCudaDebug(); +} +#endif // DCA_HAVE_GPU + +// Swaps the j1-th column with the j2-th column of mat. +// In/Out: mat +// Preconditions: 0 <= j1 < mat.nrCols(), 0 <= j2 < mat_y.nrCols(). +template +inline void swapCol(Matrix& mat, int j1, int j2, int thread_id = 0, + int stream_id = 0) { + assert(j1 >= 0 && j1 < mat.nrCols()); + assert(j2 >= 0 && j2 < mat.nrCols()); + blas::UseDevice::swap(mat.nrRows(), mat.ptr(0, j1), 1, mat.ptr(0, j2), 1, thread_id, + stream_id); +} + +// Swaps the j_1[i]-th column with the j_2[i]-th column of mat, for 0 <= i < j_1.size(). +// In/Out: mat +// Preconditions: j_1.size() <= j_2.size() +// 0 <= j_1[i] < mat.nrCols() for 0 <= i < j_1.size(), +// 0 <= j_2[i] < mat.nrCols() for 0 <= i < j_1.size(). +// j_1[i] != j_1[j] for i != j, j_2[i] != j_2[j] for i != j, +// j_1[i] != j_2[j] for all i, j. +#ifdef DCA_HAVE_GPU +template +inline void swapCols(Matrix& mat, const Vector& j_1, + const Vector& j_2, int thread_id = 0, int stream_id = 0) { + assert(j_1.size() <= j_2.size()); + blas::swapCols(mat.nrRows(), j_1.size(), j_1.ptr(), j_2.ptr(), mat.ptr(), mat.leadingDimension(), + thread_id, stream_id); + checkErrorsCudaDebug(); +} +#endif // DCA_HAVE_GPU + +// Swaps the i1-th row with the i2-th row of mat. +// In/Out: mat +// Preconditions: 0 <= i1 < mat.nrRows(), 0 <= i2 < mat_y.nrRows(). 
+template +inline void swapRow(Matrix& mat, int i1, int i2, int thread_id = 0, + int stream_id = 0) { + assert(i1 >= 0 && i1 < mat.nrRows()); + assert(i2 >= 0 && i2 < mat.nrRows()); + blas::UseDevice::swap(mat.nrCols(), mat.ptr(i1, 0), mat.leadingDimension(), + mat.ptr(i2, 0), mat.leadingDimension(), thread_id, stream_id); +} + +// Swaps the i_1[i]-th row with the i_2[i]-th row of mat, for 0 <= i < i_1.size(). +// In/Out: mat +// Preconditions: i_2.size() == i_2.size() +// 0 <= i_1[i] < mat.nrRows() for 0 <= i < i_1.size(), +// 0 <= i_2[i] < mat.nrRows() for 0 <= i < i_1.size(). +// i_1[i] != i_1[j] for i != j, i_2[i] != i_2[j] for i != j, +// i_1[i] != i_2[j] for all i, j. +#ifdef DCA_HAVE_GPU +template +inline void swapRows(Matrix& mat, const Vector& i_1, + const Vector& i_2, int thread_id = 0, int stream_id = 0) { + assert(i_1.size() == i_2.size()); + blas::swapRows(mat.nrCols(), i_1.size(), i_1.ptr(), i_2.ptr(), mat.ptr(), mat.leadingDimension(), + thread_id, stream_id); + checkErrorsCudaDebug(); +} +#endif // DCA_HAVE_GPU + +// Swaps the i1-th row with the i2-th row and the i1-th column with the i2-th column of mat. +// In/Out: mat +// Preconditions: 0 <= i1 < mat.nrRows(), i1 < mat.nrCols(), +// 0 <= i2 < mat.nrRows(), i2 < mat.nrCols(). +template +inline void swapRowAndCol(Matrix& mat, int i1, int i2, int thread_id = 0, + int stream_id = 0) { + swapRow(mat, i1, i2, thread_id, stream_id); + swapCol(mat, i1, i2, thread_id, stream_id); +} + +// Performs the matrix-vector multiplication y <- alpha * op(a) * x + beta * y, +// where op(X) = X if transX == 'N', op(X) = transposed(X) if transX == 'T', and +// op(X) == conjugate_transposed(X) if transX == 'C' (X = a). +// In/Out: y ('In' only if beta != 0) +// Preconditions: transa should be one of the following: 'N', 'T' or 'C', +// a.nrRows() == y.size() if transa == 'N', a.nrCols() == y.size() otherwise, +// a.nrCols() == x.size() if transa == 'N', a.nrRows() == x.size() otherwise. 
+template +void gemv(char transa, Scalar alpha, const Matrix& a, + const Vector& x, Scalar beta, Vector& y) { + if (transa == 'N') { + assert(a.nrRows() == y.size()); + assert(a.nrCols() == x.size()); + } + else { + assert(a.nrRows() == x.size()); + assert(a.nrCols() == y.size()); + } + + int lda = a.leadingDimension(); + + blas::gemv(&transa, a.nrRows(), a.nrCols(), alpha, a.ptr(), lda, x.ptr(), 1, beta, y.ptr(), 1); +} + +// Performs the matrix-vector multiplication y <- op(a) * x, +// where op(X) = X if transX == 'N', op(X) = transposed(X) if transX == 'T', and +// op(X) == conjugate_transposed(X) if transX == 'C' (X = a). +// Out: y +// Preconditions: transa should be one of the following: 'N', 'T' or 'C', +// a.nrRows() == y.size() if transa == 'N', a.nrCols() == y.size() otherwise, +// a.nrCols() == x.size() if transa == 'N', a.nrRows() == x.size() otherwise. +template +void gemv(char transa, const Matrix& a, const Vector& x, + Vector& y) { + gemv(transa, 1., a, x, 0., y); +} + +// Performs the matrix-matrix multiplication c <- alpha * op(a) * op(b) + beta * c, +// where op(X) = X if transX == 'N', op(X) = transposed(X) if transX == 'T', and +// op(X) == conjugate_transposed(X) if transX == 'C' (X = a, b). +// In/Out: c ('In' only if beta != 0) +// Preconditions: transa and transb should be one of the following: 'N', 'T' or 'C', +// a.nrRows() == c.nrRows() if transa == 'N', a.nrCols() == c.nrRows() otherwise, +// b.nrCols() == c.nrCols() if transb == 'N', b.nrRows() == c.nrCols() otherwise, +// ka == kb, where ka = a.nrCols() if transa == 'N', ka = a.nrRows() otherwise and +// kb = b.nrRows() if transb == 'N', kb = b.nrCols() otherwise. 
+template class MatrixA, + template class MatrixB, + template class MatrixC, class ALLOC1, class ALLOC2, class ALLOC3> +void gemm(char transa, char transb, Scalar alpha, const MatrixA& a, + const MatrixB& b, Scalar beta, + MatrixC& c, int thread_id = 0, int stream_id = 0) { + int m = c.nrRows(); + int n = c.nrCols(); + int k; + + if (transa == 'N') { + assert(a.nrRows() == m); + k = a.nrCols(); + } + else { + assert(a.nrCols() == m); + k = a.nrRows(); + } + + if (transb == 'N') { + assert(b.nrRows() == k); + assert(b.nrCols() == n); + } + else { + assert(b.nrCols() == k); + assert(b.nrRows() == n); + } + + int lda = a.leadingDimension(); + int ldb = b.leadingDimension(); + int ldc = c.leadingDimension(); + + blas::UseDevice::gemm(&transa, &transb, m, n, k, alpha, a.ptr(), lda, b.ptr(), ldb, + beta, c.ptr(), ldc, thread_id, stream_id); +} + +// Performs the matrix-matrix multiplication c <- a * b +// Out: c +// Preconditions: a.nrRows() == c.nrRows(), b.nrCols() == c.nrCols() and a.nrCols() == b.nrRows() +template class MatrixA, + template class MatrixB, + template class MatrixC> +inline void gemm(const MatrixA& a, + const MatrixB& b, + MatrixC& c, int thread_id = 0, int stream_id = 0) { + gemm('N', 'N', 1., a, b, 0., c, thread_id, stream_id); +} + +// Performs the matrix-matrix multiplication c <- alpha * a * b + beta * c, +// In/Out: c ('In' only if beta != 0) +// Preconditions: a.nrRows() == c.nrRows(), b.nrCols() == c.nrCols() and a.nrCols() == b.nrRows() +template class MatrixA, + template class MatrixB, + template class MatrixC> +inline void gemm(Scalar alpha, const MatrixA& a, + const MatrixB& b, Scalar beta, + MatrixC& c, int thread_id = 0, int stream_id = 0) { + gemm('N', 'N', alpha, a, b, beta, c, thread_id, stream_id); +} + +// Performs the matrix-matrix multiplication c <- op(a) * op(b), +// where op(X) = X if transX == 'N', op(X) = transposed(X) if transX == 'T', and +// op(X) == conjugate_transposed(X) if transX == 'C' (X = a, b). 
+// Out: c +// Preconditions: transa and transb should be one of the following: 'N', 'T' or 'C', +// a.nrRows() == c.nrRows() if transa == 'N', a.nrCols() == c.nrRows() otherwise, +// b.nrCols() == c.nrCols() if transb == 'N', b.nrRows() == c.nrCols() otherwise, +// ka == kb, where ka = a.nrCols() if transa == 'N', ka = a.nrRows() otherwise and +// kb = b.nrRows() if transb == 'N', kb = b.nrCols() otherwise. +template +inline void gemm(char transa, char transb, const Matrix& a, + const Matrix& b, Matrix& c, + int thread_id = 0, int stream_id = 0) { + gemm(transa, transb, 1., a, b, 0., c, thread_id, stream_id); +} + +// Performs the triangular solve b <- a^-1 * b, +// where a is a lower triangular matrix (uplo = 'L') or an upper triangular matrix (uplo = 'U'), +// with unit diagonal (diag = "U") or with general diagonal (diag = "N") +// In/Out: b +// Preconditions: a.nrRows() == a.nrCols() , a.nrCols() == b.nrRows() +template +void trsm(char uplo, char diag, const Matrix& a, + Matrix& b, int thread_id = 0, int stream_id = 0) { + assert(uplo == 'U' or uplo == 'L'); + assert(diag == 'U' or diag == 'N'); + assert(a.nrRows() == a.nrCols()); + assert(b.nrRows() == a.nrCols()); + + blas::UseDevice::trsm("L", &uplo, "N", &diag, b.nrRows(), b.nrCols(), Scalar(1), + a.ptr(), a.leadingDimension(), b.ptr(), b.leadingDimension(), + thread_id, stream_id); +} + +// Mixed real and complex matrix-matrix multiply. +// TODO: Not sure if this are needed. +// The only file in which it may be used is basis_transformation_cd_to_ed.h. + +// Performs the matrix-matrix multiplication c <- op(a) * op(b), +// where op(X) = X if transX == 'N', op(X) = transposed(X) if transX == 'T', and +// op(X) == conjugate_transposed(X) if transX == 'C' (X = a, b). 
+// Out: c +// Preconditions: transa should be one of the following: 'N', 'T', +// transb should be one of 'N', 'T', 'C', +// a.nrRows() == c.nrRows() if transa == 'N', a.nrCols() == c.nrRows() otherwise, +// b.nrCols() == c.nrCols() if transb == 'N', b.nrRows() == c.nrCols() otherwise, +// ka == kb, where ka = a.nrCols() if transa == 'N', ka = a.nrRows() otherwise and +// kb = b.nrRows() if transb == 'N', kb = b.nrCols() otherwise. +template +void gemm(char transa, char transb, Matrix& a, + Matrix, CPU, ALLOC>& b, Matrix, CPU, ALLOC>& c) { + Matrix b_part(b.size()); + Matrix c_re(c.size()); + Matrix c_im(c.size()); + + Scalar sign = 1; + if (transb == 'C') { + sign = -1; + transb = 'T'; + } + + for (int j = 0; j < b.nrCols(); ++j) + for (int i = 0; i < b.nrRows(); ++i) + b_part(i, j) = b(i, j).real(); + + gemm(transa, transb, a, b_part, c_re); + + for (int j = 0; j < b.nrCols(); ++j) + for (int i = 0; i < b.nrRows(); ++i) + b_part(i, j) = b(i, j).imag(); + + gemm(transa, transb, sign, a, b_part, Scalar(0), c_im); + + for (int j = 0; j < c.nrCols(); ++j) + for (int i = 0; i < c.nrRows(); ++i) + c(i, j) = std::complex(c_re(i, j), c_im(i, j)); +} + +// Performs the matrix-matrix multiplication c <- op(a) * op(b), +// where op(X) = X if transX == 'N', op(X) = transposed(X) if transX == 'T', and +// op(X) == conjugate_transposed(X) if transX == 'C' (X = a, b). +// Out: c +// Preconditions: transa should be one of the following: 'N', 'T', 'C', +// transb should be one of 'N', 'T', +// a.nrRows() == c.nrRows() if transa == 'N', a.nrCols() == c.nrRows() otherwise, +// b.nrCols() == c.nrCols() if transb == 'N', b.nrRows() == c.nrCols() otherwise, +// ka == kb, where ka = a.nrCols() if transa == 'N', ka = a.nrRows() otherwise and +// kb = b.nrRows() if transb == 'N', kb = b.nrCols() otherwise. 
+template +static void gemm(char transa, char transb, Matrix, CPU, ALLOC>& a, + Matrix& b, Matrix, CPU, ALLOC>& c) { + Matrix a_part(a.size()); + Matrix c_re(c.size()); + Matrix c_im(c.size()); + + Scalar sign = 1; + if (transa == 'C') { + sign = -1; + transa = 'T'; + } + + for (int j = 0; j < a.nrCols(); ++j) + for (int i = 0; i < a.nrRows(); ++i) + a_part(i, j) = a(i, j).real(); + + gemm(transa, transb, a_part, b, c_re); + + for (int j = 0; j < a.nrCols(); ++j) + for (int i = 0; i < a.nrRows(); ++i) + a_part(i, j) = a(i, j).imag(); + + gemm(transa, transb, sign, a_part, b, Scalar(0), c_im); + + for (int j = 0; j < c.nrCols(); ++j) + for (int i = 0; i < c.nrRows(); ++i) + c(i, j) = std::complex(c_re(i, j), c_im(i, j)); +} + +// Performs the matrix-matrix multiplication c = op(a) * op(b), where each matrix is split in real +// and imaginary part. This is implemented with 3 real matrix-matrix multiplications. +// Out: c +// Preconditions: transa and transb should be one of the following: 'N', 'T', 'C'. +// a[0].size == a[1].size() +// b[0].size == b[1].size() +// c[0].size == c[1].size() +// a[0].nrRows() == c[0].nrRows() if transa == 'N', a[0].nrCols() == c[0].nrRows() +// otherwise, +// b[0].nrCols() == c[0].nrCols() if transb == 'N', b[0].nrRows() == c[0].nrCols() +// otherwise, +// ka == kb, where ka = a[0].nrCols() if transa == 'N', ka = a[0].nrRows() otherwise +// and kb = b[0].nrRows() if transb == 'N', kb = b[0].nrCols() otherwise. 
+template +void multiply(char transa, char transb, const std::array, 2>& a, + const std::array, 2>& b, + std::array, 2>& c, + std::array, 5>& work) { + assert(a[0].size() == a[1].size()); + assert(b[0].size() == b[1].size()); + assert(c[0].size() == c[1].size()); + + work[0].resizeNoCopy(c[0].size()); + work[1].resizeNoCopy(c[0].size()); + work[2].resizeNoCopy(c[0].size()); + auto& a_sum = work[3]; + auto& b_sum = work[4]; + a_sum.resizeNoCopy(a[0].size()); + b_sum.resizeNoCopy(b[0].size()); + + const Scalar signa = transa == 'C' ? transa = 'T', -1 : 1; + const Scalar signb = transb == 'C' ? transb = 'T', -1 : 1; + + for (int j = 0; j < a[0].nrCols(); ++j) + for (int i = 0; i < a[0].nrRows(); ++i) + a_sum(i, j) = a[0](i, j) + signa * a[1](i, j); + for (int j = 0; j < b[0].nrCols(); ++j) + for (int i = 0; i < b[0].nrRows(); ++i) + b_sum(i, j) = b[0](i, j) + signb * b[1](i, j); + + gemm(transa, transb, a[0], b[0], work[0]); + gemm(transa, transb, signa * signb, a[1], b[1], Scalar(0), work[1]); + gemm(transa, transb, a_sum, b_sum, work[2]); + + for (int j = 0; j < c[0].nrCols(); ++j) + for (int i = 0; i < c[0].nrRows(); ++i) { + c[0](i, j) = work[0](i, j) - work[1](i, j); + c[1](i, j) = work[2](i, j) - work[0](i, j) - work[1](i, j); + } +} + +template +void multiply(const std::array, 2>& a, + const std::array, 2>& b, + std::array, 2>& c, + std::array, 5>& work) { + multiply('N', 'N', a, b, c, work); +} + +// Performs the matrix-matrix multiplication c = op(a) * op(b), where a and c are split in real +// and imaginary part, while b is real. 
+// Out: c +// Preconditions: transa and transb should be one of the following: 'N', 'T', +// a[0].size == a[1].size() +// c[0].size == c[1].size() +// a[0].nrRows() == c[0].nrRows() if transa == 'N', a.nrCols() == c.nrRows() +// otherwise, +// b.nrCols() == c[0].nrCols() if transb == 'N', b.nrRows() == c.nrCols() +// otherwise, +// ka == kb, where ka = a[0].nrCols() if transa == 'N', ka = a[0].nrRows() otherwise +// and kb = b.nrRows() if transb == 'N', kb = b.nrCols() otherwise. +template +void multiply(char transa, char transb, const std::array, 2>& a, + const Matrix& b, std::array, 2>& c) { + assert(transa == 'N' || transa == 'T' || transa == 'C'); + assert(transb == 'N' || transb == 'T'); + assert(a[0].size() == a[1].size()); + assert(c[0].size() == c[1].size()); + + gemm(transa, transb, a[0], b, c[0]); + const Scalar sign = transa == 'C' ? transa = 'T', -1 : 1; + gemm(transa, transb, sign, a[1], b, Scalar(0), c[1]); +} + +template +void multiply(const std::array, 2>& a, + const Matrix& b, std::array, 2>& c) { + multiply('N', 'N', a, b, c); +} + +// Performs the matrix-matrix multiplication c = op(a) * op(b), where b and c are split in real +// and imaginary part, while a is real. +// Out: c +// Preconditions: transa and transb should be one of the following: 'N', 'T', +// b[0].size == b[1].size() +// c[0].size == c[1].size() +// a.nrRows() == c[0].nrRows() if transa == 'N', a.nrCols() == c.nrRows() +// otherwise, +// b.[0]nrCols() == c[0].nrCols() if transb == 'N', b.[0]nrRows() == c.nrCols() +// otherwise, +// ka == kb, where ka = a.nrCols() if transa == 'N', ka = a.nrRows() otherwise +// and kb = b.[0]nrRows() if transb == 'N', kb = b.[0]nrCols() otherwise. 
+template +void multiply(char transa, char transb, const Matrix& a, + const std::array, 2>& b, + std::array, 2>& c) { + assert(transa == 'N' || transa == 'T'); + assert(transb == 'N' || transb == 'T' || transb == 'C'); + assert(b[0].size() == b[1].size()); + assert(c[0].size() == c[1].size()); + + gemm(transa, transb, a, b[0], c[0]); + const Scalar sign = transb == 'C' ? transb = 'T', -1 : 1; + gemm(transa, transb, sign, a, b[1], Scalar(0), c[1]); +} + +template +void multiply(const Matrix& a, + const std::array, 2>& b, + std::array, 2>& c) { + multiply('N', 'N', a, b, c); +} + +// Performs the matrix-matrix multiplication b <- D * a, +// where d is a vector containing the diagonal elements of the matrix D. +// Out: b +// Preconditions: a.size() == b.size(), d.size() == a.nrRows(). +template +inline void multiplyDiagonalLeft(const Vector& d, + const Matrix& a, + Matrix& b, int thread_id = 0, + int stream_id = 0) { + lapack::UseDevice::multiplyDiagonalLeft(a.nrRows(), a.nrCols(), d.ptr(), 1, a.ptr(), + a.leadingDimension(), b.ptr(), + b.leadingDimension(), thread_id, stream_id); +} +template +inline void multiplyDiagonalLeft(const Vector& d, const Matrix& a, + Matrix& b, int thread_id = 0, int stream_id = 0) { + Vector d_gpu(d); + multiplyDiagonalLeft(d_gpu, a, b, thread_id, stream_id); +} + +// Performs the matrix-matrix multiplication b <- a * D, +// where d is a vector containing the diagonal elements of the matrix D. +// Out: b +// Preconditions: a.size() == b.size(), d.size() == a.nrCols(). 
+// NOTE(review): throughout this hunk every template parameter list and every
+// template-argument bracket pair appears to have been stripped by an extraction
+// pass (bare "template", "Matrix&" with no element/device arguments, dangling
+// ">" in "Matrix, CPU, ALLOC>&"). The declarations must be restored from the
+// DCA source this was ported from before this header can compile; the comments
+// below document only the logic that is visible here.
+
+// Computes b = a * diag(d), i.e. scales the j-th column of a by d[j],
+// delegating to the lapack::UseDevice backend.
+// Out: b (presumably pre-sized to a's dimensions by the caller -- TODO confirm).
+template
+inline void multiplyDiagonalRight(const Matrix& a,
+                                  const Vector& d,
+                                  Matrix& b, int thread_id = 0,
+                                  int stream_id = 0) {
+  lapack::UseDevice::multiplyDiagonalRight(a.nrRows(), a.nrCols(), a.ptr(),
+                                           a.leadingDimension(), d.ptr(), 1, b.ptr(),
+                                           b.leadingDimension(), thread_id, stream_id);
+}
+// Overload that first stages the diagonal vector into d_gpu (presumably a
+// host-to-device copy -- the stripped template arguments hide the device tags)
+// and then forwards to the overload above.
+template
+inline void multiplyDiagonalRight(const Matrix& a, const Vector& d,
+                                  Matrix& b, int thread_id = 0, int stream_id = 0) {
+  Vector d_gpu(d);
+  multiplyDiagonalRight(a, d_gpu, b, thread_id, stream_id);
+}
+
+// Computes the eigenvalues, the left eigenvectors (if jobvl == 'V')
+// and the right eigenvectors (if jobvr == 'V') of the real matrix a.
+// The real parts of the eigenvalues are stored in lambda_re, while the imaginary parts in
+// lambda_im.
+// If computed the left eigenvectors are stored in vl and the right eigenvectors in vr.
+// See sgeev, dgeev Lapack documentation for information about how the
+// eigenvectors are stored.
+// Out: lambda_re, lambda_im, vl, vr.
+// Precondition: jobvl == 'N' or jobvl == 'V',
+//               jobvr == 'N' or jobvr == 'V',
+//               a is a square matrix.
+// Postcondition: lambda_re, lambda_im, are resized, vl if jobvl == 'V', vr if jobvr == 'V' are
+//                resized.
+template
+void eigensolver(char jobvl, char jobvr, const Matrix& a,
+                 Vector& lambda_re, Vector& lambda_im,
+                 Matrix& vl, Matrix& vr) {
+  assert(a.is_square());
+
+  // a is copied because geev overwrites its input matrix.
+  Matrix a_copy(a);
+  lambda_re.resizeNoCopy(a_copy.nrRows());
+  lambda_im.resizeNoCopy(a_copy.nrRows());
+  // LAPACK requires ldvl/ldvr >= 1 even when the corresponding vectors are not computed.
+  int ldvl = 1;
+  int ldvr = 1;
+  if (jobvl == 'V' || jobvl == 'v') {
+    vl.resizeNoCopy(a_copy.size());
+    ldvl = vl.leadingDimension();
+  }
+  if (jobvr == 'V' || jobvr == 'v') {
+    vr.resizeNoCopy(a_copy.size());
+    ldvr = vr.leadingDimension();
+  }
+
+  // Get optimal worksize.
+  int lwork = util::getEigensolverWorkSize(jobvl, jobvr, a_copy);
+  dca::linalg::Vector work(lwork);
+
+  lapack::geev(&jobvl, &jobvr, a_copy.nrRows(), a_copy.ptr(), a_copy.leadingDimension(),
+               lambda_re.ptr(), lambda_im.ptr(), vl.ptr(), ldvl, vr.ptr(), ldvr, work.ptr(),
+               work.size());
+}
+
+// Computes the eigenvalues, the left eigenvectors (if jobvl == 'V')
+// and the right eigenvectors (if jobvr == 'V') of the complex matrix a.
+// The eigenvalues are stored in lambda.
+// If computed the left eigenvectors are stored in vl and the right eigenvectors in vr.
+// Out: lambda, vl, vr.
+// Precondition: jobvl == 'N' or jobvl == 'V',
+//               jobvr == 'N' or jobvr == 'V',
+//               a is a square matrix.
+// Postcondition: lambda, is resized, vl if jobvl == 'V', vr if jobvr == 'V' are resized.
+template
+void eigensolver(char jobvl, char jobvr, const Matrix, CPU, ALLOC>& a,
+                 Vector, CPU>& lambda,
+                 Matrix, CPU, ALLOC>& vl,
+                 Matrix, CPU, ALLOC>& vr) {
+  assert(a.is_square());
+
+  // a is copied because geev overwrites its input matrix.
+  Matrix, CPU> a_copy(a);
+  lambda.resizeNoCopy(a_copy.nrRows());
+  int ldvl = 1;
+  int ldvr = 1;
+  if (jobvl == 'V' || jobvl == 'v') {
+    vl.resizeNoCopy(a_copy.size());
+    ldvl = vl.leadingDimension();
+  }
+  if (jobvr == 'V' || jobvr == 'v') {
+    vr.resizeNoCopy(a_copy.size());
+    ldvr = vr.leadingDimension();
+  }
+
+  // Get optimal worksize.
+  int lwork = util::getEigensolverWorkSize(jobvl, jobvr, a_copy);
+  dca::linalg::Vector, CPU> work(lwork);
+  // Real workspace of size 2*n, as required by cgeev/zgeev.
+  dca::linalg::Vector rwork(2 * a_copy.nrRows());
+
+  lapack::geev(&jobvl, &jobvr, a_copy.nrRows(), a_copy.ptr(), a_copy.leadingDimension(),
+               lambda.ptr(), vl.ptr(), ldvl, vr.ptr(), ldvr, work.ptr(), work.size(), rwork.ptr());
+}
+
+// Computes the eigenvalues, and the eigenvectors (if jobv == 'V') of the real symmetric matrix a.
+// if uplo == 'U' the upper triangular part of a is referenced, whereas
+// if uplo == 'L' the lower triangular part of a is referenced.
+// The eigenvalues are stored in lambda.
+// If computed the eigenvectors are stored in v.
+// Out: lambda, v
+// Precondition: jobv == 'N' or jobv == 'V',
+//               uplo == 'U' or uplo == 'L',
+//               a is a square matrix.
+// Postcondition: lambda, and v are resized.
+template
+void eigensolverSymmetric(char jobv, char uplo, const Matrix& a,
+                          Vector& lambda, Matrix& v) {
+  assert(a.is_square());
+
+  lambda.resizeNoCopy(a.nrRows());
+  // syevd works in place: v receives a copy of a and is overwritten with the eigenvectors.
+  v = a;
+
+  // Get optimal worksize (a tuple: <0> = scalar workspace size, <1> = integer workspace size).
+  auto lwork = util::getEigensolverSymmetricWorkSize(jobv, uplo, v);
+  dca::linalg::Vector work(std::get<0>(lwork));
+  dca::linalg::Vector iwork(std::get<1>(lwork));
+
+  lapack::syevd(&jobv, &uplo, v.nrRows(), v.ptr(), v.leadingDimension(), lambda.ptr(), work.ptr(),
+                work.size(), iwork.ptr(), iwork.size());
+}
+// For real types Hermitian and symmetric is the same.
+template
+inline void eigensolverHermitian(char jobv, char uplo, const Matrix& a,
+                                 Vector& lambda, Matrix& v) {
+  eigensolverSymmetric(jobv, uplo, a, lambda, v);
+}
+
+// Computes the eigenvalues, and the eigenvectors (if jobv == 'V')
+// of the complex Hermitian matrix a.
+// if uplo == 'U' the upper triangular part of a is referenced, whereas
+// if uplo == 'L' the lower triangular part of a is referenced.
+// The eigenvalues are stored in lambda.
+// If computed the eigenvectors are stored in v.
+// Out: lambda, v
+// Precondition: jobv == 'N' or jobv == 'V',
+//               uplo == 'U' or uplo == 'L',
+//               a is a square matrix.
+// Postcondition: lambda, and v are resized.
+template
+void eigensolverHermitian(char jobv, char uplo, const Matrix, CPU, ALLOC>& a,
+                          Vector& lambda, Matrix, CPU, ALLOC>& v) {
+  assert(a.is_square());
+
+  lambda.resizeNoCopy(a.nrRows());
+  // heevd works in place: v receives a copy of a and is overwritten with the eigenvectors.
+  v = a;
+
+  // Get optimal worksize (complex, real and integer workspace sizes for heevd).
+  auto [wsize, rsize, isize] = util::getEigensolverHermitianWorkSize(jobv, uplo, v);
+  dca::linalg::Vector, CPU> work(wsize);
+  dca::linalg::Vector rwork(rsize);
+  dca::linalg::Vector iwork(isize);
+
+  lapack::heevd(&jobv, &uplo, v.nrRows(), v.ptr(), v.leadingDimension(), lambda.ptr(), work.ptr(),
+                work.size(), rwork.ptr(), rwork.size(), iwork.ptr(), iwork.size());
+}
+
+// Diagonalizes a Hermitian matrix of even order n (presumably a Green's-function
+// matrix with a 2x2 spin structure -- TODO confirm against callers).
+// For n == 2 the matrix is taken to be diagonal in spin space: the (numerically
+// real) diagonal entries become the eigenvalues and v is set to the identity,
+// skipping the LAPACK call. Otherwise falls back to eigensolverHermitian.
+// Out: lambda, v
+// Precondition: a is square, a.nrRows() is even.
+template
+void eigensolverGreensFunctionMatrix(char jobv, char uplo,
+                                     const Matrix, CPU, ALLOC>& a,
+                                     Vector& lambda,
+                                     Matrix, CPU, ALLOC>& v) {
+  assert(a.is_square());
+  int n = a.nrRows();
+  assert(n % 2 == 0);
+
+  if (n == 2) { // must be diagonal in spin space.
+    lambda.resize(2);
+    v.resize(2);
+
+    // Debug-only check that the imaginary parts of the diagonal are negligible.
+    assert(std::abs(std::imag(a(0, 0))) < 1.e-6);
+    assert(std::abs(std::imag(a(1, 1))) < 1.e-6);
+
+    lambda[0] = std::real(a(0, 0));
+    lambda[1] = std::real(a(1, 1));
+    v(0, 0) = 1.;
+    v(1, 0) = 0.;
+    v(0, 1) = 0.;
+    v(1, 1) = 1.;
+    return;
+  }
+  eigensolverHermitian(jobv, uplo, a, lambda, v);
+}
+
+// Computes the pseudo inverse of the matrix.
+// Eigenvalues of a*a' (resp. a'*a) not exceeding eps times the largest
+// eigenvalue are treated as zero (lambda[m - 1] / lambda[n - 1] is the largest:
+// LAPACK *evd returns eigenvalues in ascending order).
+// Out: a_inv
+// Postconditions: a_inv is resized to the needed dimension.
+template
+void pseudoInverse(const Matrix& a, Matrix& a_inv,
+                   double eps = 1.e-6) {
+  int m = a.nrRows();
+  int n = a.nrCols();
+  // The pseudo inverse of an m x n matrix is n x m.
+  a_inv.resizeNoCopy(std::make_pair(n, m));
+
+  using RealType = decltype(std::real(*a.ptr()));
+
+  if (m <= n) {
+    // a_inv = a'*inv(a*a')
+    // inv(a*a') = v*inv(lambda)*v', [lambda, v] = eig(a*a')
+
+    Matrix a_at("A_At", m);
+    dca::linalg::matrixop::gemm('N', 'C', a, a, a_at);
+
+    dca::linalg::Vector lambda("Lambda", m);
+    Matrix v("V", m);
+
+    eigensolverHermitian('V', 'U', a_at, lambda, v);
+    // vt keeps the unscaled eigenvectors; v's columns are scaled by 1/lambda below,
+    // so gemm(v, vt') yields v * diag(lambda_inv) * v'.
+    Matrix vt(v);
+
+    for (int j = 0; j < m; j++) {
+      Scalar lambda_inv = 0;
+
+      if (lambda[j] > eps * lambda[m - 1])
+        lambda_inv = 1. / lambda[j];
+
+      scaleCol(v, j, lambda_inv);
+    }
+
+    gemm('N', 'C', v, vt, a_at);
+    gemm('C', 'N', a, a_at, a_inv);
+  }
+  else {
+    // a_inv = inv(a'*a)*a'
+    // inv(a'*a) = v*inv(lambda)*v', [lambda, v] = eig(a'*a)
+
+    Matrix at_a("at_a", n);
+    dca::linalg::matrixop::gemm('C', 'N', a, a, at_a);
+
+    dca::linalg::Vector lambda("Lambda", n);
+    Matrix v("V", n);
+
+    eigensolverHermitian('V', 'U', at_a, lambda, v);
+    // As above: vt holds the unscaled eigenvectors.
+    Matrix vt(v);
+
+    for (int j = 0; j < n; j++) {
+      Scalar lambda_inv = 0;
+
+      if (lambda[j] > eps * lambda[n - 1])
+        lambda_inv = 1. / lambda[j];
+
+      scaleCol(v, j, lambda_inv);
+    }
+
+    gemm('N', 'C', v, vt, at_a);
+    gemm('N', 'C', at_a, a, a_inv);
+  }
+}
+
+// Computes (in place) the determinant of the matrix.
+// Returns: determinant.
+// Postcondition: M is its LU decomposition.
+template