From 674039fd2b131ce12d46d105b437265419999197 Mon Sep 17 00:00:00 2001 From: Simon Pintarelli <1237199+simonpintarelli@users.noreply.github.com> Date: Thu, 6 Jun 2024 10:00:31 +0200 Subject: [PATCH] distributed wfc support / misc updates (#20) * remove enable_language(CUDA) * enable distributed wave-functions * update spack recipe for intel-oneapi-mkl * cmake: simplify blas/mkl * add check-format * remove kokkos <4 initialization --- .github/workflows/check-format.yml | 15 ++ .gitignore | 1 + CMakeLists.txt | 26 ++- check_format.sh | 15 ++ cmake/nlcglib_macros.cmake | 28 +-- include/interface.hpp | 32 ++- include/nlcglib.hpp | 18 +- spack/packages/nlcglib/package.py | 29 +++ src/CMakeLists.txt | 11 - src/constants.hpp | 6 +- src/dft/newton_minimization_smearing.hpp | 4 +- src/exceptions.hpp | 2 +- src/exec_space.hpp | 4 +- src/free_energy.hpp | 3 +- src/geodesic.hpp | 1 - src/gpu/acc.cpp | 2 +- src/gpu/acc.hpp | 6 +- src/hip/hip_space.hpp | 2 +- src/la/cublas.hpp | 7 +- src/la/dvector.hpp | 39 +++- src/la/lapack.hpp | 13 +- src/la/lapack_cpu.hpp | 105 ++++----- src/la/lapack_cuda.hpp | 91 ++++---- src/la/lapack_rocm.hpp | 100 ++++----- src/la/magma.cpp | 14 +- src/la/magma.hpp | 17 +- src/la/mvector.hpp | 248 ++++++++++----------- src/la/rocblas.hpp | 5 +- src/la/utils.hpp | 25 +-- src/linesearch/linesearch.hpp | 55 ++--- src/mpi/communicator.hpp | 25 +++ src/mpi/mpi_type.hpp | 18 +- src/mvp2/descent_direction_impl.hpp | 2 - src/nlcglib.cpp | 28 +-- src/operator.hpp | 1 - src/overlap.hpp | 21 +- src/preconditioner.hpp | 29 ++- src/pseudo_hamiltonian/grad_eta.hpp | 11 +- src/smearing.hpp | 11 +- src/traits.hpp | 30 ++- src/ultrasoft_precond.hpp | 23 +- src/utils/format.hpp | 9 +- src/utils/logger.hpp | 15 +- src/utils/step_logger.hpp | 33 +-- src/utils/timer.hpp | 8 +- test/test.cpp | 33 +-- test/test3.cpp | 4 +- test/test_buffer.cpp | 3 +- test/test_deep_copy.cpp | 28 ++- test/test_dvector_mirror_view_and_copy.cpp | 20 +- test/test_gpu.cpp | 12 +- test/test_kokkos.cpp | 65 +++--- test/test_mpi_wrapper.cpp | 9 +- test/test_mvector_cuda.cpp | 9 +- test/test_smearing.cpp | 21 +- test/test_solver.cpp | 17 +- test/test_stride_dvector.cpp | 47 ++-- test/test_threaded.cpp | 5 +- test/test_traits.cpp | 12 +- test/test_utils.cpp | 13 +- unit_tests/CMakeLists.txt | 2 +- unit_tests/local/test_la_wrappers.cpp | 52 ++--- unit_tests/local/test_solver_wrappers.cpp | 18 +- 63 files changed, 816 insertions(+), 742 deletions(-) create mode 100644 .github/workflows/check-format.yml create mode 100755 check_format.sh diff --git a/.github/workflows/check-format.yml b/.github/workflows/check-format.yml new file mode 100644 index 0000000..3423c0b --- /dev/null +++ b/.github/workflows/check-format.yml @@ -0,0 +1,15 @@ +name: Check source code formatting + +on: + push: {} + pull_request: {} + +jobs: + check: + runs-on: ubuntu-latest + container: zhongruoyu/llvm-ports:17.0.4-slim-focal + steps: + - uses: actions/checkout@v4 + - name: Check .cpp and .hpp files + run: | + ./check_format.sh diff --git a/.gitignore b/.gitignore index b50ee76..4a8fa95 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,4 @@ spack-build-* *~undo-tree~ __pycache__/ compile_commands.json +build-linux-* diff --git a/CMakeLists.txt b/CMakeLists.txt index 8971a32..099f063 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -7,6 +7,7 @@ set(USE_OPENMP On CACHE BOOL "use OpenMP") set(USE_CUDA Off CACHE BOOL "use cuda") set(USE_ROCM Off CACHE BOOL "use amd gpus") set(USE_MAGMA Off CACHE BOOL "use magma eigensolver for amd gpus") 
+set(USE_GPU_DIRECT Off CACHE BOOL "use gpu direct") set(BUILD_TESTS OFF CACHE BOOL "build tests") set(LAPACK_VENDOR "OpenBLAS" CACHE STRING "lapack vendor") @@ -21,22 +22,19 @@ endif() set(CMAKE_POSITION_INDEPENDENT_CODE ON) set(CMAKE_EXPORT_COMPILE_COMMANDS "YES") -# set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall") include(cmake/nlcglib_macros.cmake) list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake/modules") -find_package(Kokkos) if(USE_CUDA) find_package(CUDAToolkit REQUIRED) - enable_language(CUDA) include(cmake/cudalibs_target.cmake) endif() - if(USE_ROCM) include(cmake/rocmlibs_target.cmake) endif() +find_package(Kokkos) if(USE_MAGMA) find_package(MAGMA REQUIRED) @@ -53,12 +51,24 @@ if(LAPACK_VENDOR MATCHES OpenBLAS) INTERFACE_INCLUDE_DIRECTORIES "${OpenBLAS_INCLUDE_DIRS}" INTERFACE_LINK_LIBRARIES "${OpenBLAS_LIBRARIES}") endif() -elseif(LAPACK_VENDOR MATCHES MKL) - message("LAPACK VENDOR MKL") - find_package(MKL REQUIRED) elseif(LAPACK_VENDOR STREQUAL MKLONEAPI) - # set(MKL_THREADING gnu_thread) + message("LAPACK VENDOR MKL") + set(MKL_INTERFACE "lp64" CACHE STRING "") + set(MKL_THREADING "sequential" CACHE STRING "") + set(MKL_MPI "mpich" CACHE STRING "") + find_package(MKL CONFIG REQUIRED) + if(NOT TARGET nlcg::cpu_lapack) + add_library(nlcg::cpu_lapack INTERFACE IMPORTED) + target_link_libraries(nlcg::cpu_lapack INTERFACE MKL::MKL) + target_compile_definitions(nlcg::cpu_lapack INTERFACE __USE_MKL) + endif() +elseif(LAPACK_VENDOR STREQUAL MKL) find_package(MKL REQUIRED NO_MODULE) + if(NOT TARGET nlcg::cpu_lapack) + add_library(nlcg::cpu_lapack INTERFACE IMPORTED) + target_link_libraries(nlcg::cpu_lapack INTERFACE mkl::mkl_intel_32bit_omp_dyn) + target_compile_definitions(nlcg::cpu_lapack INTERFACE __USE_MKL) + endif() elseif(LAPACK_VENDOR STREQUAL CRAY_LIBSCI) message("LAPACK VENDOR Cray Libsci") find_package(SCI REQUIRED) diff --git a/check_format.sh b/check_format.sh new file mode 100755 index 0000000..c4d05f0 --- /dev/null +++ b/check_format.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +check_diff() { + local status=0 + for file in "$@"; do + if ! diff -q "$file" <(clang-format "$file"); then + status=1 + fi + done + return $status +} + +export -f check_diff + +find . -type f \( -name "*.cpp" -o -name "*.hpp" \) ! -path "./build-env/*" ! 
-path "./.*" -exec bash -c 'check_diff "$@"' sh {} + diff --git a/cmake/nlcglib_macros.cmake b/cmake/nlcglib_macros.cmake index b956797..9852bce 100644 --- a/cmake/nlcglib_macros.cmake +++ b/cmake/nlcglib_macros.cmake @@ -2,42 +2,30 @@ MACRO(NLCGLIB_SETUP_TARGET _target) target_link_libraries( ${_target} PUBLIC Kokkos::kokkos - # ${LAPACK_LIBRARIES} MPI::MPI_CXX - # $ + $ $ $ $ $ # only required for magma $ # only required for magma nlohmann_json::nlohmann_json - ) + nlcg::cpu_lapack + ) target_include_directories(${_target} PUBLIC ${CMAKE_SOURCE_DIR}/src ${CMAKE_SOURCE_DIR}/include - ) + ) - if(USE_ROCM) - target_compile_options(${_target} PUBLIC --offload-arch=gfx90a) - endif() - - if(LAPACK_VENDOR MATCHES MKL) - target_compile_definitions(${_target} PUBLIC __USE_MKL) - # if(USE_OPENMP) - target_link_libraries(${_target} PUBLIC mkl::mkl_intel_32bit_omp_dyn) - # else() - # target_link_libraries(${_target} PUBLIC mkl::mkl_intel_32bit_seq_dyn) - # endif() - elseif(LAPACK_VENDOR STREQUAL MKLONEAPI) - target_link_libraries(${_target} PUBLIC MKL::MKL) - else() - target_link_libraries(${_target} PRIVATE nlcg::cpu_lapack) + if(USE_ROCM) + target_compile_options(${_target} PUBLIC --offload-arch=gfx90a) endif() + target_compile_definitions(${_target} PUBLIC $<$:__USE_OPENMP>) target_compile_definitions(${_target} PUBLIC $<$:__NLCGLIB__CUDA>) target_compile_definitions(${_target} PUBLIC $<$:__NLCGLIB__ROCM>) target_compile_definitions(${_target} PUBLIC $<$:__NLCGLIB__MAGMA>) + target_compile_definitions(${_target} PUBLIC $<$:__NLCGLIB__GPU_DIRECT>) target_include_directories(${_target} PUBLIC $) - ENDMACRO() diff --git a/include/interface.hpp b/include/interface.hpp index a5dd4fe..2fe525f 100644 --- a/include/interface.hpp +++ b/include/interface.hpp @@ -2,10 +2,10 @@ #include #include -#include +#include #include +#include #include -#include #include #include "mpi.h" @@ -18,9 +18,8 @@ enum class memory_type device }; -static std::map memory_names = {{memory_type::none, "none"}, - {memory_type::host, "host"}, - {memory_type::device, "device"}}; +static std::map memory_names = { + {memory_type::none, "none"}, {memory_type::host, "host"}, {memory_type::device, "device"}}; enum class smearing_type { @@ -50,20 +49,18 @@ struct buffer_protocol std::array size, T* data, enum memory_type memtype, - MPI_Comm mpi_comm=MPI_COMM_SELF) + MPI_Comm mpi_comm = MPI_COMM_SELF) : stride(std::move(stride)) , size(std::move(size)) , data(data) , memtype(memtype) , mpi_comm(mpi_comm) - { /* empty */ } + { /* empty */ + } // 1d constructor // template> - buffer_protocol(int size, - T* data, - enum memory_type memtype, - MPI_Comm mpi_comm= MPI_COMM_SELF) + buffer_protocol(int size, T* data, enum memory_type memtype, MPI_Comm mpi_comm = MPI_COMM_SELF) : buffer_protocol({1}, {size}, data, memtype, mpi_comm) { static_assert(d == 1, "not available."); @@ -79,7 +76,7 @@ struct buffer_protocol MPI_Comm mpi_comm{MPI_COMM_SELF}; }; -template +template class BufferBase { public: @@ -100,7 +97,7 @@ class BufferBase virtual kindex_t kpoint_index(int i) const = 0; }; -template +template class BufferBase<0, numeric_t> { public: @@ -140,23 +137,24 @@ class EnergyBase virtual std::shared_ptr get_sphi(memory_type) = 0; virtual std::shared_ptr get_C(memory_type) = 0; virtual std::shared_ptr get_fn() = 0; - virtual void set_fn(const std::vector>&, const std::vector>&) = 0; + virtual void set_fn(const std::vector>&, + const std::vector>&) = 0; virtual std::shared_ptr get_ek() = 0; virtual std::shared_ptr get_gkvec_ekin() = 0; virtual 
std::shared_ptr get_kpoint_weights() = 0; virtual void set_chemical_potential(double) = 0; virtual double get_chemical_potential() = 0; virtual void print_info() const = 0; + virtual MPI_Comm comm_world() const = 0; }; class OpBase { public: using key_t = std::pair; + public: - virtual void apply(const key_t&, - MatrixBaseZ::buffer_t& out, - MatrixBaseZ::buffer_t& in) const = 0; + virtual void apply(const key_t&, MatrixBaseZ::buffer_t& out, MatrixBaseZ::buffer_t& in) const = 0; virtual std::vector get_keys() const = 0; }; diff --git a/include/nlcglib.hpp b/include/nlcglib.hpp index 9bc3f1e..bb5a97e 100644 --- a/include/nlcglib.hpp +++ b/include/nlcglib.hpp @@ -4,8 +4,10 @@ namespace nlcglib { -void initialize(); -void finalize(); +void +initialize(); +void +finalize(); nlcg_info nlcg_mvp2_cpu(EnergyBase& energy_base, @@ -28,12 +30,12 @@ nlcg_mvp2_device(EnergyBase& energy_base, nlcg_info nlcg_mvp2_cpu_device(EnergyBase& energy_base, - smearing_type smearing, - double temp, - double tol, - double kappa, - double tau, - int maxiter, + smearing_type smearing, + double temp, + double tol, + double kappa, + double tau, + int maxiter, int restart); nlcg_info nlcg_mvp2_device_cpu(EnergyBase& energy_base, diff --git a/spack/packages/nlcglib/package.py b/spack/packages/nlcglib/package.py index 47a54db..a140d7a 100644 --- a/spack/packages/nlcglib/package.py +++ b/spack/packages/nlcglib/package.py @@ -28,6 +28,7 @@ class Nlcglib(CMakePackage, CudaPackage, ROCmPackage): description="CMake build type", values=("Debug", "Release", "RelWithDebInfo"), ) + variant("gpu_direct", default=False) depends_on("cmake@3.21:", type="build") depends_on("mpi") @@ -38,6 +39,10 @@ class Nlcglib(CMakePackage, CudaPackage, ROCmPackage): depends_on("googletest", type="build", when="+tests") depends_on("nlohmann-json") + depends_on("kokkos@4:", when="@1.1:") + + # MKLConfig.cmake introduced in 2021.3 + conflicts("intel-oneapi-mkl@:2021.2", when="^intel-oneapi-mkl") with when("@:0.9"): conflicts("+rocm") @@ -59,6 +64,7 @@ def cmake_args(self): self.define_from_variant("USE_OPENMP", "openmp"), self.define_from_variant("BUILD_TESTS", "tests"), self.define_from_variant("USE_ROCM", "rocm"), + self.define_from_variant("USE_GPU_DIRECT", "gpu_direct"), self.define_from_variant("USE_MAGMA", "magma"), self.define_from_variant("USE_CUDA", "cuda"), ] @@ -67,6 +73,29 @@ def cmake_args(self): options += [self.define("LAPACK_VENDOR", "MKL")] elif self.spec["blas"].name in ["intel-oneapi-mkl"]: options += [self.define("LAPACK_VENDOR", "MKLONEAPI")] + mkl_mapper = { + "threading": { + "none": "sequential", + "openmp": "gnu_thread", + "tbb": "tbb_thread", + }, + "mpi": {"intel-mpi": "intelmpi", "mpich": "mpich", "openmpi": "openmpi"}, + } + + mkl_threads = mkl_mapper["threading"][self.spec["intel-oneapi-mkl"].variants["threads"].value] + + mpi_provider = self.spec["mpi"].name + if mpi_provider in ["mpich", "cray-mpich", "mvapich", "mvapich2"]: + mkl_mpi = mkl_mapper["mpi"]["mpich"] + else: + mkl_mpi = mkl_mapper["mpi"][mpi_provider] + + options.extend([ + self.define("MKL_INTERFACE", "lp64"), + self.define("MKL_THREADING", mkl_threads), + self.define("MKL_MPI", mkl_mpi) + ]) + elif self.spec["blas"].name in ["openblas"]: options += [self.define("LAPACK_VENDOR", "OpenBLAS")] else: diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 1fb6969..c50cfe1 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -11,17 +11,6 @@ add_library(nlcglib SHARED nlcglib.cpp) target_include_directories(nlcglib PUBLIC $ $) 
target_link_libraries(nlcglib PRIVATE nlcglib_core) -if(LAPACK_VENDOR MATCHES MKL) - target_compile_definitions(nlcglib PRIVATE __USE_MKL) - if(USE_OPENMP) - target_link_libraries(nlcglib PRIVATE mkl::mkl_intel_32bit_omp_dyn) - else() - target_link_libraries(nlcglib PRIVATE mkl::mkl_intel_32bit_seq_st) - endif() -else() - target_link_libraries(nlcglib PRIVATE nlcg::cpu_lapack) -endif() - set_target_properties(nlcglib PROPERTIES PUBLIC_HEADER ${CMAKE_SOURCE_DIR}/include/nlcglib.hpp ) diff --git a/src/constants.hpp b/src/constants.hpp index b87b468..e54711d 100644 --- a/src/constants.hpp +++ b/src/constants.hpp @@ -4,10 +4,10 @@ namespace nlcglib { namespace constants { const double pi{3.1415926535897932385}; -} // constants +} // namespace constants namespace physical_constants { const double kb{0.00000316681156340226}; -} // physical_constants +} // namespace physical_constants -} // nlcglib +} // namespace nlcglib diff --git a/src/dft/newton_minimization_smearing.hpp b/src/dft/newton_minimization_smearing.hpp index 583cf29..733a44d 100644 --- a/src/dft/newton_minimization_smearing.hpp +++ b/src/dft/newton_minimization_smearing.hpp @@ -28,8 +28,8 @@ double newton_minimization_chemical_potential( Nt&& N, DNt&& dN, D2Nt&& ddN, double mu0, double ne, double tol, int maxstep = 1000) { - // Newton finds the minimum, not necessarily N(mu) == ne, tolerate up to `tol_ne` difference in number of electrons - // if |N(mu_0) -ne| > tol_ne an error is thrown. + // Newton finds the minimum, not necessarily N(mu) == ne, tolerate up to `tol_ne` difference in + // number of electrons if |N(mu_0) -ne| > tol_ne an error is thrown. const double tol_ne = 1e-2; double mu = mu0; diff --git a/src/exceptions.hpp b/src/exceptions.hpp index 065454b..d9f8fbe 100644 --- a/src/exceptions.hpp +++ b/src/exceptions.hpp @@ -25,4 +25,4 @@ class DescentError : public std::exception }; -} // nlcglib +} // namespace nlcglib diff --git a/src/exec_space.hpp b/src/exec_space.hpp index 5ba443e..8d2ae5b 100644 --- a/src/exec_space.hpp +++ b/src/exec_space.hpp @@ -30,7 +30,7 @@ template <> struct exec { #ifdef __USE_OPENMP - using type = Kokkos::OpenMP; + using type = Kokkos::OpenMP; #else using type = Kokkos::Serial; #endif @@ -40,4 +40,4 @@ template using exec_t = typename exec::type; -} // nlcglib +} // namespace nlcglib diff --git a/src/free_energy.hpp b/src/free_energy.hpp index d4e1e43..22c6e67 100644 --- a/src/free_energy.hpp +++ b/src/free_energy.hpp @@ -92,7 +92,8 @@ FreeEnergy::compute(const mvector& X, const mvector& fn, const mvector - #include "la/lapack.hpp" #include "la/utils.hpp" diff --git a/src/gpu/acc.cpp b/src/gpu/acc.cpp index c3573a0..1a2e972 100644 --- a/src/gpu/acc.cpp +++ b/src/gpu/acc.cpp @@ -2,9 +2,9 @@ #include #include #include +#include #include #include -#include namespace nlcglib { #define CALL_DEVICE_API(func__, args__) \ diff --git a/src/gpu/acc.hpp b/src/gpu/acc.hpp index c962c0d..43cdd6f 100644 --- a/src/gpu/acc.hpp +++ b/src/gpu/acc.hpp @@ -5,9 +5,9 @@ namespace nlcglib { namespace acc { -template +template void copy(T* target, const U* src, size_t n); -} // acc -} // nlcglib +} // namespace acc +} // namespace nlcglib diff --git a/src/hip/hip_space.hpp b/src/hip/hip_space.hpp index 66d9251..6d9e2cb 100644 --- a/src/hip/hip_space.hpp +++ b/src/hip/hip_space.hpp @@ -3,5 +3,5 @@ #include #else #include -#endif // KOKKOS > 3.7.01 +#endif // KOKKOS > 3.7.01 #endif diff --git a/src/la/cublas.hpp b/src/la/cublas.hpp index 030ce6d..01ce63b 100644 --- a/src/la/cublas.hpp +++ b/src/la/cublas.hpp @@ -12,11 
+12,12 @@ struct cublasHandle
     static cublasHandle_t handle{nullptr};
     return handle;
   }
+
 public:
   static cublasHandle_t& get()
   {
     cublasHandle_t& handle = _get();
-    if(!handle) {
+    if (!handle) {
       cublasCreate(&handle);
     }
     return handle;
@@ -24,10 +25,10 @@ struct cublasHandle
   static void destroy()
   {
-    if(!_get()) cublasDestroy(_get());
+    if (_get()) cublasDestroy(_get());
     _get() = nullptr;
   }
 };
-}  // cublas
+}  // namespace cublas
diff --git a/src/la/dvector.hpp b/src/la/dvector.hpp
index 1150061..c82db8a 100644
--- a/src/la/dvector.hpp
+++ b/src/la/dvector.hpp
@@ -1,14 +1,14 @@
 #pragma once
 
 #include
-#include "hip/hip_space.hpp"
 #include
 #include
 #include
 #include
 #include
+#include "hip/hip_space.hpp"
+#include "interface.hpp"
 #include "map.hpp"
-#include "nlcglib.hpp"
 
 namespace nlcglib {
@@ -346,5 +346,40 @@ print(const KokkosDVector& mat, O&& out, int precision = 4)
   }
 }
 
+template
+void
+allreduce(KokkosDVector& C, const Communicator& comm)
+{
+  typename KokkosDVector::storage_t::execution_space ex;
+  ex.fence();
+#ifdef __NLCGLIB__GPU_DIRECT
+  auto C_ptr = C.array().data();
+  int m = C.map().nrows();
+  int n = C.map().ncols();
+  if (C.array().stride(0) != 1) {
+    throw std::runtime_error("allreduce, expected stride(0) == 1");
+  }
+  if (C.array().stride(1) != m) {
+    throw std::runtime_error("allreduce, expected stride(1) == nrows");
+  }
+
+  comm.allreduce(C_ptr, m * n, mpi_op::sum);
+#else
+  auto C_h = create_mirror_view_and_copy(Kokkos::HostSpace(), C);
+  auto C_ptr = C_h.array().data();
+  int m = C_h.map().nrows();
+  int n = C_h.map().ncols();
+  if (static_cast(C_h.array().stride(0)) != 1) {
+    throw std::runtime_error("allreduce, expected stride(0) == 1");
+  }
+  if (static_cast(C_h.array().stride(1)) != m) {
+    throw std::runtime_error("allreduce, expected stride(1) == nrows");
+  }
+  comm.allreduce(C_ptr, m * n, mpi_op::sum);
+  // copy back to original memory
+  deep_copy(C, C_h);
+#endif
+}
+
 }  // namespace nlcglib
diff --git a/src/la/lapack.hpp b/src/la/lapack.hpp
index bf411c3..662a5b5 100644
--- a/src/la/lapack.hpp
+++ b/src/la/lapack.hpp
@@ -1,10 +1,12 @@
 #pragma once
 
-#include "hip/hip_space.hpp"
+#include
 #include
 #include
+#include "hip/hip_space.hpp"
 #include "la/map.hpp"
 #include "lapack_cpu.hpp"
+#include "mpi/communicator.hpp"
 #ifdef __NLCGLIB__CUDA
 #include "lapack_cuda.hpp"
 #endif
@@ -240,7 +242,7 @@ struct inner_
   {
     int n = A.map().ncols();
     int m = B.map().ncols();
-    Map map(A.map().comm(), SlabLayoutV({{0, 0, n, m}}));
+    Map map(Communicator(), SlabLayoutV({{0, 0, n, m}}));
     to_layout_left_t C(map);
     inner(C, A, B, alpha, beta);
     return C;
@@ -250,7 +252,7 @@ struct inner_
 /// Hermitian inner product, summed
 struct innerh_tr
 {
-#if defined(__NLCGLIB__CUDA) || defined(__NLCGLIB__ROCM)
+#if defined(__NLCGLIB__CUDA) || defined(__NLCGLIB__ROCM)
   template
   std::enable_if_t<
     !Kokkos::SpaceAccessibility::accessible,
@@ -285,6 +287,9 @@ struct innerh_tr
       Kokkos::RangePolicy>(0, nrows),
       KOKKOS_LAMBDA(int i, T& lsum) { lsum += tmp(i); },
       sum);
+    exec_t spc;
+    spc.fence();
+    sum = X.map().comm().allreduce(sum, mpi_op::sum);
     return sum;
   }
 #endif
@@ -321,7 +326,7 @@ struct innerh_tr
       Kokkos::RangePolicy(0, nrows),
       KOKKOS_LAMBDA(int i, T& lsum) { lsum += tmp(i); },
       sum);
-
+    sum = X.map().comm().allreduce(sum, mpi_op::sum);
     return sum;
   }
 };
diff --git a/src/la/lapack_cpu.hpp b/src/la/lapack_cpu.hpp
index cb0e2e6..9b286e9 100644
--- a/src/la/lapack_cpu.hpp
+++ b/src/la/lapack_cpu.hpp
@@ -47,7 +47,7 @@ eigh(KokkosDVector& U,
     );
     if (info != 0) throw std::runtime_error("cblas zheevd failed");
   } else {
-    throw std::runtime_error("not yet implemented");
+    throw std::runtime_error("eigh: not yet implemented");
   }
 }
 
@@ -81,7 +81,7 @@ solve_sym(KokkosDVector& A,
     KokkosDVector potrs_t;
     potrs_t::call(order, uplo, n, nrhs, ptr_A, lda, ptr_B, ldb);
   } else {
-    throw std::runtime_error("not implemented");
+    throw std::runtime_error("solve_sym: not implemented");
   }
 }
 
@@ -120,39 +120,36 @@ inner(KokkosDVector& C,
   static_assert(std::is_same::value, "matrix layout do not match");
 
   // single rank
-  if (A.map().is_local() && B.map().is_local() && C.map().is_local()) {
-    int m = A.map().ncols();
-    int k = A.map().nrows();
-    int n = B.map().ncols();
-    numeric_t* A_ptr = A.array().data();
-    numeric_t* B_ptr = B.array().data();
-    numeric_t* C_ptr = C.array().data();
-
-    if (A.array().stride(0) != 1 || B.array().stride(0) != 1 || C.array().stride(0) != 1) {
-      throw std::runtime_error("expecting column major layout");
-    }
-    int lda = A.array().stride(1);
-    int ldb = B.array().stride(1);
-    int ldc = C.array().stride(1);
-
-    // single rank inner product
-    cblas::gemm::call(CblasColMajor,
-                      cblas::gemm::H,
-                      CblasNoTrans,
-                      m,
-                      n,
-                      k,
-                      alpha,
-                      A_ptr,
-                      lda,
-                      B_ptr,
-                      ldb,
-                      beta,
-                      C_ptr,
-                      ldc);
-  } else {
-    throw std::runtime_error("not implemented.");
+  int m = A.map().ncols();
+  int k = A.map().nrows();
+  int n = B.map().ncols();
+  numeric_t* A_ptr = A.array().data();
+  numeric_t* B_ptr = B.array().data();
+  numeric_t* C_ptr = C.array().data();
+
+  if (A.array().stride(0) != 1 || B.array().stride(0) != 1 || C.array().stride(0) != 1) {
+    throw std::runtime_error("expecting column major layout");
   }
+  int lda = A.array().stride(1);
+  int ldb = B.array().stride(1);
+  int ldc = C.array().stride(1);
+
+  // single rank inner product
+  cblas::gemm::call(CblasColMajor,
+                    cblas::gemm::H,
+                    CblasNoTrans,
+                    m,
+                    n,
+                    k,
+                    alpha,
+                    A_ptr,
+                    lda,
+                    B_ptr,
+                    ldb,
+                    beta,
+                    C_ptr,
+                    ldc);
+  allreduce(C, A.map().comm());
 }
 
 /// Inner product: c = a^H * b, on CPU
@@ -220,7 +217,7 @@ outer(KokkosDVector& C,
       C_ptr,
       ldc);
   } else {
-    throw std::runtime_error("not implemented.");
+    throw std::runtime_error("outer: not implemented.");
   }
 }
 
@@ -257,7 +254,7 @@ transform(KokkosDVector& C,
                 "a,b not on same memory");
   static_assert(std::is_same::value, "matrix layout do not match");
 
-  if (A.map().is_local() && B.map().is_local() && C.map().is_local()) {
+  if (B.map().is_local()) {
     /* single rank */
     int m = A.map().nrows();
     int n = B.map().ncols();
@@ -267,7 +264,7 @@ transform(KokkosDVector& C,
     numeric_t* C_ptr = C.array().data();
 
     if (A.array().stride(0) != 1 || B.array().stride(0) != 1 || C.array().stride(0) != 1) {
-      throw std::runtime_error("expecting column major layout");
+      throw std::runtime_error("transform: expecting column major layout");
     }
     int lda = A.array().stride(1);
     int ldb = B.array().stride(1);
@@ -289,7 +286,7 @@ transform(KokkosDVector& C,
       C_ptr,
       ldc);
   } else {
-    throw std::runtime_error("not implemented.");
+    throw std::runtime_error("transform: not implemented.");
   }
 }
 
@@ -310,26 +307,22 @@ add(M0& C,
                 typename vector1_t::storage_t::memory_space>::value,
                 "c,a not on same memory");
 
-  if (A.map().is_local() && C.map().is_local()) {
-    /* single rank */
-    int m = A.map().nrows();
-    int n = C.map().ncols();
-    numeric_t* A_ptr = A.array().data();
-    numeric_t* C_ptr = C.array().data();
-
-    if (A.array().stride(0) != 1 || C.array().stride(0) != 1) {
-      throw
std::runtime_error("expecting column major layout"); - } - // assume there are no strides - int lda = A.array().stride(1); - int ldc = C.array().stride(1); - - using geam = cblas::geam; - geam::call( - CblasColMajor, geam::N, geam::N, m, n, alpha, A_ptr, lda, beta, C_ptr, ldc, C_ptr, ldc); - } else { - throw std::runtime_error("not implemented."); + if (A.array().stride(0) != 1 || C.array().stride(0) != 1) { + throw std::runtime_error("expecting column major layout"); } + // assume there are no strides + int lda = A.array().stride(1); + int ldc = C.array().stride(1); + + using geam = cblas::geam; + geam::call( + CblasColMajor, geam::N, geam::N, m, n, alpha, A_ptr, lda, beta, C_ptr, ldc, C_ptr, ldc); } } // namespace nlcglib diff --git a/src/la/lapack_cuda.hpp b/src/la/lapack_cuda.hpp index 65dfcc1..b1acf6c 100644 --- a/src/la/lapack_cuda.hpp +++ b/src/la/lapack_cuda.hpp @@ -34,7 +34,8 @@ eigh(KokkosDVector& U, /// stores result in RHS, after the call A will contain the cholesky factorization of a template -std::enable_if_t::storage_t::memory_space, Kokkos::CudaSpace>::value> +std::enable_if_t::storage_t::memory_space, + Kokkos::CudaSpace>::value> cholesky(KokkosDVector& A) { if (A.map().is_local()) { @@ -56,9 +57,9 @@ cholesky(KokkosDVector& A) /// stores result in RHS, after the call A will contain the cholesky factorization of a template -std::enable_if_t::storage_t::memory_space, Kokkos::CudaSpace>::value> -solve_sym(KokkosDVector& A, - KokkosDVector& RHS) +std::enable_if_t::storage_t::memory_space, + Kokkos::CudaSpace>::value> +solve_sym(KokkosDVector& A, KokkosDVector& RHS) { if (A.map().is_local() && RHS.map().is_local()) { typedef KokkosDVector vector_t; @@ -101,38 +102,30 @@ inner(M0& c, typename M2::storage_t::memory_space>::value, "a,b not on same memory"); // static_assert(std::is_same::value, "matrix layout do not match"); - if (a.map().is_local() && b.map().is_local() && c.map().is_local()) { - if (a.array().stride(0) != 1 || b.array().stride(0) != 1 || c.array().stride(0) != 1) { - throw std::runtime_error("expecting column major layout"); - } - int m = a.map().ncols(); - int k = a.map().nrows(); - int n = b.map().ncols(); - numeric_t* A_ptr = a.array().data(); - numeric_t* B_ptr = b.array().data(); - numeric_t* C_ptr = c.array().data(); + if (a.array().stride(0) != 1 || b.array().stride(0) != 1 || c.array().stride(0) != 1) { + throw std::runtime_error("expecting column major layout"); + } - int lda = a.array().stride(1); - int ldb = b.array().stride(1); - int ldc = c.array().stride(1); + int m = a.map().ncols(); + int k = a.map().nrows(); + int n = b.map().ncols(); + numeric_t* A_ptr = a.array().data(); + numeric_t* B_ptr = b.array().data(); + numeric_t* C_ptr = c.array().data(); - using gemm = cuda::gemm; - gemm::call(gemm::H, gemm::N, m, n, k, alpha, A_ptr, lda, B_ptr, ldb, beta, C_ptr, ldc); + int lda = a.array().stride(1); + int ldb = b.array().stride(1); + int ldc = c.array().stride(1); - } else { - throw std::runtime_error("distributed inner product not implemented."); - } + using gemm = cuda::gemm; + gemm::call(gemm::H, gemm::N, m, n, k, alpha, A_ptr, lda, B_ptr, ldb, beta, C_ptr, ldc); + allreduce(c, a.map().comm()); } /// Inner product c = a^H * b, on GPU -template -std::enable_if_t< - std::is_same::value, - void> +template +std::enable_if_t::value, void> outer(M0& c, const M1& a, const M2& b, @@ -178,11 +171,8 @@ outer(M0& c, /// C <- beta * C + alpha * A @ B template std::enable_if_t::value, void> -transform(M0& C, - typename M0::numeric_t beta, - typename 
M0::numeric_t alpha, - const M1& A, - const M2& B) +transform( + M0& C, typename M0::numeric_t beta, typename M0::numeric_t alpha, const M1& A, const M2& B) { typedef M0 vector0_t; typedef M1 vector1_t; @@ -195,9 +185,8 @@ transform(M0& C, static_assert(std::is_same::value, "a,b not on same memory"); - // static_assert(std::is_same::value, "matrix layout do not match"); - if (A.map().is_local() && B.map().is_local() && C.map().is_local()) { + if (B.map().is_local()) { /* single rank */ int m = A.map().nrows(); int n = B.map().ncols(); @@ -238,25 +227,21 @@ add(M0& C, typename vector1_t::storage_t::memory_space>::value, "c,a not on same memory"); - if (A.map().is_local() && C.map().is_local()) { - /* single rank */ - int m = A.map().nrows(); - int n = C.map().ncols(); - numeric_t* A_ptr = A.array().data(); - numeric_t* C_ptr = C.array().data(); + /* single rank */ + int m = A.map().nrows(); + int n = C.map().ncols(); + numeric_t* A_ptr = A.array().data(); + numeric_t* C_ptr = C.array().data(); - if (A.array().stride(0) != 1 || C.array().stride(0) != 1) { - throw std::runtime_error("expecting column major layout"); - } - // assume there are no strides - int lda = A.array().stride(1); - int ldc = C.array().stride(1); - - using geam = cuda::geam; - geam::call(geam::N, geam::N, m, n, alpha, A_ptr, lda, beta, C_ptr, ldc, C_ptr, ldc); - } else { - throw std::runtime_error("not implemented."); + if (A.array().stride(0) != 1 || C.array().stride(0) != 1) { + throw std::runtime_error("expecting column major layout"); } + // assume there are no strides + int lda = A.array().stride(1); + int ldc = C.array().stride(1); + + using geam = cuda::geam; + geam::call(geam::N, geam::N, m, n, alpha, A_ptr, lda, beta, C_ptr, ldc, C_ptr, ldc); } #endif } // namespace nlcglib diff --git a/src/la/lapack_rocm.hpp b/src/la/lapack_rocm.hpp index a4c6283..eff262a 100644 --- a/src/la/lapack_rocm.hpp +++ b/src/la/lapack_rocm.hpp @@ -4,12 +4,12 @@ // #include #include "la/cblas.hpp" -#include "rocm.hpp" -#include "rocblas.hpp" -#include "rocsolver.hpp" #include "la/dvector.hpp" -#include "la/utils.hpp" #include "la/lapack_cpu.hpp" +#include "la/utils.hpp" +#include "rocblas.hpp" +#include "rocm.hpp" +#include "rocsolver.hpp" #ifdef __NLCGLIB__MAGMA #include "magma.hpp" @@ -28,7 +28,6 @@ eigh(KokkosDVector& U, const KokkosDVector& S) { if (U.map().is_local() && S.map().is_local()) { - deep_copy(U, S); // int n = U.map().nrows(); @@ -46,12 +45,12 @@ eigh(KokkosDVector& U, int n = U_host.map().ncols(); Kokkos::deep_copy(U_host.array(), S_host.array()); LAPACKE_zheevd( - LAPACK_COL_MAJOR, /* matrix layout */ - 'V', /* jobz */ - 'U', /* uplot */ - n, /* matrix size */ + LAPACK_COL_MAJOR, /* matrix layout */ + 'V', /* jobz */ + 'U', /* uplot */ + n, /* matrix size */ reinterpret_cast(U_host.array().data()), /* Complex double */ - lda, /* lda */ + lda, /* lda */ w_host.data() /* eigenvalues */ ); } @@ -60,7 +59,8 @@ eigh(KokkosDVector& U, // zheevd_magma(n, U.array().data(), lda, w_host.data()); Kokkos::deep_copy(w, w_host); deep_copy(U, U_host); - // rocm::heevd(rocblas_evect::rocblas_evect_original, rocblas_fill::rocblas_fill_upper, n, U.array().data(), lda, w.data()); + // rocm::heevd(rocblas_evect::rocblas_evect_original, rocblas_fill::rocblas_fill_upper, n, + // U.array().data(), lda, w.data()); } else { throw std::runtime_error("distributed eigh not implemented"); } @@ -68,7 +68,8 @@ eigh(KokkosDVector& U, /// stores result in RHS, after the call A will contain the cholesky factorization of a template 
-std::enable_if_t::storage_t::memory_space, Kokkos::Experimental::HIPSpace>::value> +std::enable_if_t::storage_t::memory_space, + Kokkos::Experimental::HIPSpace>::value> cholesky(KokkosDVector& A) { if (A.map().is_local()) { @@ -89,9 +90,9 @@ cholesky(KokkosDVector& A) /// stores result in RHS, after the call A will contain the cholesky factorization of a template -std::enable_if_t::storage_t::memory_space, Kokkos::Experimental::HIPSpace>::value> -solve_sym(KokkosDVector& A, - KokkosDVector& RHS) +std::enable_if_t::storage_t::memory_space, + Kokkos::Experimental::HIPSpace>::value> +solve_sym(KokkosDVector& A, KokkosDVector& RHS) { auto A_host = create_mirror_view_and_copy(Kokkos::HostSpace(), A); auto RHS_host = create_mirror_view_and_copy(Kokkos::HostSpace(), RHS); @@ -125,7 +126,9 @@ solve_sym(KokkosDVector& A, /// Inner product c = a^H * b, on GPU template -std::enable_if_t::value, void> +std::enable_if_t< + std::is_same::value, + void> inner(M0& c, const M1& a, const M2& b, @@ -141,7 +144,7 @@ inner(M0& c, typename M2::storage_t::memory_space>::value, "a,b not on same memory"); // static_assert(std::is_same::value, "matrix layout do not match"); - if (a.map().is_local() && b.map().is_local() && c.map().is_local()) { + if (c.map().is_local()) { if (a.array().stride(0) != 1 || b.array().stride(0) != 1 || c.array().stride(0) != 1) { throw std::runtime_error("expecting column major layout"); } @@ -160,18 +163,16 @@ inner(M0& c, auto H = rocblas_operation::rocblas_operation_conjugate_transpose; auto N = rocblas_operation::rocblas_operation_none; rocm::gemm(H, N, m, n, k, alpha, A_ptr, lda, B_ptr, ldb, beta, C_ptr, ldc); + allreduce(c, a.map().comm()); } else { throw std::runtime_error("distributed inner product not implemented."); } } /// Inner product c = a^H * b, on GPU -template +template std::enable_if_t< - std::is_same::value, + std::is_same::value, void> outer(M0& c, const M1& a, @@ -218,12 +219,11 @@ outer(M0& c, /// C <- beta * C + alpha * A @ B template -std::enable_if_t::value, void> -transform(M0& C, - typename M0::numeric_t beta, - typename M0::numeric_t alpha, - const M1& A, - const M2& B) +std::enable_if_t< + std::is_same::value, + void> +transform( + M0& C, typename M0::numeric_t beta, typename M0::numeric_t alpha, const M1& A, const M2& B) { typedef M0 vector0_t; typedef M1 vector1_t; @@ -236,9 +236,7 @@ transform(M0& C, static_assert(std::is_same::value, "a,b not on same memory"); - // static_assert(std::is_same::value, "matrix layout do not match"); - - if (A.map().is_local() && B.map().is_local() && C.map().is_local()) { + if (B.map().is_local()) { /* single rank */ int m = A.map().nrows(); int n = B.map().ncols(); @@ -265,7 +263,9 @@ transform(M0& C, /// add C <- alpha * A + beta * C template -std::enable_if_t::value, void> +std::enable_if_t< + std::is_same::value, + void> add(M0& C, const M1& A, typename M0::numeric_t alpha, @@ -279,29 +279,25 @@ add(M0& C, typename vector1_t::storage_t::memory_space>::value, "c,a not on same memory"); - if (A.map().is_local() && C.map().is_local()) { - /* single rank */ - int m = A.map().nrows(); - int n = C.map().ncols(); - numeric_t* A_ptr = A.array().data(); - numeric_t* C_ptr = C.array().data(); - - if (A.array().stride(0) != 1 || C.array().stride(0) != 1) { - throw std::runtime_error("expecting column major layout"); - } - // assume there are no strides - int lda = A.array().stride(1); - int ldc = C.array().stride(1); + /* single rank */ + int m = A.map().nrows(); + int n = C.map().ncols(); + numeric_t* A_ptr = A.array().data(); + 
numeric_t* C_ptr = C.array().data(); - // using geam = rocm::geam; - auto N = rocblas_operation::rocblas_operation_none; - // rocm::geam(N, N, m, n, alpha, A_ptr, lda, beta, B_ptr, ldb, C, ldc); - rocm::geam(N, N, m, n, alpha, A_ptr, lda, beta, C_ptr, ldc, C_ptr, ldc); - } else { - throw std::runtime_error("not implemented."); + if (A.array().stride(0) != 1 || C.array().stride(0) != 1) { + throw std::runtime_error("expecting column major layout"); } + // assume there are no strides + int lda = A.array().stride(1); + int ldc = C.array().stride(1); + + // using geam = rocm::geam; + auto N = rocblas_operation::rocblas_operation_none; + // rocm::geam(N, N, m, n, alpha, A_ptr, lda, beta, B_ptr, ldb, C, ldc); + rocm::geam(N, N, m, n, alpha, A_ptr, lda, beta, C_ptr, ldc, C_ptr, ldc); } #endif -} // nlcglib +} // namespace nlcglib diff --git a/src/la/magma.cpp b/src/la/magma.cpp index 0c41aad..eff53e5 100644 --- a/src/la/magma.cpp +++ b/src/la/magma.cpp @@ -1,11 +1,11 @@ #ifdef __NLCGLIB__MAGMA +#include "la/magma.hpp" #include #include #include #include #include #include -#include "la/magma.hpp" #define TESTING_CHECK(err) \ do { \ @@ -23,14 +23,16 @@ } while (0) -void nlcg_init_magma() +void +nlcg_init_magma() { TESTING_CHECK(magma_init()); // magma_print_environment(); } -void nlcg_finalize_magma() +void +nlcg_finalize_magma() { TESTING_CHECK(magma_finalize()); } @@ -133,7 +135,9 @@ zpotrs_magma(int n, int nrhs, COMPLEX* dA, int lda, COMPLEX* dB, int ldb) } } -template void zpotrs_magma(int, int, Kokkos::complex*, int, Kokkos::complex*, int); -template void zpotrs_magma(int, int, std::complex*, int, std::complex*, int); +template void +zpotrs_magma(int, int, Kokkos::complex*, int, Kokkos::complex*, int); +template void +zpotrs_magma(int, int, std::complex*, int, std::complex*, int); #endif /*__NLCGLIB__MAGMA*/ diff --git a/src/la/magma.hpp b/src/la/magma.hpp index c86d445..a81fe9b 100644 --- a/src/la/magma.hpp +++ b/src/la/magma.hpp @@ -2,17 +2,22 @@ #ifdef __NLCGLIB__MAGMA #include -void nlcg_init_magma(); -void nlcg_finalize_magma(); +void +nlcg_init_magma(); +void +nlcg_finalize_magma(); -template -void zheevd_magma(int n, COMPLEX* dA, int lda, double* w); +template +void +zheevd_magma(int n, COMPLEX* dA, int lda, double* w); template -void zpotrf_magma(int n, COMPLEX* dA, int lda); +void +zpotrf_magma(int n, COMPLEX* dA, int lda); template -void zpotrs_magma(int n, int nrhs, COMPLEX* dA, int lda, COMPLEX* dB, int ldb); +void +zpotrs_magma(int n, int nrhs, COMPLEX* dA, int lda, COMPLEX* dB, int ldb); #endif /*__NLCGLIB__MAGMA*/ diff --git a/src/la/mvector.hpp b/src/la/mvector.hpp index c34610a..f9afb14 100644 --- a/src/la/mvector.hpp +++ b/src/la/mvector.hpp @@ -1,136 +1,108 @@ #pragma once -#include -#include -#include -#include -#include "mpi/communicator.hpp" -#include "traits.hpp" -#include "helper/funcs.hpp" #include +#include #include -#include #include -#include "nlcglib.hpp" +#include +#include +#include +#include +#include "gpu/acc.hpp" #include "la/dvector.hpp" +#include "mpi/communicator.hpp" +#include "traits.hpp" #include "utils.hpp" -#include "gpu/acc.hpp" namespace nlcglib { -template +template class mvector_base { }; -template -class mvector : public mvector_base, T> { +template +class mvector : public mvector_base, T> +{ static_assert(std::is_same>::value, "must have ownership"); + public: using value_type = T; using key_t = std::pair; using container_t = std::map; -public : - mvector(Communicator comm) : comm_(comm) {} +public: + mvector(Communicator comm) + : comm_(comm) + 
{ + } mvector() = default; mvector(const mvector&) = default; mvector(mvector&&) = default; mvector& operator=(const mvector&) = default; - mvector(const container_t& data) : data_(data) {} - - T& operator[] (key_t k) + mvector(const container_t& data) + : data_(data) { - return data_[k]; } - const T& operator[] (key_t k) const - { - return data_.at(k); - } + T& operator[](key_t k) { return data_[k]; } - T& at(key_t k) - { - return data_.at(k); - } + const T& operator[](key_t k) const { return data_.at(k); } - const T& at(key_t k) const - { - return data_.at(k); - } + T& at(key_t k) { return data_.at(k); } + const T& at(key_t k) const { return data_.at(k); } - auto begin() - { - return data_.begin(); - } + auto begin() { return data_.begin(); } - auto end() - { - return data_.end(); - } + auto end() { return data_.end(); } auto begin() const { return data_.begin(); } auto end() const { return data_.end(); } - auto& data() - { - return this->data_; - } + auto& data() { return this->data_; } - auto& data() const - { - return this->data_; - } + auto& data() const { return this->data_; } - auto find() - { - return data_.find(); - } + auto find() { return data_.find(); } auto size() { return data_.size(); } auto size() const { return data_.size(); } mvector empty_like(); - mvector& operator=(std::map&& data) - { - data_ = std::forward>(data); - } + mvector& operator=(std::map&& data) { data_ = std::forward>(data); } - template + template mvector& operator=(mvector& other) { // iterate over keys and call assign - for(auto& elem : data_) { + for (auto& elem : data_) { auto key = elem.first; data_[key] = eval(other.at(key)); } return *this; } - const Communicator& commk() const - { - return comm_; - } + const Communicator& commk() const { return comm_; } - template - std::enable_if_t::value, mvector> - allgather(Communicator comm = Communicator{MPI_COMM_NULL}) const; + template + std::enable_if_t::value, mvector> allgather(Communicator comm = Communicator{ + MPI_COMM_NULL}) const; - template - std::enable_if_t::value, mvector> - allgather(Communicator comm = Communicator{MPI_COMM_NULL}) const; + template + std::enable_if_t::value, mvector> allgather(Communicator comm = Communicator{ + MPI_COMM_NULL}) const; private: container_t data_; Communicator comm_; }; -template -template +template +template std::enable_if_t::value, mvector> mvector::allgather(Communicator comm) const { @@ -163,8 +135,9 @@ mvector::allgather(Communicator comm) const } -template -std::vector> _allgather(const std::vector& values, const Communicator& comm) +template +std::vector> +_allgather(const std::vector& values, const Communicator& comm) { int nranks = comm.size(); std::vector nelems(nranks); @@ -180,20 +153,20 @@ std::vector> _allgather(const std::vector& values, const C std::vector> result(nranks); for (int i = 0; i < nranks; ++i) { - result[i] = std::vector(sendrecv_buffer.data() + scan[i], sendrecv_buffer.data() + scan[i+1]); + result[i] = + std::vector(sendrecv_buffer.data() + scan[i], sendrecv_buffer.data() + scan[i + 1]); } return result; } /// allgather, copy to host -> communicate -> copy to original memory (if needed) -template -template +template +template std::enable_if_t::value, mvector> mvector::allgather(Communicator comm) const { - if (comm == Communicator{MPI_COMM_NULL}) - comm = comm_; + if (comm == Communicator{MPI_COMM_NULL}) comm = comm_; if (comm < comm_) { throw std::runtime_error("mvector::allgather: most likely gave unintended communicator"); } @@ -213,8 +186,9 @@ mvector::allgather(Communicator 
comm) const // collect total size std::vector local_number_of_elements(data_.size()); - std::transform(data_.begin(), data_.end(), local_number_of_elements.data(), - [](auto& elem) {return elem.second.size();}); + std::transform(data_.begin(), data_.end(), local_number_of_elements.data(), [](auto& elem) { + return elem.second.size(); + }); auto global_number_of_elements = _allgather(local_number_of_elements, comm); // collect the offsets as a list [every mpi rank] of list [every block] @@ -239,30 +213,31 @@ mvector::allgather(Communicator comm) const auto host_view = Kokkos::create_mirror_view(arr); Kokkos::deep_copy(host_view, arr); assert(offsets[rank][i] < send_recv_buffer.size()); - std::copy(host_view.data(), host_view.data() + host_view.size(), + std::copy(host_view.data(), + host_view.data() + host_view.size(), send_recv_buffer.data() + offsets[rank][i]); ++i; } } std::vector recv_counts(comm.size()); - std::transform(global_number_of_elements.begin(), global_number_of_elements.end(), + std::transform(global_number_of_elements.begin(), + global_number_of_elements.end(), recv_counts.begin(), [](auto& in) { return std::accumulate(in.begin(), in.end(), 0); }); comm.allgather(send_recv_buffer.data(), recv_counts); // copy into results and issue memory transfer if needed. mvector result(Communicator{MPI_COMM_SELF}); - for(int rank = 0; rank < nranks; ++rank) { + for (int rank = 0; rank < nranks; ++rank) { for (auto block_id = 0ul; block_id < offsets[rank].size(); ++block_id) { int lsize = global_number_of_elements[rank][block_id]; int offset = offsets[rank][block_id]; Kokkos::View tmp( - Kokkos::view_alloc(Kokkos::WithoutInitializing, ""), - lsize); - std::copy(send_recv_buffer.data() + offset, send_recv_buffer.data() + offset + lsize, tmp.data()); - T dst(Kokkos::view_alloc(Kokkos::WithoutInitializing, ""), - lsize); + Kokkos::view_alloc(Kokkos::WithoutInitializing, ""), lsize); + std::copy( + send_recv_buffer.data() + offset, send_recv_buffer.data() + offset + lsize, tmp.data()); + T dst(Kokkos::view_alloc(Kokkos::WithoutInitializing, ""), lsize); Kokkos::deep_copy(dst, tmp); auto key = global_keys[rank][block_id]; result[key] = dst; @@ -272,9 +247,10 @@ mvector::allgather(Communicator comm) const } -template +template struct make_mmatrix_return_type -{}; +{ +}; template struct make_mmatrix_return_type::value>> @@ -289,10 +265,7 @@ struct make_mmatrix_return_type struct make_mmatrix_return_type::value>> { - using type = KokkosDVector**, - SlabLayoutV, - Kokkos::LayoutLeft, - xspc>; + using type = KokkosDVector**, SlabLayoutV, Kokkos::LayoutLeft, xspc>; using view_type = KokkosDVector**, SlabLayoutV, Kokkos::LayoutStride, @@ -304,11 +277,12 @@ struct make_mmatrix_return_type +template mvector::type> -make_mmatrix(std::shared_ptr matrix_base, std::enable_if_t::value>* _ = nullptr) +make_mmatrix(std::shared_ptr matrix_base, + std::enable_if_t::value>* _ = nullptr) { - static_assert(std::is_same::value, "invalid template parameters"); + static_assert(std::is_same::value, "invalid template parameters"); using memspace = T; typedef typename make_mmatrix_return_type::type matrix_t; mvector mvector(Communicator(matrix_base->mpicomm())); @@ -328,21 +302,20 @@ make_mmatrix(std::shared_ptr matrix_base, std::enable_if_t::accessible) { // make sure is memory type device - if (buffer.memtype != memory_type::host) - throw std::runtime_error("expected host memory"); + if (buffer.memtype != memory_type::host) throw std::runtime_error("expected host memory"); } mvector[kindex] = - matrix_t(Map<>(comm, 
SlabLayoutV({{0, 0, buffer.size[0], buffer.size[1]}})), buffer); - + matrix_t(Map<>(comm, SlabLayoutV({{0, 0, buffer.size[0], buffer.size[1]}})), buffer); } return mvector; } /// copy implementation -template +template mvector::type> -make_mmatrix(std::shared_ptr matrix_base, std::enable_if_t::value>* _ =nullptr) +make_mmatrix(std::shared_ptr matrix_base, + std::enable_if_t::value>* _ = nullptr) { static_assert(!std::is_same::value, "invalid template parameters"); using memspace = T; @@ -364,21 +337,21 @@ make_mmatrix(std::shared_ptr matrix_base, std::enable_if_t::accessible) { // make sure is memory type device - if (buffer.memtype != memory_type::host) - throw std::runtime_error("expected host memory"); + if (buffer.memtype != memory_type::host) throw std::runtime_error("expected host memory"); } // copy view T, using cuda memcpy ... matrix_t mat(Map<>(comm, SlabLayoutV({{0, 0, buffer.size[0], buffer.size[1]}}))); // issue memcpy - acc::copy(mat.array().data(), buffer.data, buffer.size[0]*buffer.size[1]); + acc::copy(mat.array().data(), buffer.data, buffer.size[0] * buffer.size[1]); mvector[kindex] = mat; } return mvector; } -template -auto make_mmvector(std::shared_ptr vector_base) +template +auto +make_mmvector(std::shared_ptr vector_base) { using memspace = T; typedef Kokkos::View vector_t; @@ -389,7 +362,8 @@ auto make_mmvector(std::shared_ptr vector_base) auto buffer = vector_base->get(i); if (buffer.memtype == memory_type::device) { #ifdef __NLCGLIB__CUDA - Kokkos::View src(buffer.data, buffer.size[0]); + Kokkos::View src(buffer.data, + buffer.size[0]); vector_t dst("vector", buffer.size[0]); Kokkos::deep_copy(dst, src); auto kindex = vector_base->kpoint_index(i); @@ -410,11 +384,12 @@ auto make_mmvector(std::shared_ptr vector_base) } -inline auto make_mmscalar(std::shared_ptr scalar_base) +inline auto +make_mmscalar(std::shared_ptr scalar_base) { mvector mvector(Communicator(scalar_base->mpicomm())); int num_vec = scalar_base->size(); - for(int i = 0; i < num_vec; ++i) { + for (int i = 0; i < num_vec; ++i) { auto v = scalar_base->get(i); auto key = scalar_base->kpoint_index(i); mvector[key] = v; @@ -437,7 +412,8 @@ eval_threaded(const mvector& input) template -void execute(const mvector& input) +void +execute(const mvector& input) { for (auto& elem : input) { eval(elem.second); @@ -446,29 +422,23 @@ void execute(const mvector& input) template -auto sum(const Kokkos::View& x) +auto +sum(const Kokkos::View& x) { using view_type = Kokkos::View; - static_assert(view_type::dimension::rank == 1, - "KokkosView"); + static_assert(view_type::dimension::rank == 1, "KokkosView"); auto host_mirror = Kokkos::create_mirror_view(x); Kokkos::deep_copy(host_mirror, x); - return std::accumulate(host_mirror.data(), host_mirror.data()+ host_mirror.size(), 0.0); + return std::accumulate(host_mirror.data(), host_mirror.data() + host_mirror.size(), 0.0); } template -std::enable_if_t>::value, mvector > > -sum(const mvector& x) -{ - return tapply([](auto xi) { return(sum(eval(xi))); }, x); -} - - -template -std::enable_if_t>::value || std::is_same, T>::value, eval_t> +std::enable_if_t>::value || + std::is_same, T>::value, + eval_t> sum(const mvector& x, Communicator comm = Communicator{MPI_COMM_NULL}) { if (comm == Communicator{MPI_COMM_NULL}) comm = x.commk(); @@ -478,22 +448,23 @@ sum(const mvector& x, Communicator comm = Communicator{MPI_COMM_NULL}) } eval_t sum = 0; - for (auto& elem: x) { + for (auto& elem : x) { sum += eval(elem.second); } return comm.allreduce(sum, mpi_op::sum); } -template -auto 
operator*(const mvector& a, const mvector& b) +template +auto +operator*(const mvector& a, const mvector& b) { - return tapply([](auto x, auto y) { return eval(x)*eval(y); }, a, b); + return tapply([](auto x, auto y) { return eval(x) * eval(y); }, a, b); } -template -std::enable_if_t::dimension::rank == 1> +template +std::enable_if_t::dimension::rank == 1> print(const mvector>& vec) { for (auto& elem : vec) { @@ -509,8 +480,9 @@ print(const mvector>& vec) } -struct do_copy { - template +struct do_copy +{ + template to_layout_left_t> operator()(X&& x) { auto copy = empty_like()(x); @@ -521,14 +493,17 @@ struct do_copy { template -auto copy(const mvector& x) +auto +copy(const mvector& x) { return eval_threaded(tapply(do_copy(), x)); } template -auto unzip(const mvector>& V) { +auto +unzip(const mvector>& V) +{ std::tuple...> U; for (auto& elem : V) { @@ -583,7 +558,7 @@ auto unzip(const std::tuple& src, std::tuple...>& dst, const key_t& key) { using tuple_t = std::tuple; - unzip_impl::value-1>::apply(src, dst , key); + unzip_impl::value - 1>::apply(src, dst, key); } @@ -594,8 +569,9 @@ print(const mvector& vec) for (auto& elem : vec) { auto key = elem.first; auto& val = elem.second; - std::cout << "kindex (" << key.first << ", " << key.second << "): " << std::setprecision(10) << val << "\n"; + std::cout << "kindex (" << key.first << ", " << key.second << "): " << std::setprecision(10) + << val << "\n"; } } -} // nlcglib +} // namespace nlcglib diff --git a/src/la/rocblas.hpp b/src/la/rocblas.hpp index 6decd54..e87a763 100644 --- a/src/la/rocblas.hpp +++ b/src/la/rocblas.hpp @@ -28,8 +28,7 @@ namespace nlcglib { namespace rocm { +} // namespace rocm -} // rocm - -} // nlcglib +} // namespace nlcglib diff --git a/src/la/utils.hpp b/src/la/utils.hpp index c37423b..b58d1a4 100644 --- a/src/la/utils.hpp +++ b/src/la/utils.hpp @@ -1,12 +1,12 @@ #pragma once #include +#include #include #include -#include #include -#include #include +#include namespace nlcglib { @@ -54,7 +54,10 @@ struct to_layout_left&&> // const ref template -struct to_layout_left&> : to_layout_left> {}; +struct to_layout_left&> + : to_layout_left> +{ +}; // ref template @@ -119,13 +122,13 @@ Kokkos::View _empty_like(const Kokkos::View& other) { // return Kokkos::View("tmp", other.size()); - auto ret = Kokkos::View(Kokkos::view_alloc(Kokkos::WithoutInitializing, "tmp"), other.size()); + auto ret = Kokkos::View(Kokkos::view_alloc(Kokkos::WithoutInitializing, "tmp"), + other.size()); #ifdef DEBUG // initialize with NAN to throw an error immediately if not overwritten using memspc = typename Kokkos::View::memory_space; - Kokkos::parallel_for(Kokkos::RangePolicy>(0, other.size()), KOKKOS_LAMBDA(int i) { - ret(i) = NAN; - }); + Kokkos::parallel_for( + Kokkos::RangePolicy>(0, other.size()), KOKKOS_LAMBDA(int i) { ret(i) = NAN; }); #endif return ret; } @@ -205,11 +208,6 @@ tapply(FUNCTOR&& fun, const ARG& arg0, const ARGS&... args) return result; } -template -class find_type -{ - using t = typename T::fjsadlfjasf; -}; /// tapply for packed operators template @@ -223,10 +221,9 @@ tapply_op(OP&& op, const ARG0& arg0, const ARGS&... 
args) mvector> result(arg0.commk()); for (auto& elem : arg0) { auto key = elem.first; - // find_type::t; auto get_key = [key](auto container) { return container.at(key); }; auto fun = op.at(key); - result[key] = [=](){ return fun(eval(get_key(arg0)), eval(get_key(args))...); }; + result[key] = [=]() { return fun(eval(get_key(arg0)), eval(get_key(args))...); }; } return result; } diff --git a/src/linesearch/linesearch.hpp b/src/linesearch/linesearch.hpp index 3edc424..23a27ee 100644 --- a/src/linesearch/linesearch.hpp +++ b/src/linesearch/linesearch.hpp @@ -1,16 +1,16 @@ #pragma once -#include "exceptions.hpp" -#include "utils/logger.hpp" #include #include +#include "exceptions.hpp" +#include "utils/logger.hpp" namespace nlcglib { struct line_search_info { - std::string type; // the ls-type used + std::string type; // the ls-type used }; class line_search @@ -34,11 +34,13 @@ class line_search Logger::GetInstance() << "line search t_trial = " << std::scientific << t_trial << "\n"; double F0 = FE.get_F(); try { - return std::tuple_cat(qline(G, FE, slope, force_restart), std::make_tuple(line_search_info{"qline"})); + return std::tuple_cat(qline(G, FE, slope, force_restart), + std::make_tuple(line_search_info{"qline"})); } catch (StepError& step_error) { Logger::GetInstance() << "\t" << "quadratic line search failed -> backtracking search\n"; - return std::tuple_cat(bt_search(G, FE, F0, force_restart), std::make_tuple(line_search_info{"btsearch"})); + return std::tuple_cat(bt_search(G, FE, F0, force_restart), + std::make_tuple(line_search_info{"btsearch"})); } } @@ -56,7 +58,6 @@ template auto line_search::bt_search(GEODESIC& G, FREE_ENERGY& FE, double F0, bool& force_restart) { - if (tau >= 1) { throw std::runtime_error("invalid value"); } @@ -65,19 +66,22 @@ line_search::bt_search(GEODESIC& G, FREE_ENERGY& FE, double F0, bool& force_rest while (t > 1e-8) { auto ek_ul = G(t); double Fp = FE.get_F(); - Logger::GetInstance() << "fd slope: " << std::scientific << std::setprecision(3) << (Fp - F0) / t << " t: " << t - << " F:" << std::fixed << std::setprecision(13) << Fp << "\n"; + Logger::GetInstance() << "fd slope: " << std::scientific << std::setprecision(3) + << (Fp - F0) / t << " t: " << t << " F:" << std::fixed + << std::setprecision(13) << Fp << "\n"; if (Fp < F0) { - Logger::GetInstance() << "fd slope: " << std::scientific << std::setprecision(3) << (Fp - F0)/t << "\n"; + Logger::GetInstance() << "fd slope: " << std::scientific << std::setprecision(3) + << (Fp - F0) / t << "\n"; force_restart = false; return ek_ul; } t *= tau; - Logger::GetInstance() << "\tbacktracking search tau = " << std::scientific << std::setprecision(5) << t << "\n"; + Logger::GetInstance() << "\tbacktracking search tau = " << std::scientific + << std::setprecision(5) << t << "\n"; } // TODO: let logger print state Logger::GetInstance().flush(); - if (force_restart) { + if (force_restart) { throw DescentError(); } else { force_restart = true; @@ -102,7 +106,8 @@ line_search::qline(GEODESIC& G, FREE_ENERGY& FE, double slope, bool& force_resta // G(dt); // double F1 = FE.get_F(); // double fd_slope = (F1-F0)/dt; - // Logger::GetInstance() << "\t DEBUG qline slope = " << std::setprecision(6) << slope << ", fd_slope = " << fd_slope << "\n"; + // Logger::GetInstance() << "\t DEBUG qline slope = " << std::setprecision(6) << slope << ", + // fd_slope = " << fd_slope << "\n"; // } // (END) DEBUG check slope @@ -135,13 +140,12 @@ line_search::qline(GEODESIC& G, FREE_ENERGY& FE, double slope, bool& force_resta // evaluate FE 
at predicted minimum auto ek_ul = G(t_min); double F_min = FE.get_F(); - Logger::GetInstance() << "\t t_min = " << t_min - << " q line prediction error: " << std::scientific << std::setprecision(8) << (F_pred - F_min) - << " dE: " << std::scientific << std::setprecision(8) << (F0 - F_min) << "\n"; + Logger::GetInstance() << "\t t_min = " << t_min << " q line prediction error: " << std::scientific + << std::setprecision(8) << (F_pred - F_min) << " dE: " << std::scientific + << std::setprecision(8) << (F0 - F_min) << "\n"; if (F_min > F0) { - Logger::GetInstance() << std::setprecision(13) - << "\t quadratic line search failed:\n" + Logger::GetInstance() << std::setprecision(13) << "\t quadratic line search failed:\n" << "\t - F_min: " << F_min << "\n" << "\t - F0: " << F0 << "\n\n"; throw StepError(); @@ -153,21 +157,4 @@ line_search::qline(GEODESIC& G, FREE_ENERGY& FE, double slope, bool& force_resta return ek_ul; } - - -// template -// auto -// line_search(GEODESIC&& G_base, FREE_ENERGY&& FE, double slope, double t_trial = 0.2) -// { -// double F0 = FE.get_F(); -// try { -// return qline(G_base, FE, slope, t_trial); -// } catch (StepError& step_error) { -// Logger::GetInstance() << "\t" -// << "quadratic line search failed -> backtracking search\n"; -// return bt_search(G_base, FE, F0, t_trial, 0.1); -// } -// } - - } // namespace nlcglib diff --git a/src/mpi/communicator.hpp b/src/mpi/communicator.hpp index a3e7ff3..ffc3bf3 100644 --- a/src/mpi/communicator.hpp +++ b/src/mpi/communicator.hpp @@ -110,6 +110,10 @@ class Communicator template T allreduce(T val, enum mpi_op op) const; + template + void allreduce(T* buffer, int count, enum mpi_op op) const; + + void barrier() const { CALL_MPI(MPI_Barrier, (mpicomm_)); } ~Communicator() @@ -224,4 +228,25 @@ Communicator::allreduce(T val, enum mpi_op op) const return result; } +template +void +Communicator::allreduce(T* buffer, int count, enum mpi_op op) const +{ + switch (op) { + case mpi_op::sum: { + CALL_MPI(MPI_Allreduce, + (MPI_IN_PLACE, + buffer, + count, + mpi_type::type(), + mpi_op_::value(), + mpicomm_)); + break; + } + default: { + throw std::runtime_error("Error: invalid MPI_Op given."); + } + } +} + } // namespace nlcglib diff --git a/src/mpi/mpi_type.hpp b/src/mpi/mpi_type.hpp index 8fdf03d..6c292db 100644 --- a/src/mpi/mpi_type.hpp +++ b/src/mpi/mpi_type.hpp @@ -1,9 +1,9 @@ #pragma once #include -#include -#include #include +#include +#include namespace nlcglib { @@ -23,9 +23,11 @@ enum class mpi_op }; template -struct mpi_op_{}; +struct mpi_op_ +{ +}; -template<> +template <> struct mpi_op_ { static MPI_Op value() { return MPI_SUM; } @@ -88,12 +90,8 @@ struct mpi_type> int array_of_block_lengths[2] = {1, 1}; MPI_Aint array_of_displacements[2] = {0, sizeof(T1)}; MPI_Datatype types[2] = {mpi_type::type(), mpi_type::type()}; - CALL_MPI( - MPI_Type_create_struct, (2, - array_of_block_lengths, - array_of_displacements, - types, - &result)); + CALL_MPI(MPI_Type_create_struct, + (2, array_of_block_lengths, array_of_displacements, types, &result)); CALL_MPI(MPI_Type_commit, (&result)); return result; } diff --git a/src/mvp2/descent_direction_impl.hpp b/src/mvp2/descent_direction_impl.hpp index 0dade0c..5ff82de 100644 --- a/src/mvp2/descent_direction_impl.hpp +++ b/src/mvp2/descent_direction_impl.hpp @@ -2,10 +2,8 @@ #include #include "la/dvector.hpp" -#include "la/mvector.hpp" #include "mvp2.hpp" #include "pseudo_hamiltonian/grad_eta.hpp" -#include "utils/logger.hpp" namespace nlcglib { diff --git a/src/nlcglib.cpp b/src/nlcglib.cpp index 
fb1e9cf..9a6cfa9 100644 --- a/src/nlcglib.cpp +++ b/src/nlcglib.cpp @@ -1,31 +1,20 @@ #include -// #include -#include -#include #include #include #include #include -#include "exec_space.hpp" #include "free_energy.hpp" #include "geodesic.hpp" #include "interface.hpp" -#include "la/dvector.hpp" #include "la/lapack.hpp" -#include "la/layout.hpp" -#include "la/magma.hpp" -#include "la/map.hpp" #include "la/mvector.hpp" #include "la/utils.hpp" #include "linesearch/linesearch.hpp" +#include "mpi/communicator.hpp" #include "mvp2/descent_direction.hpp" #include "overlap.hpp" -#include "preconditioner.hpp" -#include "pseudo_hamiltonian/grad_eta.hpp" #include "smearing.hpp" -#include "traits.hpp" #include "ultrasoft_precond.hpp" -#include "utils/format.hpp" #include "utils/logger.hpp" #include "utils/step_logger.hpp" #include "utils/timer.hpp" @@ -37,16 +26,8 @@ namespace nlcglib { void initialize() { -#if KOKKOS_VERSION < 30700 - Kokkos::InitArguments args; - args.disable_warnings = true; -#ifdef USE_OPENMP - args.num_threads = omp_get_max_threads(); -#endif /* endif USE_OPENMP */ -#else /* KOKKOS_VERSION >= 3.7.00 */ Kokkos::InitializationSettings args; args.set_disable_warnings(true); -#endif /* endif KOKKOS VERSION */ #ifdef USE_OPENMP args.num_threads = omp_get_max_threads(); #endif @@ -228,6 +209,8 @@ nlcg_us(EnergyBase& energy_base, // ~FE_UNDERFLOW); // Enable all floating point exceptions but FE_INEXACT nlcg_info info; + Communicator comm_world(energy_base.comm_world()); + auto S = Overlap(overlap_base); auto P = USPreconditioner(us_precond_base); @@ -296,6 +279,7 @@ nlcg_us(EnergyBase& energy_base, auto eta = eval_threaded(tapply(make_diag(), ek)); auto slope_zx_zeta = dd.restarted(xspace(), X, ek, fn, Hx, wk, mu, S, P, free_energy); double slope = std::get<0>(slope_zx_zeta); + auto z_x = std::get<1>(slope_zx_zeta); auto z_eta = std::get<2>(slope_zx_zeta); // allocate rotation matrices @@ -323,7 +307,7 @@ nlcg_us(EnergyBase& energy_base, ek, fn, free_energy.ks_energy_components(), - commk, + comm_world, cg_iter); free_energy.ehandle().print_info(); // print magnetization @@ -363,7 +347,7 @@ nlcg_us(EnergyBase& energy_base, ek, fn, free_energy.ks_energy_components(), - commk, + comm_world, cg_iter); timer.start(); diff --git a/src/operator.hpp b/src/operator.hpp index e7807ff..65007dd 100644 --- a/src/operator.hpp +++ b/src/operator.hpp @@ -1,7 +1,6 @@ #pragma once #include "la/dvector.hpp" -#include "la/mvector.hpp" namespace nlcglib { diff --git a/src/overlap.hpp b/src/overlap.hpp index c63e9a6..2928584 100644 --- a/src/overlap.hpp +++ b/src/overlap.hpp @@ -1,11 +1,8 @@ #pragma once -#include #include "interface.hpp" -#include "la/mvector.hpp" -#include "la/dvector.hpp" -#include "operator.hpp" #include "mpi/communicator.hpp" +#include "operator.hpp" namespace nlcglib { @@ -27,14 +24,18 @@ class Overlap auto at(const key_t& key) const -> value_type; - auto begin() { return local::op_iterator (overlap_base.get_keys(), *this, false); } + auto begin() { return local::op_iterator(overlap_base.get_keys(), *this, false); } auto end() { return local::op_iterator(overlap_base.get_keys(), *this, true); } - auto begin() const { return local::op_iterator(overlap_base.get_keys(), *this, false); } - auto end() const { return local::op_iterator(overlap_base.get_keys(), *this, true); } - - Communicator commk() const { - throw std::runtime_error("not implemented"); + auto begin() const + { + return local::op_iterator(overlap_base.get_keys(), *this, false); } + auto end() const + { + return 
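
A caveat on the initialize() hunk above: with the legacy KOKKOS_VERSION branch removed, the surviving `args.num_threads = omp_get_max_threads();` appears to assign to a field that `Kokkos::InitializationSettings` does not expose as a public member; the post-3.7 settings object is configured through setters. A corrected sketch of the intended initialization (assuming the USE_OPENMP build option):

    #include <omp.h>
    #include <Kokkos_Core.hpp>

    void init_kokkos()
    {
      Kokkos::InitializationSettings args;
      args.set_disable_warnings(true);
    #ifdef USE_OPENMP
      args.set_num_threads(omp_get_max_threads());  // setter instead of member assignment
    #endif
      Kokkos::initialize(args);
    }
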
local::op_iterator(overlap_base.get_keys(), *this, true); + } + + Communicator commk() const { throw std::runtime_error("not implemented"); } private: diff --git a/src/preconditioner.hpp b/src/preconditioner.hpp index 2966701..7bffd94 100644 --- a/src/preconditioner.hpp +++ b/src/preconditioner.hpp @@ -2,10 +2,10 @@ #include +#include "exec_space.hpp" #include "la/dvector.hpp" -#include "la/mvector.hpp" #include "la/lapack.hpp" -#include "exec_space.hpp" +#include "la/mvector.hpp" namespace nlcglib { @@ -19,14 +19,11 @@ class diagonal_preconditioner public: diagonal_preconditioner(const view_t& entries) : entries(entries) - {} - - template - static void apply(M1& dst, - const M2& src, - const Kokkos::View& entries) + { + } + + template + static void apply(M1& dst, const M2& src, const Kokkos::View& entries) { typedef Kokkos::MDRangePolicy, exec_t> mdrange_policy; int m = dst.array().extent(0); @@ -40,7 +37,7 @@ class diagonal_preconditioner }); } - template + template auto operator()(const X& x) { auto y = empty_like()(x); @@ -48,7 +45,7 @@ class diagonal_preconditioner return y; } - template + template void apply_in_place(X& x) { diagonal_preconditioner::apply(x, x, entries); @@ -65,11 +62,13 @@ class diagonal_preconditioner * http://dx.doi.org/10.1103/RevModPhys.64.1045 */ template -class PreconditionerTeter //: public mvector_base, diagonal_preconditioner> +class PreconditionerTeter //: public mvector_base, + //: diagonal_preconditioner> { public: using memspace = SPACE; using value_type = diagonal_preconditioner; + public: PreconditionerTeter(std::shared_ptr ekin) { @@ -94,13 +93,13 @@ class PreconditionerTeter //: public mvector_base, di } - template + template auto operator[](const key_t& key) const { return diagonal_preconditioner(kinetic_diag_precond.at(key)); } - template + template auto at(const key_t& key) const { return this->operator[](key); diff --git a/src/pseudo_hamiltonian/grad_eta.hpp b/src/pseudo_hamiltonian/grad_eta.hpp index 3e07e62..c7cd556 100644 --- a/src/pseudo_hamiltonian/grad_eta.hpp +++ b/src/pseudo_hamiltonian/grad_eta.hpp @@ -39,7 +39,7 @@ struct GradEtaHelper "dFdmu", Kokkos::RangePolicy(0, nbands), KOKKOS_LAMBDA(int i, Kokkos::complex& result) { - double delta = smearing::delta((en_loc(i) - mu) / kT, mo); + double delta = smearing::delta((mu - en_loc(i)) / kT, mo); result += (hii(i) - en_loc(i)) * (delta); }, v); @@ -51,7 +51,7 @@ struct GradEtaHelper /** w_k * delta((μ - e_{n,k'})/kT) summed over all k-points \f[ - \sum_{n, k'} w_{k'} f_n (mo-f_n) + \sum_{n, k'} w_{k'} delta( (μ - e_{n,k'}) / kT ) \f] \$mo\$ is 1 for spin-polarized calculations and 2 otherwise. 
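
The argument flip in smearing::delta above (and in the remaining hunks of this file) fixes the orientation of the smearing variable to x = (mu - e_n)/kT. For kernels that are even in x (Gaussian, Fermi-Dirac) the flip changes nothing; the cold-smearing kernel further down in this patch is asymmetric, so there the orientation matters. Schematically (smearing_t stands for one of the smearing structs):

    template <class smearing_t>
    double occupation_delta(double e_n, double mu, double kT, double mo)
    {
      // new convention: the argument is positive for states below the chemical potential
      return smearing_t::delta((mu - e_n) / kT, mo);
    }
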
*/ @@ -72,7 +72,7 @@ struct GradEtaHelper int nbands = en[key].size(); auto en_loc = en[key]; for (int i = 0; i < nbands; ++i) { - double delta = smearing::delta((en_loc(i) - mu) / kT, mo); + double delta = smearing::delta((mu - en_loc(i)) / kT, mo); v += delta * w_k; } } @@ -151,7 +151,7 @@ class GradEta double kT_loc = kT; Kokkos::parallel_for( "gEta (1)", Kokkos::RangePolicy(0, nbands), KOKKOS_LAMBDA(int i) { - double delta = smearing::delta((ek(i) - mu) / kT_loc, mo); + double delta = smearing::delta((mu - ek(i)) / kT_loc, mo); mgETA(i, i) = -1 / kT_loc * (mHij(i, i) - wk * ek(i)) * (delta); }); @@ -161,7 +161,7 @@ class GradEta Kokkos::parallel_for( "gEta (2)", Kokkos::RangePolicy(0, nbands), KOKKOS_LAMBDA(int i) { // sumfn dmuFn - double delta = smearing::delta((ek(i) - mu) / kT_loc, mo); + double delta = smearing::delta((mu - ek(i)) / kT_loc, mo); mgETA(i, i) += wk * (delta) / dmu_deta * (dFdmu / kT_loc); }); } @@ -196,7 +196,6 @@ class GradEta const mvector& ek, const mvector& wk) { - // delta_eta = kappa * (hij - diag(ek)) return tapply_async(_delta_eta(kappa), Hij, ek, wk); } diff --git a/src/smearing.hpp b/src/smearing.hpp index 40ed9f3..4876be1 100644 --- a/src/smearing.hpp +++ b/src/smearing.hpp @@ -3,7 +3,6 @@ #include #include #include -#include #include "constants.hpp" #include "dft/newton_minimization_smearing.hpp" #include "interface.hpp" @@ -11,7 +10,6 @@ #include "la/utils.hpp" #include "utils/env.hpp" #include "utils/logger.hpp" -#include "utils/timer.hpp" namespace nlcglib { @@ -244,9 +242,8 @@ struct cold_smearing : summed, non_monotonous if (x < -8) return 0; if (x > 10) return 0; double sqrt2 = std::sqrt(2.0); - double z = (x - 1/sqrt2); - return mo * std::exp(-z*z) * (sqrt2 - 6 * x + 2 * sqrt2 * x * x) / - std::sqrt(constants::pi); + double z = (x - 1 / sqrt2); + return mo * std::exp(-z * z) * (sqrt2 - 6 * x + 2 * sqrt2 * x * x) / std::sqrt(constants::pi); } KOKKOS_INLINE_FUNCTION static double entropy(double x, double mo) @@ -255,8 +252,8 @@ struct cold_smearing : summed, non_monotonous if (x > 10) return 0; double sqrtpi = std::sqrt(constants::pi); double sqrt2 = std::sqrt(2.0); - double z = (x - 1/sqrt2); - return mo * std::exp(-z*z) * (1 - sqrt2 * x) / 2 / sqrtpi; + double z = (x - 1 / sqrt2); + return mo * std::exp(-z * z) * (1 - sqrt2 * x) / 2 / sqrtpi; } }; diff --git a/src/traits.hpp b/src/traits.hpp index 85d0866..1d899fe 100644 --- a/src/traits.hpp +++ b/src/traits.hpp @@ -1,10 +1,10 @@ #pragma once #include -#include #include -#include #include +#include +#include namespace nlcglib { @@ -36,7 +36,8 @@ struct is_kokkos_view : _is_kokkos_view struct _is_future : std::false_type -{}; +{ +}; template struct _is_future> : std::true_type @@ -79,14 +80,16 @@ class mvector; template constexpr auto&& -eval(X&& x, std::enable_if_t<(!is_callable::value && !is_future::value) || is_kokkos_view::value> * = nullptr) +eval(X&& x, + std::enable_if_t<(!is_callable::value && !is_future::value) || + is_kokkos_view::value>* = nullptr) { return std::forward(x); } template constexpr auto -eval(X &&x, std::enable_if_t::value && !is_kokkos_view::value> * = nullptr) +eval(X&& x, std::enable_if_t::value && !is_kokkos_view::value>* = nullptr) { return x(); } @@ -105,8 +108,9 @@ eval(const std::shared_future& x) return x.get(); } -template -struct eval_type { +template +struct eval_type +{ using type = std::remove_reference_t()))>; }; @@ -114,13 +118,13 @@ template using eval_t = typename eval_type::type; -template +template struct result_of { using type = 
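
For reference, the reformatted cold-smearing kernels above implement, with z = x - 1/sqrt(2),

\[
\delta(x) = \frac{m_o \, e^{-z^2} \left(\sqrt{2} - 6x + 2\sqrt{2}\,x^2\right)}{\sqrt{\pi}},
\qquad
S(x) = \frac{m_o \, e^{-z^2} \left(1 - \sqrt{2}\,x\right)}{2\sqrt{\pi}},
\]

both cut off to zero outside -8 < x < 10, exactly as in the code.
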
decltype(eval(std::declval())); }; -template +template struct result_of> { using type = typename result_of::type; @@ -132,9 +136,11 @@ using result_of_t = typename result_of::type; /// add const if argument is true template -struct conditional_add_const {}; +struct conditional_add_const +{ +}; -template +template struct conditional_add_const { using type = typename std::add_const_t; @@ -146,7 +152,7 @@ struct conditional_add_const using type = T; }; -template +template using conditional_add_const_t = typename conditional_add_const::type; diff --git a/src/ultrasoft_precond.hpp b/src/ultrasoft_precond.hpp index c1086e2..cac78d1 100644 --- a/src/ultrasoft_precond.hpp +++ b/src/ultrasoft_precond.hpp @@ -10,7 +10,6 @@ namespace nlcglib { /// Wrapper for overlap operation computed by sirius, behaves like mvector in an expression. class USPreconditioner { - public: // TODO: rename to k_index using key_t = std::pair; @@ -25,10 +24,22 @@ class USPreconditioner auto at(const key_t& key) const; - auto begin() { return local::op_iterator (us_precond_base.get_keys(), *this, false); } - auto end() { return local::op_iterator(us_precond_base.get_keys(), *this, true); } - auto begin() const { return local::op_iterator(us_precond_base.get_keys(), *this, false); } - auto end() const { return local::op_iterator(us_precond_base.get_keys(), *this, true); } + auto begin() + { + return local::op_iterator(us_precond_base.get_keys(), *this, false); + } + auto end() + { + return local::op_iterator(us_precond_base.get_keys(), *this, true); + } + auto begin() const + { + return local::op_iterator(us_precond_base.get_keys(), *this, false); + } + auto end() const + { + return local::op_iterator(us_precond_base.get_keys(), *this, true); + } private: const UltrasoftPrecondBase& us_precond_base; @@ -41,4 +52,4 @@ USPreconditioner::at(const key_t& key) const } -} // nlcglib +} // namespace nlcglib diff --git a/src/utils/format.hpp b/src/utils/format.hpp index b14e9d6..5df6491 100644 --- a/src/utils/format.hpp +++ b/src/utils/format.hpp @@ -1,12 +1,13 @@ #pragma once -#include #include +#include namespace nlcglib { -template -std::string format(std::string format_string, ARGS&&... args) +template +std::string +format(std::string format_string, ARGS&&... args) { char buf[format_string.size()]; std::sprintf(buf, format_string.c_str(), args...); @@ -14,4 +15,4 @@ std::string format(std::string format_string, ARGS&&... 
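
The eval() overload set reformatted above normalizes lazy and eager values: callables are invoked, futures are get(), and everything else (including Kokkos views, which are excluded from the callable path explicitly) is forwarded unchanged. A small sketch of the three paths:

    #include <future>
    #include "traits.hpp"

    int eval_paths()
    {
      auto lazy = [] { return 42; };  // callable -> invoked
      int plain = 7;                  // plain value -> forwarded as-is
      std::shared_future<int> fut =
          std::async(std::launch::deferred, [] { return 1; }).share();

      int a = nlcglib::eval(lazy);   // 42
      int b = nlcglib::eval(plain);  // 7
      int c = nlcglib::eval(fut);    // 1, via .get()
      return a + b + c;              // 50
    }
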
args) return std::string(buf); } -} // nlcglib +} // namespace nlcglib diff --git a/src/utils/logger.hpp b/src/utils/logger.hpp index 4f581d7..b491ea0 100644 --- a/src/utils/logger.hpp +++ b/src/utils/logger.hpp @@ -2,8 +2,8 @@ #include #include -#include #include +#include #include #include #include @@ -13,14 +13,14 @@ namespace nlcglib { -const static struct to_stdout_trigger {} TO_STDOUT; +const static struct to_stdout_trigger +{ +} TO_STDOUT; class Logger : public CSingleton { public: - Logger() { - MPI_Comm_rank(MPI_COMM_WORLD, &pid_); - } + Logger() { MPI_Comm_rank(MPI_COMM_WORLD, &pid_); } Logger(Logger&&) = default; @@ -34,8 +34,7 @@ class Logger : public CSingleton void attach_file_master(const std::string& fname = "nlcg.out") { MPI_Comm_rank(MPI_COMM_WORLD, &pid_); - if (pid_ == 0) - stream_ptr_ = std::make_shared(fname); + if (pid_ == 0) stream_ptr_ = std::make_shared(fname); } template @@ -87,7 +86,7 @@ class Logger : public CSingleton void flush() { - if(stream_ptr_) { + if (stream_ptr_) { std::mutex mutex; std::lock_guard lock(mutex); auto& out = *(stream_ptr_.get()); diff --git a/src/utils/step_logger.hpp b/src/utils/step_logger.hpp index daf99d0..8edf179 100644 --- a/src/utils/step_logger.hpp +++ b/src/utils/step_logger.hpp @@ -1,12 +1,11 @@ #pragma once -#include #include -#include #include +#include +#include #include #include -#include #include "la/mvector.hpp" namespace Kokkos { @@ -26,25 +25,28 @@ namespace nlcglib { class StepLogger { public: - StepLogger(int i, std::string fname = "nlcg.json", bool active=true) - : i(i), fname(fname), active(active) + StepLogger(int i, std::string fname = "nlcg.json", bool active = true) + : i(i) + , fname(fname) + , active(active) { dict["type"] = "cg_iteration"; dict["step"] = i; } - template - std::enable_if_t>::value> log(const std::string& key, X&& x); + template + std::enable_if_t>::value> log(const std::string& key, + X&& x); - template + template void log(const std::string& key, const std::map& v); - template + template void log(const std::string& key, const mvector& x); ~StepLogger() { - if(active) { + if (active) { std::ofstream fout(std::string("nlcg") + ".json", std::ios_base::app); fout << dict; fout.flush(); @@ -62,14 +64,15 @@ template std::enable_if_t>::value> StepLogger::log(const std::string& key, X&& x) { - if(!active) return; + if (!active) return; dict[key] = x; } -template -void StepLogger::log(const std::string& key, const std::map& v) +template +void +StepLogger::log(const std::string& key, const std::map& v) { - if(!active) return; + if (!active) return; dict[key] = v; } @@ -77,7 +80,7 @@ template void StepLogger::log(const std::string& key, const mvector& x) { - if(!active) return; + if (!active) return; // assuming V is a 1-d kokkos array for (auto& elem : x) { auto x_key = elem.first; diff --git a/src/utils/timer.hpp b/src/utils/timer.hpp index 513dd21..8bf2d37 100644 --- a/src/utils/timer.hpp +++ b/src/utils/timer.hpp @@ -7,8 +7,10 @@ namespace nlcglib { -template -struct duration_string {}; +template +struct duration_string +{ +}; template <> struct duration_string @@ -56,4 +58,4 @@ Timer::stop() return std::chrono::duration_cast>(now - this->t).count(); } -} // nlcglib +} // namespace nlcglib diff --git a/test/test.cpp b/test/test.cpp index b910a41..48cf9a9 100644 --- a/test/test.cpp +++ b/test/test.cpp @@ -13,11 +13,13 @@ using namespace nlcglib; typedef std::complex complex_double; -void run() { - KokkosDVector X( +void +run() +{ + KokkosDVector X( Map<>(Communicator(), SlabLayoutV({{0, 0, 200, 
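
A caution on the format() helper finalized above: it sizes its stack buffer from the format string itself, so any expansion longer than the format string overruns the buffer (and the variable-length array is non-standard C++). A bounds-safe variant with the same interface (illustrative name, not part of the patch):

    #include <cstdio>
    #include <string>
    #include <vector>

    template <typename... ARGS>
    std::string
    format_checked(const std::string& fmt, ARGS&&... args)
    {
      int n = std::snprintf(nullptr, 0, fmt.c_str(), args...);  // measuring pass
      if (n < 0) return {};
      std::vector<char> buf(n + 1);
      std::snprintf(buf.data(), buf.size(), fmt.c_str(), args...);
      return std::string(buf.data(), n);
    }
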
20}}))); - KokkosDVector H( + KokkosDVector H( Map<>(Communicator(), SlabLayoutV({{0, 0, 20, 20}}))); auto kokkos_array = X.array(); @@ -33,11 +35,10 @@ run_unmanaged() KokkosDVector X( Map<>(Communicator(), SlabLayoutV({{0, 0, 200, 20}}))); - std::vector yptr(200*20); + std::vector yptr(200 * 20); KokkosDVector Y( Map<>(Communicator(), SlabLayoutV({{0, 0, 200, 20}})), - buffer_protocol({1, 200}, {200, 20}, yptr.data(), memory_type::host) - ); + buffer_protocol({1, 200}, {200, 20}, yptr.data(), memory_type::host)); KokkosDVector H( Map<>(Communicator(), SlabLayoutV({{0, 0, 20, 20}}))); @@ -48,7 +49,7 @@ run_unmanaged() auto S = H.copy(); - Kokkos::View eigvals("eigvals", 20); + Kokkos::View eigvals("eigvals", 20); eigh(H, eigvals, S); } @@ -61,10 +62,10 @@ run_unmanaged_cuda() complex_double yptr[200 * 20]; KokkosDVector + SlabLayoutV, + Kokkos::LayoutStride, + Kokkos::CudaSpace, + Kokkos::MemoryUnmanaged> Y(Map<>(Communicator(), SlabLayoutV({{0, 0, 200, 20}})), buffer_protocol({1, 200}, {200, 20}, yptr, memory_type::host)); @@ -77,10 +78,9 @@ run_unmanaged_cuda() auto S = H.copy(); - Kokkos::View eigvals("eigvals", 20); + Kokkos::View eigvals("eigvals", 20); eigh(S, eigvals, H); - } #endif @@ -102,7 +102,8 @@ run_stacked_vector() } -int main(int argc, char *argv[]) +int +main(int argc, char *argv[]) { Kokkos::initialize(); Communicator::init(argc, argv); @@ -117,9 +118,9 @@ int main(int argc, char *argv[]) run(); - #ifdef __NLCGLIB__CUDA +#ifdef __NLCGLIB__CUDA run_unmanaged_cuda(); - #endif +#endif Communicator::finalize(); Kokkos::finalize(); diff --git a/test/test3.cpp b/test/test3.cpp index a53358a..ab60f1a 100644 --- a/test/test3.cpp +++ b/test/test3.cpp @@ -1,15 +1,15 @@ #include #include #include -#include "smearing.hpp" #include "la/mvector.hpp" +#include "smearing.hpp" auto run() { int n = 10; - using vector_t = Kokkos::View; + using vector_t = Kokkos::View; vector_t a_view("test", n); for (int i = 0; i < n; ++i) { a_view(i) = i; diff --git a/test/test_buffer.cpp b/test/test_buffer.cpp index 8351f65..8dd9e8a 100644 --- a/test/test_buffer.cpp +++ b/test/test_buffer.cpp @@ -16,7 +16,8 @@ using typeX = KokkosDVector X( Map<>(Communicator(), SlabLayoutV({{0, 0, 200, 20}}))); diff --git a/test/test_deep_copy.cpp b/test/test_deep_copy.cpp index b403ed4..61264aa 100644 --- a/test/test_deep_copy.cpp +++ b/test/test_deep_copy.cpp @@ -4,39 +4,37 @@ #include "traits.hpp" #ifdef __NLCGLIB__CUDA -void run() +void +run() { int n = 10; double arr[10]; - Kokkos::View - c(arr, n); - Kokkos::View - a("A", n); - Kokkos::View - a_device("A", n); - Kokkos::View - b("b", n); + Kokkos::View c(arr, n); + Kokkos::View a("A", n); + Kokkos::View a_device("A", n); + Kokkos::View b("b", n); for (int i = 0; i < n; ++i) { a(i) = i; } - Kokkos::parallel_for(Kokkos::RangePolicy(0, n), KOKKOS_LAMBDA(int i) { - a_device[i] = i; - }); + Kokkos::parallel_for( + Kokkos::RangePolicy(0, n), KOKKOS_LAMBDA(int i) { a_device[i] = i; }); Kokkos::deep_copy(b, a); Kokkos::deep_copy(c, a_device); std::cout << "a.data: " << a.data() << "\n"; std::cout << "b.data: " << b.data() << "\n"; - std::cout << "b" << "\n"; + std::cout << "b" + << "\n"; for (int i = 0; i < n; ++i) { std::cout << b(i) << ", "; } std::cout << "\n"; - std::cout << "c (should be same as b)" << "\n"; + std::cout << "c (should be same as b)" + << "\n"; for (int i = 0; i < n; ++i) { std::cout << c(i) << ", "; } @@ -48,7 +46,7 @@ void run_2d() { int n = 3; - double arr[n*n]; + double arr[n * n]; Kokkos::View c(arr, n, n); Kokkos::View a("A", n, n); diff --git 
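
test_deep_copy.cpp above exercises the host/device transfer pattern used throughout the library; stripped to its core it is (CUDA build assumed, as in the test's guard):

    #include <Kokkos_Core.hpp>

    void roundtrip()
    {
      const int n = 10;
      Kokkos::View<double*, Kokkos::CudaSpace> a_device("A", n);
      Kokkos::parallel_for(
          Kokkos::RangePolicy<Kokkos::Cuda>(0, n), KOKKOS_LAMBDA(int i) { a_device(i) = i; });
      Kokkos::View<double*, Kokkos::HostSpace> b("b", n);
      Kokkos::deep_copy(b, a_device);  // device -> host; afterwards b(i) == i
    }
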
a/test/test_dvector_mirror_view_and_copy.cpp b/test/test_dvector_mirror_view_and_copy.cpp index 315d5cc..922f09f 100644 --- a/test/test_dvector_mirror_view_and_copy.cpp +++ b/test/test_dvector_mirror_view_and_copy.cpp @@ -15,8 +15,10 @@ using namespace nlcglib; typedef Kokkos::complex complex_double; -template -struct print_type {}; +template +struct print_type +{ +}; void run() @@ -29,13 +31,10 @@ run() auto Y = create_mirror_view_and_copy(device_space_t(), X); - if (Y.array().data() == X.array().data()) - { + if (Y.array().data() == X.array().data()) { std::cout << "create_mirror_view_and_copy(SPACE, KokkosDVectors) works" << "\n"; - } - else - { + } else { std::cout << "create_mirror_view_and_copy(SPACE, KokkosDVectors) broken" << "\n"; } @@ -58,7 +57,7 @@ run_copy_to_host() void test2() { - using matrix_t = Kokkos::View; + using matrix_t = Kokkos::View; int n = 10; matrix_t A("foo", n, n); @@ -69,8 +68,9 @@ test2() std::cout << "A.ptr = " << A.data() << "\n"; std::cout << "B.ptr = " << B.data() << "\n"; - if ( A.data() != B.data()) { - std::cout << "A and B are _distinct_ arrays!" << "\n"; + if (A.data() != B.data()) { + std::cout << "A and B are _distinct_ arrays!" + << "\n"; } auto C = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A); diff --git a/test/test_gpu.cpp b/test/test_gpu.cpp index b73c43b..417c84b 100644 --- a/test/test_gpu.cpp +++ b/test/test_gpu.cpp @@ -1,6 +1,6 @@ +#include #include "la/dvector.hpp" #include "la/lapack.hpp" -#include #include #include @@ -29,7 +29,9 @@ std::mt19937 gen(0); typedef Kokkos::complex complex_double; -void run() { +void +run() +{ int n = 4000; int m = 400; @@ -51,8 +53,6 @@ void run() { Kokkos::deep_copy(X.array(), host); - - // eigh(X); auto H = inner_()(X, X); auto HH = empty_like()(H); // auto Z = transform_alloc(X, H, 1.0); @@ -63,10 +63,10 @@ void run() { // Kokkos::parallel_for("init_H", // mdrange_policy({0, 0}, {m, m}), // MDFunctor(M)); - } -int main(int argc, char *argv[]) +int +main(int argc, char *argv[]) { Kokkos::initialize(); Communicator::init(argc, argv); diff --git a/test/test_kokkos.cpp b/test/test_kokkos.cpp index b3c9797..c73b8a1 100644 --- a/test/test_kokkos.cpp +++ b/test/test_kokkos.cpp @@ -1,24 +1,25 @@ +#include #include -#include "hip/hip_space.hpp" #include -#include +#include "hip/hip_space.hpp" /// to have exp available on device #include -auto unmanaged() +auto +unmanaged() { std::cout << "\nunmanaged\n"; int n = 10; - double* A = new double[n*n]; + double* A = new double[n * n]; for (int i = 0; i < n; ++i) { for (int j = 0; j < n; ++j) { - A[n*i + j] = n * i + j; + A[n * i + j] = n * i + j; } } - Kokkos::View > a_view( + Kokkos::View> a_view( A, n, n); for (int i = 0; i < n; ++i) { @@ -32,21 +33,25 @@ auto unmanaged() } -auto unmanaged_strided() +auto +unmanaged_strided() { std::cout << "\nunmanaged_strided\n"; - int n = 10; // rows - int m = 10; // cols + int n = 10; // rows + int m = 10; // cols int lda = 12; double* A; - posix_memalign(reinterpret_cast(&A), 256, lda*m*sizeof(double)); + posix_memalign(reinterpret_cast(&A), 256, lda * m * sizeof(double)); for (int i = 0; i < n; ++i) { for (int j = 0; j < m; ++j) { - A[j*lda + i] = i*n + j; + A[j * lda + i] = i * n + j; } } - typedef Kokkos::View> + typedef Kokkos::View> vector_t; vector_t a_view(A, Kokkos::LayoutStride(n, 1, m, lda)); @@ -60,27 +65,26 @@ auto unmanaged_strided() } - -template -void kokkos_reduction() +template +void +kokkos_reduction() { using space = Kokkos::HostSpace; using exec_space = Kokkos::Serial; // using space = 
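
The mirror test above relies on a create_mirror_view_and_copy property worth spelling out: when source and destination live in the same memory space it allocates nothing and returns an alias of the input; only across spaces does it allocate and deep-copy (Kokkos::create_mirror, by contrast, always allocates). Sketch:

    #include <Kokkos_Core.hpp>
    #include <cassert>

    void mirror_aliases()
    {
      Kokkos::View<double**, Kokkos::HostSpace> A("foo", 10, 10);
      auto B = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A);
      assert(A.data() == B.data());  // same space: B aliases A, no copy made
    }
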
Kokkos::CudaSpace; // using exec_space = Kokkos::Cuda; - int n = 100; + int n = 100; Kokkos::View view("", n); - Kokkos::parallel_for("foo", Kokkos::RangePolicy(0, n), - KOKKOS_LAMBDA (int i) - { - view(i) = i; - } - ); + Kokkos::parallel_for( + "foo", Kokkos::RangePolicy(0, n), KOKKOS_LAMBDA(int i) { view(i) = i; }); numeric_t sum = 0; - Kokkos::parallel_reduce("summation", Kokkos::RangePolicy(0, view.size()), - KOKKOS_LAMBDA (int i, numeric_t& loc_sum) { loc_sum += view(i); }, sum); + Kokkos::parallel_reduce( + "summation", + Kokkos::RangePolicy(0, view.size()), + KOKKOS_LAMBDA(int i, numeric_t& loc_sum) { loc_sum += view(i); }, + sum); std::cout << "Sum is: " << sum << "\n"; } @@ -88,10 +92,10 @@ template void kokkos_reduction_device() { - #ifdef __NLCGLIB__CUDA +#ifdef __NLCGLIB__CUDA using space = Kokkos::CudaSpace; using exec_space = Kokkos::Cuda; - #elif defined __NLCGLIB__ROCM +#elif defined __NLCGLIB__ROCM using space = Kokkos::Experimental::HIPSpace; using exec_space = Kokkos::Experimental::HIP; #endif @@ -116,16 +120,19 @@ struct fun __device__ __host__ double operator()(double x) const { return 1 / (1 + exp(x)); } }; -int main(int argc, char *argv[]) +int +main(int argc, char* argv[]) { Kokkos::initialize(); auto x = unmanaged(); auto x2 = unmanaged(); unmanaged_strided(); - std::cout << "trying reduction on cpu: " << "\n"; + std::cout << "trying reduction on cpu: " + << "\n"; kokkos_reduction(); - std::cout << "trying reduction on gpu: " << "\n"; + std::cout << "trying reduction on gpu: " + << "\n"; kokkos_reduction_device(); // // test // kokkos_view_stuff(); diff --git a/test/test_mpi_wrapper.cpp b/test/test_mpi_wrapper.cpp index 9516d07..f3934ca 100644 --- a/test/test_mpi_wrapper.cpp +++ b/test/test_mpi_wrapper.cpp @@ -48,7 +48,8 @@ run_pair() comm.allgather(buffer.data(), 2); for (auto i = 0ul; i < buffer.size(); ++i) { - std::cout << "rank " << comm.rank() << ": " << buffer[i].first << " , " << buffer[i].second << "\n"; + std::cout << "rank " << comm.rank() << ": " << buffer[i].first << " , " << buffer[i].second + << "\n"; } } @@ -72,10 +73,8 @@ run_pair2() for (auto i = 0ul; i < buffer.size(); ++i) { auto p1 = buffer[i]; - std::cout << "rank " << comm.rank() << ": (" - << p1.first.first << " , " - << p1.first.second << ") , " - << p1.second << "\n"; + std::cout << "rank " << comm.rank() << ": (" << p1.first.first << " , " << p1.first.second + << ") , " << p1.second << "\n"; } } diff --git a/test/test_mvector_cuda.cpp b/test/test_mvector_cuda.cpp index f80db38..045b01c 100644 --- a/test/test_mvector_cuda.cpp +++ b/test/test_mvector_cuda.cpp @@ -1,21 +1,22 @@ #include #include #include -#include "smearing.hpp" #include "la/mvector.hpp" +#include "smearing.hpp" #ifdef __NLCGLIB__CUDA -void run() +void +run() { int n = 10; - using vector_t = Kokkos::View; + using vector_t = Kokkos::View; vector_t a_view("test", n); auto host_view = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), a_view); for (int i = 0; i < n; ++i) { - host_view(i) = i+1; + host_view(i) = i + 1; } Kokkos::deep_copy(a_view, host_view); diff --git a/test/test_smearing.cpp b/test/test_smearing.cpp index 75a3b62..8ffa5d4 100644 --- a/test/test_smearing.cpp +++ b/test/test_smearing.cpp @@ -1,4 +1,6 @@ #include + + #include "smearing.hpp" using namespace nlcglib; @@ -24,7 +26,7 @@ run(smearing_type smearing_t) int nk_loc = nk / comm.size(); - if (pid == nranks-1) { + if (pid == nranks - 1) { nk_loc = nk - (nranks - 1) * nk_loc; } @@ -38,13 +40,13 @@ run(smearing_type smearing_t) } double check = sum(wk, 
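
The pair round-trips in test_mpi_wrapper.cpp above run through the MPI_Type_create_struct datatype committed in src/mpi/mpi_type.hpp earlier in this patch. Usage, schematically (two entries per rank, values illustrative):

    #include <utility>
    #include <vector>
    #include "mpi/communicator.hpp"

    void gather_pairs(const nlcglib::Communicator& comm)
    {
      std::vector<std::pair<int, double>> buffer(2 * comm.size());
      buffer[2 * comm.rank()]     = {comm.rank(), 0.5};  // local contributions
      buffer[2 * comm.rank() + 1] = {comm.rank(), 1.5};
      comm.allgather(buffer.data(), 2);  // each rank contributes two elements
    }
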
comm); - if (std::abs(check-1) > 1e-10) { - std::cout << sum(wk,comm) << "\n"; + if (std::abs(check - 1) > 1e-10) { + std::cout << sum(wk, comm) << "\n"; throw std::runtime_error("wrong weights"); } } - if(pid == nranks-1) { + if (pid == nranks - 1) { print(wk); } @@ -63,30 +65,29 @@ run(smearing_type smearing_t) mvector ek; for (int i = 0; i < nk_loc; ++i) { - auto key = std::make_pair(pid * nk / nranks + i, 0); double lb = -10; vec_t eki("ek" + std::to_string(i), num_bands); eki(0) = lb; for (int ib = 1; ib < num_bands; ++ib) { - eki(ib) = eki(ib-1) + std::exp(-0.05*ib); + eki(ib) = eki(ib - 1) + std::exp(-0.05 * ib); } ek[key] = eki; - if (i == 0) - print(ek); + if (i == 0) print(ek); } auto mu_fn = smearing.fn(ek); double S = smearing.entropy(std::get<1>(mu_fn), ek, std::get<0>(mu_fn)); double smax = comm.allreduce(S, mpi_op::max); - if ( S != smax) { + if (S != smax) { throw std::runtime_error("entropy differs"); } std::cout << "entropy is " << S << "\n"; } -int main(int argc, char *argv[]) +int +main(int argc, char *argv[]) { MPI_Init(&argc, &argv); Kokkos::initialize(); diff --git a/test/test_solver.cpp b/test/test_solver.cpp index 2050dc4..abf44e4 100644 --- a/test/test_solver.cpp +++ b/test/test_solver.cpp @@ -21,7 +21,7 @@ std::uniform_real_distribution unif01(0, 1); std::mt19937 gen(0); -template +template void run_unmanaged() { @@ -39,8 +39,7 @@ run_unmanaged() Kokkos::deep_copy(arr, host_view); using matrix_t = KokkosDVector; - matrix_t H( - Map<>(Communicator(), SlabLayoutV({{0, 0, ncols, ncols}}))); + matrix_t H(Map<>(Communicator(), SlabLayoutV({{0, 0, ncols, ncols}}))); inner(H, X, X); auto S = H.copy(); @@ -48,22 +47,23 @@ run_unmanaged() solve_sym(H /* will be overwritten by Cholesky factorization */, S /* will be overwritten by solution */); - std::cout << "extent " << S.array().extent(0) << "\n"; - std::cout << "extent " << S.array().extent(1) << "\n"; + std::cout << "extent " << S.array().extent(0) << "\n"; + std::cout << "extent " << S.array().extent(1) << "\n"; auto S_host = Kokkos::create_mirror(S.array()); Kokkos::deep_copy(S_host, S.array()); // print result for (int i = 0; i < ncols; ++i) { for (int j = 0; j < ncols; ++j) { - std::cout << S_host(i,j) << " "; + std::cout << S_host(i, j) << " "; } std::cout << "\n"; } } -int main(int argc, char *argv[]) +int +main(int argc, char *argv[]) { Kokkos::initialize(); Communicator::init(argc, argv); @@ -76,7 +76,8 @@ int main(int argc, char *argv[]) } std::cout << "thread_count: " << threads_count; - std::cout << "run on HOST" << "\n"; + std::cout << "run on HOST" + << "\n"; run_unmanaged(); std::cout << "run on DEVICE" diff --git a/test/test_stride_dvector.cpp b/test/test_stride_dvector.cpp index d3e689c..67165e8 100644 --- a/test/test_stride_dvector.cpp +++ b/test/test_stride_dvector.cpp @@ -1,22 +1,23 @@ +#include #include #include -#include #include "la/dvector.hpp" -auto unmanaged() +auto +unmanaged() { std::cout << "\nunmanaged\n"; int n = 10; - double* A = new double[n*n]; + double* A = new double[n * n]; for (int i = 0; i < n; ++i) { for (int j = 0; j < n; ++j) { - A[n*i + j] = n * i + j; + A[n * i + j] = n * i + j; } } - Kokkos::View > a_view( + Kokkos::View> a_view( A, n, n); for (int i = 0; i < n; ++i) { @@ -30,31 +31,33 @@ auto unmanaged() } -auto unmanaged_strided() +auto +unmanaged_strided() { std::cout << "\nunmanaged_strided\n"; - int n = 10; // rows - int m = 10; // cols + int n = 10; // rows + int m = 10; // cols int lda = 12; double* A; - int res = posix_memalign(reinterpret_cast(&A), 256, 
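
The entropy check in test_smearing.cpp above encodes an invariant worth keeping in mind: the entropy is summed over all k-points, so after the k-point reduction every rank must hold the identical value. A sketch of the pattern, using the scalar allreduce overload:

    #include <stdexcept>
    #include "mpi/communicator.hpp"

    void check_entropy_consistent(const nlcglib::Communicator& comm, double S)
    {
      double smax = comm.allreduce(S, nlcglib::mpi_op::max);
      if (S != smax) throw std::runtime_error("entropy differs");
    }
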
lda*m*sizeof(double)); + int res = posix_memalign(reinterpret_cast(&A), 256, lda * m * sizeof(double)); std::cout << "A: " << A << "\n"; std::cout << "posix_memalign: " << res << "\n"; for (int i = 0; i < n; ++i) { for (int j = 0; j < m; ++j) { - A[j*lda + i] = i*n + j; + A[j * lda + i] = i * n + j; } } - nlcglib::buffer_protocol buf{std::array{1, lda}, std::array{n, m}, A, nlcglib::memory_type::host}; + nlcglib::buffer_protocol buf{ + std::array{1, lda}, std::array{n, m}, A, nlcglib::memory_type::host}; nlcglib::Map<> map(nlcglib::Communicator(), nlcglib::SlabLayoutV({{0, 0, n, m}})); nlcglib::KokkosDVector> - dvector(map, buf); + dvector(map, buf); // check data bool passed = true; @@ -64,14 +67,17 @@ auto unmanaged_strided() passed = passed && is_same; } } - if (passed) std::cout << "worked!\n"; - else std::cout << "failed!\n"; + if (passed) + std::cout << "worked!\n"; + else + std::cout << "failed!\n"; return dvector; } -int main(int argc, char *argv[]) +int +main(int argc, char* argv[]) { Kokkos::initialize(); auto x = unmanaged(); @@ -84,5 +90,4 @@ int main(int argc, char *argv[]) std::cout << x2.data() << "\n"; Kokkos::finalize(); return 0; - } diff --git a/test/test_threaded.cpp b/test/test_threaded.cpp index e9a213a..ae5789f 100644 --- a/test/test_threaded.cpp +++ b/test/test_threaded.cpp @@ -2,8 +2,8 @@ #include "la/lapack.hpp" #include -#include #include +#include using namespace nlcglib; @@ -32,7 +32,8 @@ run_stacked_vector() } -int main(int argc, char *argv[]) +int +main(int argc, char *argv[]) { Kokkos::initialize(); MPI_Init(&argc, &argv); diff --git a/test/test_traits.cpp b/test/test_traits.cpp index 67905e4..2a531f3 100644 --- a/test/test_traits.cpp +++ b/test/test_traits.cpp @@ -7,15 +7,21 @@ class Foo { public: Foo() = default; - Foo(const Foo& ) { std::cout << "copy ctr" << "\n";} + Foo(const Foo&) + { + std::cout << "copy ctr" + << "\n"; + } Foo(Foo&&) { - std::cout << "move ctr" << "\n"; + std::cout << "move ctr" + << "\n"; } }; -int main(int argc, char *argv[]) +int +main(int argc, char* argv[]) { auto x = eval(Foo()); return 0; diff --git a/test/test_utils.cpp b/test/test_utils.cpp index 3e4a593..110c1c6 100644 --- a/test/test_utils.cpp +++ b/test/test_utils.cpp @@ -3,10 +3,14 @@ using namespace nlcglib; -template -struct print_type {}; +template +struct print_type +{ +}; -void test_unzip() { +void +test_unzip() +{ mvector> Z; auto X = unzip(Z); auto x0 = std::get<0>(X); @@ -14,7 +18,8 @@ void test_unzip() { } -int main(int argc, char *argv[]) +int +main(int argc, char *argv[]) { test_unzip(); return 0; diff --git a/unit_tests/CMakeLists.txt b/unit_tests/CMakeLists.txt index 564a067..9a81468 100644 --- a/unit_tests/CMakeLists.txt +++ b/unit_tests/CMakeLists.txt @@ -7,5 +7,5 @@ endif() if(BUILD_TESTS) add_executable(gtest local/test_la_wrappers.cpp local/test_solver_wrappers.cpp) target_link_libraries(gtest PUBLIC nlcglib_core) - target_link_libraries(gtest PRIVATE GTest::GTest GTest::Main) + target_link_libraries(gtest PRIVATE GTest::gtest GTest::gtest_main) endif() diff --git a/unit_tests/local/test_la_wrappers.cpp b/unit_tests/local/test_la_wrappers.cpp index 347caaa..29a4d79 100644 --- a/unit_tests/local/test_la_wrappers.cpp +++ b/unit_tests/local/test_la_wrappers.cpp @@ -1,10 +1,10 @@ #include -#include "hip/hip_space.hpp" +#include #include +#include "hip/hip_space.hpp" #include "la/dvector.hpp" #include "la/lapack.hpp" #include "la/magma.hpp" -#include using namespace nlcglib; @@ -51,8 +51,7 @@ class CPUKokkosVectors : public 
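
Both unmanaged_strided() tests in this patch wrap a padded, column-major allocation (leading dimension lda larger than the row count) without copying. A self-contained sketch of the same construction in plain Kokkos terms:

    #include <Kokkos_Core.hpp>
    #include <cstdlib>

    void wrap_padded()
    {
      const int n = 10, m = 10, lda = 12;
      double* A = nullptr;
      posix_memalign(reinterpret_cast<void**>(&A), 256, lda * m * sizeof(double));  // POSIX
      // dimension/stride pairs: extent n with stride 1, extent m with stride lda
      Kokkos::LayoutStride layout(n, 1, m, lda);
      Kokkos::View<double**, Kokkos::LayoutStride, Kokkos::HostSpace,
                   Kokkos::MemoryTraits<Kokkos::Unmanaged>>
          v(A, layout);
      v(2, 3) = 1.0;  // touches A[3 * lda + 2]
      std::free(A);
    }
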
::testing::Test } } - typedef KokkosDVector - vector_t; + typedef KokkosDVector vector_t; vector_t a_; vector_t b_; @@ -62,8 +61,7 @@ class CPUKokkosVectors : public ::testing::Test TEST_F(CPUKokkosVectors, InnerProductCPU) { - typedef KokkosDVector - vector_t; + typedef KokkosDVector vector_t; int n = 5; vector_t c(Map<>(Communicator(), SlabLayoutV({{0, 0, n, n}}))); inner(c, a_, b_); @@ -83,8 +81,7 @@ TEST_F(CPUKokkosVectors, InnerProductCPU) TEST_F(CPUKokkosVectors, TransformCPU) { - typedef KokkosDVector - vector_t; + typedef KokkosDVector vector_t; int n = a_.map().ncols(); int m = a_.map().nrows(); vector_t c(Map<>(Communicator(), SlabLayoutV({{0, 0, m, n}}))); @@ -153,15 +150,16 @@ class GPUKokkosVectors : public ::testing::Test auto U = u_.array(); Kokkos::parallel_for( "init", mdrange_policy({0, 0}, {5, 5}), KOKKOS_LAMBDA(int i, int j) { - M(i, j) = c_arr[i * n + j]; - if (i == j) U(i, j) = 1; - else U(i, j) = 0; + M(i, j) = c_arr[i * n + j]; + if (i == j) + U(i, j) = 1; + else + U(i, j) = 0; }); } protected: - typedef KokkosDVector - vector_t; + typedef KokkosDVector vector_t; vector_t a_; vector_t b_; @@ -172,14 +170,12 @@ class GPUKokkosVectors : public ::testing::Test TEST_F(GPUKokkosVectors, InnerProductGPU) { - typedef KokkosDVector - vector_t; + typedef KokkosDVector vector_t; int n = 5; vector_t c(Map<>(Communicator(), SlabLayoutV({{0, 0, n, n}}))); inner(c, a_, b_); // host vector type - typedef KokkosDVector - h_vector_t; + typedef KokkosDVector h_vector_t; h_vector_t h_c(c.map()); h_vector_t h_cRef(c.map()); @@ -199,15 +195,13 @@ TEST_F(GPUKokkosVectors, InnerProductGPU) TEST_F(GPUKokkosVectors, TransformGPU) { - typedef KokkosDVector - vector_t; + typedef KokkosDVector vector_t; int n = a_.map().ncols(); int m = a_.map().nrows(); vector_t c(Map<>(Communicator(), SlabLayoutV({{0, 0, m, n}}))); transform(c, 0., 1., a_, u_); // host vector type - typedef KokkosDVector - h_vector_t; + typedef KokkosDVector h_vector_t; h_vector_t h_c(c.map()); h_vector_t h_cRef(c.map()); @@ -226,8 +220,7 @@ TEST(EigenValues, EigHermitian) { // Poisson matrix: n =5, ones on diagonal, -2 on first off-diagonals typedef std::complex numeric_t; - typedef KokkosDVector - vector_t; + typedef KokkosDVector vector_t; const std::vector _Varr = { 2.88675135e-01, 5.00000000e-01, 5.77350269e-01, 5.00000000e-01, 2.88675135e-01, @@ -250,7 +243,7 @@ TEST(EigenValues, EigHermitian) Kokkos::parallel_for( "init", mdrange_policy({0, 0}, {5, 5}), KOKKOS_LAMBDA(int i, int j) { A_array(i, i) = 1; - if (std::abs(i-j) == 1) A_array(i, j) = -2; + if (std::abs(i - j) == 1) A_array(i, j) = -2; }); // @@ -271,7 +264,8 @@ TEST(EigenValues, EigHermitian) auto wh = Kokkos::create_mirror(w); Kokkos::deep_copy(wh, w); - std::cout << "eigenvalues" << "\n"; + std::cout << "eigenvalues" + << "\n"; for (int i = 0; i < wh.extent(0); ++i) { // double err = eigs[i] - wh(i); EXPECT_NEAR(wh(i), eigs[i], 1e-8); @@ -289,17 +283,17 @@ main(int argc, char *argv[]) Kokkos::initialize(); - #ifdef __NLCGLIB__MAGMA +#ifdef __NLCGLIB__MAGMA nlcg_init_magma(); - #endif +#endif result = RUN_ALL_TESTS(); Kokkos::finalize(); - #ifdef __NLCGLIB__MAGMA +#ifdef __NLCGLIB__MAGMA nlcg_finalize_magma(); - #endif +#endif Communicator::finalize(); diff --git a/unit_tests/local/test_solver_wrappers.cpp b/unit_tests/local/test_solver_wrappers.cpp index 5c1c14d..5f7cf59 100644 --- a/unit_tests/local/test_solver_wrappers.cpp +++ b/unit_tests/local/test_solver_wrappers.cpp @@ -1,11 +1,11 @@ #include -#include "hip/hip_space.hpp" #include #include +#include 
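
The reference data in the EigHermitian test above follows from the closed form for a symmetric tridiagonal Toeplitz matrix (diagonal a = 1, off-diagonals b = -2, n = 5):

\[
\lambda_k = 1 - 4\cos\frac{k\pi}{6}, \qquad
v_k(j) = \frac{1}{\sqrt{3}} \sin\frac{j k \pi}{6}, \qquad k, j = 1, \dots, 5,
\]

so e.g. \(\lambda_2 = -1\) and \(v_1(1) = \sin(\pi/6)/\sqrt{3} \approx 2.88675 \times 10^{-1}\), the first entry of _Varr.
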
#include "gtest/gtest.h" +#include "hip/hip_space.hpp" #include "la/dvector.hpp" #include "la/lapack.hpp" -#include using namespace nlcglib; @@ -15,7 +15,7 @@ using complex_double = Kokkos::complex; int nrows = 200; int ncols = 20; -template +template class TestSymSolve : public ::testing::Test { public: @@ -23,7 +23,9 @@ class TestSymSolve : public ::testing::Test TestSymSolve() : X(Map<>(Communicator(), SlabLayoutV({{0, 0, nrows, ncols}}))) , H(Map<>(Communicator(), SlabLayoutV({{0, 0, ncols, ncols}}))) - {} + { + } + protected: using vector_t = KokkosDVector; // @@ -33,7 +35,8 @@ class TestSymSolve : public ::testing::Test template -void TestSymSolve::SetUp() +void +TestSymSolve::SetUp() { std::uniform_real_distribution unif01(0, 1); std::mt19937 gen(0); @@ -77,11 +80,10 @@ TYPED_TEST(TestSymSolve, PotrfPotrs) // check result for (int i = 0; i < ncols; ++i) { for (int j = 0; j < ncols; ++j) { - if ( i == j ) { + if (i == j) { EXPECT_NEAR(S_host(i, j).real(), 1, 1e-8); EXPECT_NEAR(S_host(i, j).imag(), 0, 1e-8); - } - else { + } else { EXPECT_NEAR(S_host(i, j).imag(), 0, 1e-8); EXPECT_NEAR(S_host(i, j).real(), 0, 1e-8); }