From 674039fd2b131ce12d46d105b437265419999197 Mon Sep 17 00:00:00 2001 From: Simon Pintarelli <1237199+simonpintarelli@users.noreply.github.com> Date: Thu, 6 Jun 2024 10:00:31 +0200 Subject: [PATCH] distributed wfc support / misc updates (#20) * remove enable_language(CUDA) * enable distributed wave-functions * update spack recipe for intel-oneapi-mkl * cmake: simplify blas/mkl * add check-format * remove kokkos <4 initialization --- .github/workflows/check-format.yml | 15 ++ .gitignore | 1 + CMakeLists.txt | 26 ++- check_format.sh | 15 ++ cmake/nlcglib_macros.cmake | 28 +-- include/interface.hpp | 32 ++- include/nlcglib.hpp | 18 +- spack/packages/nlcglib/package.py | 29 +++ src/CMakeLists.txt | 11 - src/constants.hpp | 6 +- src/dft/newton_minimization_smearing.hpp | 4 +- src/exceptions.hpp | 2 +- src/exec_space.hpp | 4 +- src/free_energy.hpp | 3 +- src/geodesic.hpp | 1 - src/gpu/acc.cpp | 2 +- src/gpu/acc.hpp | 6 +- src/hip/hip_space.hpp | 2 +- src/la/cublas.hpp | 7 +- src/la/dvector.hpp | 39 +++- src/la/lapack.hpp | 13 +- src/la/lapack_cpu.hpp | 105 ++++----- src/la/lapack_cuda.hpp | 91 ++++---- src/la/lapack_rocm.hpp | 100 ++++----- src/la/magma.cpp | 14 +- src/la/magma.hpp | 17 +- src/la/mvector.hpp | 248 ++++++++++----------- src/la/rocblas.hpp | 5 +- src/la/utils.hpp | 25 +-- src/linesearch/linesearch.hpp | 55 ++--- src/mpi/communicator.hpp | 25 +++ src/mpi/mpi_type.hpp | 18 +- src/mvp2/descent_direction_impl.hpp | 2 - src/nlcglib.cpp | 28 +-- src/operator.hpp | 1 - src/overlap.hpp | 21 +- src/preconditioner.hpp | 29 ++- src/pseudo_hamiltonian/grad_eta.hpp | 11 +- src/smearing.hpp | 11 +- src/traits.hpp | 30 ++- src/ultrasoft_precond.hpp | 23 +- src/utils/format.hpp | 9 +- src/utils/logger.hpp | 15 +- src/utils/step_logger.hpp | 33 +-- src/utils/timer.hpp | 8 +- test/test.cpp | 33 +-- test/test3.cpp | 4 +- test/test_buffer.cpp | 3 +- test/test_deep_copy.cpp | 28 ++- test/test_dvector_mirror_view_and_copy.cpp | 20 +- test/test_gpu.cpp | 12 +- test/test_kokkos.cpp | 65 +++--- test/test_mpi_wrapper.cpp | 9 +- test/test_mvector_cuda.cpp | 9 +- test/test_smearing.cpp | 21 +- test/test_solver.cpp | 17 +- test/test_stride_dvector.cpp | 47 ++-- test/test_threaded.cpp | 5 +- test/test_traits.cpp | 12 +- test/test_utils.cpp | 13 +- unit_tests/CMakeLists.txt | 2 +- unit_tests/local/test_la_wrappers.cpp | 52 ++--- unit_tests/local/test_solver_wrappers.cpp | 18 +- 63 files changed, 816 insertions(+), 742 deletions(-) create mode 100644 .github/workflows/check-format.yml create mode 100755 check_format.sh diff --git a/.github/workflows/check-format.yml b/.github/workflows/check-format.yml new file mode 100644 index 0000000..3423c0b --- /dev/null +++ b/.github/workflows/check-format.yml @@ -0,0 +1,15 @@ +name: Check source code formatting + +on: + push: {} + pull_request: {} + +jobs: + check: + runs-on: ubuntu-latest + container: zhongruoyu/llvm-ports:17.0.4-slim-focal + steps: + - uses: actions/checkout@v4 + - name: Check .cpp and .hpp files + run: | + ./check_format.sh diff --git a/.gitignore b/.gitignore index b50ee76..4a8fa95 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,4 @@ spack-build-* *~undo-tree~ __pycache__/ compile_commands.json +build-linux-* diff --git a/CMakeLists.txt b/CMakeLists.txt index 8971a32..099f063 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -7,6 +7,7 @@ set(USE_OPENMP On CACHE BOOL "use OpenMP") set(USE_CUDA Off CACHE BOOL "use cuda") set(USE_ROCM Off CACHE BOOL "use amd gpus") set(USE_MAGMA Off CACHE BOOL "use magma eigensolver for amd gpus") 
+set(USE_GPU_DIRECT Off CACHE BOOL "use gpu direct") set(BUILD_TESTS OFF CACHE BOOL "build tests") set(LAPACK_VENDOR "OpenBLAS" CACHE STRING "lapack vendor") @@ -21,22 +22,19 @@ endif() set(CMAKE_POSITION_INDEPENDENT_CODE ON) set(CMAKE_EXPORT_COMPILE_COMMANDS "YES") -# set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall") include(cmake/nlcglib_macros.cmake) list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake/modules") -find_package(Kokkos) if(USE_CUDA) find_package(CUDAToolkit REQUIRED) - enable_language(CUDA) include(cmake/cudalibs_target.cmake) endif() - if(USE_ROCM) include(cmake/rocmlibs_target.cmake) endif() +find_package(Kokkos) if(USE_MAGMA) find_package(MAGMA REQUIRED) @@ -53,12 +51,24 @@ if(LAPACK_VENDOR MATCHES OpenBLAS) INTERFACE_INCLUDE_DIRECTORIES "${OpenBLAS_INCLUDE_DIRS}" INTERFACE_LINK_LIBRARIES "${OpenBLAS_LIBRARIES}") endif() -elseif(LAPACK_VENDOR MATCHES MKL) - message("LAPACK VENDOR MKL") - find_package(MKL REQUIRED) elseif(LAPACK_VENDOR STREQUAL MKLONEAPI) - # set(MKL_THREADING gnu_thread) + message("LAPACK VENDOR MKL") + set(MKL_INTERFACE "lp64" CACHE STRING "") + set(MKL_THREADING "sequential" CACHE STRING "") + set(MKL_MPI "mpich" CACHE STRING "") + find_package(MKL CONFIG REQUIRED) + if(NOT TARGET nlcg::cpu_lapack) + add_library(nlcg::cpu_lapack INTERFACE IMPORTED) + target_link_libraries(nlcg::cpu_lapack INTERFACE MKL::MKL) + target_compile_definitions(nlcg::cpu_lapack INTERFACE __USE_MKL) + endif() +elseif(LAPACK_VENDOR STREQUAL MKL) find_package(MKL REQUIRED NO_MODULE) + if(NOT TARGET nlcg::cpu_lapack) + add_library(nlcg::cpu_lapack INTERFACE IMPORTED) + target_link_libraries(nlcg::cpu_lapack INTERFACE mkl::mkl_intel_32bit_omp_dyn) + target_compile_definitions(nlcg::cpu_lapack INTERFACE __USE_MKL) + endif() elseif(LAPACK_VENDOR STREQUAL CRAY_LIBSCI) message("LAPACK VENDOR Cray Libsci") find_package(SCI REQUIRED) diff --git a/check_format.sh b/check_format.sh new file mode 100755 index 0000000..c4d05f0 --- /dev/null +++ b/check_format.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +check_diff() { + local status=0 + for file in "$@"; do + if ! diff -q "$file" <(clang-format "$file"); then + status=1 + fi + done + return $status +} + +export -f check_diff + +find . -type f \( -name "*.cpp" -o -name "*.hpp" \) ! -path "./build-env/*" ! 
-path "./.*" -exec bash -c 'check_diff "$@"' sh {} + diff --git a/cmake/nlcglib_macros.cmake b/cmake/nlcglib_macros.cmake index b956797..9852bce 100644 --- a/cmake/nlcglib_macros.cmake +++ b/cmake/nlcglib_macros.cmake @@ -2,42 +2,30 @@ MACRO(NLCGLIB_SETUP_TARGET _target) target_link_libraries( ${_target} PUBLIC Kokkos::kokkos - # ${LAPACK_LIBRARIES} MPI::MPI_CXX - # $ + $ $ $ $ $ # only required for magma $ # only required for magma nlohmann_json::nlohmann_json - ) + nlcg::cpu_lapack + ) target_include_directories(${_target} PUBLIC ${CMAKE_SOURCE_DIR}/src ${CMAKE_SOURCE_DIR}/include - ) + ) - if(USE_ROCM) - target_compile_options(${_target} PUBLIC --offload-arch=gfx90a) - endif() - - if(LAPACK_VENDOR MATCHES MKL) - target_compile_definitions(${_target} PUBLIC __USE_MKL) - # if(USE_OPENMP) - target_link_libraries(${_target} PUBLIC mkl::mkl_intel_32bit_omp_dyn) - # else() - # target_link_libraries(${_target} PUBLIC mkl::mkl_intel_32bit_seq_dyn) - # endif() - elseif(LAPACK_VENDOR STREQUAL MKLONEAPI) - target_link_libraries(${_target} PUBLIC MKL::MKL) - else() - target_link_libraries(${_target} PRIVATE nlcg::cpu_lapack) + if(USE_ROCM) + target_compile_options(${_target} PUBLIC --offload-arch=gfx90a) endif() + target_compile_definitions(${_target} PUBLIC $<$:__USE_OPENMP>) target_compile_definitions(${_target} PUBLIC $<$:__NLCGLIB__CUDA>) target_compile_definitions(${_target} PUBLIC $<$:__NLCGLIB__ROCM>) target_compile_definitions(${_target} PUBLIC $<$:__NLCGLIB__MAGMA>) + target_compile_definitions(${_target} PUBLIC $<$:__NLCGLIB__GPU_DIRECT>) target_include_directories(${_target} PUBLIC $) - ENDMACRO() diff --git a/include/interface.hpp b/include/interface.hpp index a5dd4fe..2fe525f 100644 --- a/include/interface.hpp +++ b/include/interface.hpp @@ -2,10 +2,10 @@ #include #include -#include +#include #include +#include #include -#include #include #include "mpi.h" @@ -18,9 +18,8 @@ enum class memory_type device }; -static std::map memory_names = {{memory_type::none, "none"}, - {memory_type::host, "host"}, - {memory_type::device, "device"}}; +static std::map memory_names = { + {memory_type::none, "none"}, {memory_type::host, "host"}, {memory_type::device, "device"}}; enum class smearing_type { @@ -50,20 +49,18 @@ struct buffer_protocol std::array size, T* data, enum memory_type memtype, - MPI_Comm mpi_comm=MPI_COMM_SELF) + MPI_Comm mpi_comm = MPI_COMM_SELF) : stride(std::move(stride)) , size(std::move(size)) , data(data) , memtype(memtype) , mpi_comm(mpi_comm) - { /* empty */ } + { /* empty */ + } // 1d constructor // template> - buffer_protocol(int size, - T* data, - enum memory_type memtype, - MPI_Comm mpi_comm= MPI_COMM_SELF) + buffer_protocol(int size, T* data, enum memory_type memtype, MPI_Comm mpi_comm = MPI_COMM_SELF) : buffer_protocol({1}, {size}, data, memtype, mpi_comm) { static_assert(d == 1, "not available."); @@ -79,7 +76,7 @@ struct buffer_protocol MPI_Comm mpi_comm{MPI_COMM_SELF}; }; -template +template class BufferBase { public: @@ -100,7 +97,7 @@ class BufferBase virtual kindex_t kpoint_index(int i) const = 0; }; -template +template class BufferBase<0, numeric_t> { public: @@ -140,23 +137,24 @@ class EnergyBase virtual std::shared_ptr get_sphi(memory_type) = 0; virtual std::shared_ptr get_C(memory_type) = 0; virtual std::shared_ptr get_fn() = 0; - virtual void set_fn(const std::vector>&, const std::vector>&) = 0; + virtual void set_fn(const std::vector>&, + const std::vector>&) = 0; virtual std::shared_ptr get_ek() = 0; virtual std::shared_ptr get_gkvec_ekin() = 0; virtual 
std::shared_ptr get_kpoint_weights() = 0; virtual void set_chemical_potential(double) = 0; virtual double get_chemical_potential() = 0; virtual void print_info() const = 0; + virtual MPI_Comm comm_world() const = 0; }; class OpBase { public: using key_t = std::pair; + public: - virtual void apply(const key_t&, - MatrixBaseZ::buffer_t& out, - MatrixBaseZ::buffer_t& in) const = 0; + virtual void apply(const key_t&, MatrixBaseZ::buffer_t& out, MatrixBaseZ::buffer_t& in) const = 0; virtual std::vector get_keys() const = 0; }; diff --git a/include/nlcglib.hpp b/include/nlcglib.hpp index 9bc3f1e..bb5a97e 100644 --- a/include/nlcglib.hpp +++ b/include/nlcglib.hpp @@ -4,8 +4,10 @@ namespace nlcglib { -void initialize(); -void finalize(); +void +initialize(); +void +finalize(); nlcg_info nlcg_mvp2_cpu(EnergyBase& energy_base, @@ -28,12 +30,12 @@ nlcg_mvp2_device(EnergyBase& energy_base, nlcg_info nlcg_mvp2_cpu_device(EnergyBase& energy_base, - smearing_type smearing, - double temp, - double tol, - double kappa, - double tau, - int maxiter, + smearing_type smearing, + double temp, + double tol, + double kappa, + double tau, + int maxiter, int restart); nlcg_info nlcg_mvp2_device_cpu(EnergyBase& energy_base, diff --git a/spack/packages/nlcglib/package.py b/spack/packages/nlcglib/package.py index 47a54db..a140d7a 100644 --- a/spack/packages/nlcglib/package.py +++ b/spack/packages/nlcglib/package.py @@ -28,6 +28,7 @@ class Nlcglib(CMakePackage, CudaPackage, ROCmPackage): description="CMake build type", values=("Debug", "Release", "RelWithDebInfo"), ) + variant("gpu_direct", default=False) depends_on("cmake@3.21:", type="build") depends_on("mpi") @@ -38,6 +39,10 @@ class Nlcglib(CMakePackage, CudaPackage, ROCmPackage): depends_on("googletest", type="build", when="+tests") depends_on("nlohmann-json") + depends_on("kokkos@4:", when="@1.1:") + + # MKLConfig.cmake introduced in 2021.3 + conflicts("intel-oneapi-mkl@:2021.2", when="^intel-oneapi-mkl") with when("@:0.9"): conflicts("+rocm") @@ -59,6 +64,7 @@ def cmake_args(self): self.define_from_variant("USE_OPENMP", "openmp"), self.define_from_variant("BUILD_TESTS", "tests"), self.define_from_variant("USE_ROCM", "rocm"), + self.define_from_variant("USE_GPU_DIRECT", "gpu_direct"), self.define_from_variant("USE_MAGMA", "magma"), self.define_from_variant("USE_CUDA", "cuda"), ] @@ -67,6 +73,29 @@ def cmake_args(self): options += [self.define("LAPACK_VENDOR", "MKL")] elif self.spec["blas"].name in ["intel-oneapi-mkl"]: options += [self.define("LAPACK_VENDOR", "MKLONEAPI")] + mkl_mapper = { + "threading": { + "none": "sequential", + "openmp": "gnu_thread", + "tbb": "tbb_thread", + }, + "mpi": {"intel-mpi": "intelmpi", "mpich": "mpich", "openmpi": "openmpi"}, + } + + mkl_threads = mkl_mapper["threading"][self.spec["intel-oneapi-mkl"].variants["threads"].value] + + mpi_provider = self.spec["mpi"].name + if mpi_provider in ["mpich", "cray-mpich", "mvapich", "mvapich2"]: + mkl_mpi = mkl_mapper["mpi"]["mpich"] + else: + mkl_mpi = mkl_mapper["mpi"][mpi_provider] + + options.extend([ + self.define("MKL_INTERFACE", "lp64"), + self.define("MKL_THREADING", mkl_threads), + self.define("MKL_MPI", mkl_mpi) + ]) + elif self.spec["blas"].name in ["openblas"]: options += [self.define("LAPACK_VENDOR", "OpenBLAS")] else: diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 1fb6969..c50cfe1 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -11,17 +11,6 @@ add_library(nlcglib SHARED nlcglib.cpp) target_include_directories(nlcglib PUBLIC $ $) 
target_link_libraries(nlcglib PRIVATE nlcglib_core) -if(LAPACK_VENDOR MATCHES MKL) - target_compile_definitions(nlcglib PRIVATE __USE_MKL) - if(USE_OPENMP) - target_link_libraries(nlcglib PRIVATE mkl::mkl_intel_32bit_omp_dyn) - else() - target_link_libraries(nlcglib PRIVATE mkl::mkl_intel_32bit_seq_st) - endif() -else() - target_link_libraries(nlcglib PRIVATE nlcg::cpu_lapack) -endif() - set_target_properties(nlcglib PROPERTIES PUBLIC_HEADER ${CMAKE_SOURCE_DIR}/include/nlcglib.hpp ) diff --git a/src/constants.hpp b/src/constants.hpp index b87b468..e54711d 100644 --- a/src/constants.hpp +++ b/src/constants.hpp @@ -4,10 +4,10 @@ namespace nlcglib { namespace constants { const double pi{3.1415926535897932385}; -} // constants +} // namespace constants namespace physical_constants { const double kb{0.00000316681156340226}; -} // physical_constants +} // namespace physical_constants -} // nlcglib +} // namespace nlcglib diff --git a/src/dft/newton_minimization_smearing.hpp b/src/dft/newton_minimization_smearing.hpp index 583cf29..733a44d 100644 --- a/src/dft/newton_minimization_smearing.hpp +++ b/src/dft/newton_minimization_smearing.hpp @@ -28,8 +28,8 @@ double newton_minimization_chemical_potential( Nt&& N, DNt&& dN, D2Nt&& ddN, double mu0, double ne, double tol, int maxstep = 1000) { - // Newton finds the minimum, not necessarily N(mu) == ne, tolerate up to `tol_ne` difference in number of electrons - // if |N(mu_0) -ne| > tol_ne an error is thrown. + // Newton finds the minimum, not necessarily N(mu) == ne, tolerate up to `tol_ne` difference in + // number of electrons if |N(mu_0) -ne| > tol_ne an error is thrown. const double tol_ne = 1e-2; double mu = mu0; diff --git a/src/exceptions.hpp b/src/exceptions.hpp index 065454b..d9f8fbe 100644 --- a/src/exceptions.hpp +++ b/src/exceptions.hpp @@ -25,4 +25,4 @@ class DescentError : public std::exception }; -} // nlcglib +} // namespace nlcglib diff --git a/src/exec_space.hpp b/src/exec_space.hpp index 5ba443e..8d2ae5b 100644 --- a/src/exec_space.hpp +++ b/src/exec_space.hpp @@ -30,7 +30,7 @@ template <> struct exec { #ifdef __USE_OPENMP - using type = Kokkos::OpenMP; + using type = Kokkos::OpenMP; #else using type = Kokkos::Serial; #endif @@ -40,4 +40,4 @@ template using exec_t = typename exec::type; -} // nlcglib +} // namespace nlcglib diff --git a/src/free_energy.hpp b/src/free_energy.hpp index d4e1e43..22c6e67 100644 --- a/src/free_energy.hpp +++ b/src/free_energy.hpp @@ -92,7 +92,8 @@ FreeEnergy::compute(const mvector& X, const mvector& fn, const mvector - #include "la/lapack.hpp" #include "la/utils.hpp" diff --git a/src/gpu/acc.cpp b/src/gpu/acc.cpp index c3573a0..1a2e972 100644 --- a/src/gpu/acc.cpp +++ b/src/gpu/acc.cpp @@ -2,9 +2,9 @@ #include #include #include +#include #include #include -#include namespace nlcglib { #define CALL_DEVICE_API(func__, args__) \ diff --git a/src/gpu/acc.hpp b/src/gpu/acc.hpp index c962c0d..43cdd6f 100644 --- a/src/gpu/acc.hpp +++ b/src/gpu/acc.hpp @@ -5,9 +5,9 @@ namespace nlcglib { namespace acc { -template +template void copy(T* target, const U* src, size_t n); -} // acc -} // nlcglib +} // namespace acc +} // namespace nlcglib diff --git a/src/hip/hip_space.hpp b/src/hip/hip_space.hpp index 66d9251..6d9e2cb 100644 --- a/src/hip/hip_space.hpp +++ b/src/hip/hip_space.hpp @@ -3,5 +3,5 @@ #include #else #include -#endif // KOKKOS > 3.7.01 +#endif // KOKKOS > 3.7.01 #endif diff --git a/src/la/cublas.hpp b/src/la/cublas.hpp index 030ce6d..01ce63b 100644 --- a/src/la/cublas.hpp +++ b/src/la/cublas.hpp @@ -12,11 
+12,12 @@ struct cublasHandle
     static cublasHandle_t handle{nullptr};
     return handle;
   }
+
 public:
   static cublasHandle_t& get()
   {
     cublasHandle_t& handle = _get();
-    if(!handle) {
+    if (!handle) {
       cublasCreate(&handle);
     }
     return handle;
@@ -24,10 +25,10 @@ struct cublasHandle
   static void destroy()
   {
-    if(!_get()) cublasDestroy(_get());
+    if (_get()) cublasDestroy(_get());
     _get() = nullptr;
   }
 };
-}  // cublas
+}  // namespace cublas
diff --git a/src/la/dvector.hpp b/src/la/dvector.hpp
index 1150061..c82db8a 100644
--- a/src/la/dvector.hpp
+++ b/src/la/dvector.hpp
@@ -1,14 +1,14 @@
 #pragma once
 
 #include
-#include "hip/hip_space.hpp"
 #include
 #include
 #include
 #include
 #include
+#include "hip/hip_space.hpp"
+#include "interface.hpp"
 #include "map.hpp"
-#include "nlcglib.hpp"
 
 namespace nlcglib {
@@ -346,5 +346,40 @@ print(const KokkosDVector& mat, O&& out, int precision = 4)
   }
 }
 
+template
+void
+allreduce(KokkosDVector& C, const Communicator& comm)
+{
+  typename KokkosDVector::storage_t::execution_space ex;
+  ex.fence();
+#ifdef __NLCGLIB__GPU_DIRECT
+  auto C_ptr = C.array().data();
+  int m = C.map().nrows();
+  int n = C.map().ncols();
+  if (C.array().stride(0) != 1) {
+    throw std::runtime_error("allreduce, expected stride(0) == 1");
+  }
+  if (C.array().stride(1) != m) {
+    throw std::runtime_error("allreduce, expected stride(1) == nrows");
+  }
+
+  comm.allreduce(C_ptr, m * n, mpi_op::sum);
+#else
+  auto C_h = create_mirror_view_and_copy(Kokkos::HostSpace(), C);
+  auto C_ptr = C_h.array().data();
+  int m = C_h.map().nrows();
+  int n = C_h.map().ncols();
+  if (static_cast(C_h.array().stride(0)) != 1) {
+    throw std::runtime_error("allreduce, expected stride(0) == 1");
+  }
+  if (static_cast(C_h.array().stride(1)) != m) {
+    throw std::runtime_error("allreduce, expected stride(1) == nrows");
+  }
+  comm.allreduce(C_ptr, m * n, mpi_op::sum);
+  // copy back to original memory
+  deep_copy(C, C_h);
+#endif
+}
+
 }  // namespace nlcglib
diff --git a/src/la/lapack.hpp b/src/la/lapack.hpp
index bf411c3..662a5b5 100644
--- a/src/la/lapack.hpp
+++ b/src/la/lapack.hpp
@@ -1,10 +1,12 @@
 #pragma once
 
-#include "hip/hip_space.hpp"
+#include
 #include
 #include
+#include "hip/hip_space.hpp"
 #include "la/map.hpp"
 #include "lapack_cpu.hpp"
+#include "mpi/communicator.hpp"
 #ifdef __NLCGLIB__CUDA
 #include "lapack_cuda.hpp"
 #endif
@@ -240,7 +242,7 @@ struct inner_
   {
     int n = A.map().ncols();
     int m = B.map().ncols();
-    Map map(A.map().comm(), SlabLayoutV({{0, 0, n, m}}));
+    Map map(Communicator(), SlabLayoutV({{0, 0, n, m}}));
     to_layout_left_t C(map);
     inner(C, A, B, alpha, beta);
     return C;
@@ -250,7 +252,7 @@ struct inner_
 /// Hermitian inner product, summed
 struct innerh_tr
 {
-#if defined(__NLCGLIB__CUDA) || defined(__NLCGLIB__ROCM)
+#if defined(__NLCGLIB__CUDA) || defined(__NLCGLIB__ROCM)
   template
   std::enable_if_t<
     !Kokkos::SpaceAccessibility::accessible,
@@ -285,6 +287,9 @@ struct innerh_tr
       Kokkos::RangePolicy>(0, nrows),
       KOKKOS_LAMBDA(int i, T& lsum) { lsum += tmp(i); },
       sum);
+    exec_t spc;
+    spc.fence();
+    sum = X.map().comm().allreduce(sum, mpi_op::sum);
     return sum;
   }
 #endif
@@ -321,7 +326,7 @@ struct innerh_tr
       Kokkos::RangePolicy(0, nrows),
       KOKKOS_LAMBDA(int i, T& lsum) { lsum += tmp(i); },
       sum);
-
+    sum = X.map().comm().allreduce(sum, mpi_op::sum);
     return sum;
   }
 };
diff --git a/src/la/lapack_cpu.hpp b/src/la/lapack_cpu.hpp
index cb0e2e6..9b286e9 100644
--- a/src/la/lapack_cpu.hpp
+++ b/src/la/lapack_cpu.hpp
@@ -47,7 +47,7 @@ eigh(KokkosDVector& U,
     );
     if (info != 0) throw std::runtime_error("cblas zheevd failed");
   } else {
-    throw std::runtime_error("not yet implemented");
+    throw std::runtime_error("eigh: not yet implemented");
   }
 }
 
@@ -81,7 +81,7 @@ solve_sym(KokkosDVector& A,
     KokkosDVector potrs_t;
     potrs_t::call(order, uplo, n, nrhs, ptr_A, lda, ptr_B, ldb);
   } else {
-    throw std::runtime_error("not implemented");
+    throw std::runtime_error("solve_sym: not implemented");
   }
 }
 
@@ -120,39 +120,36 @@ inner(KokkosDVector& C,
   static_assert(std::is_same::value, "matrix layout do not match");
 
   // single rank
-  if (A.map().is_local() && B.map().is_local() && C.map().is_local()) {
-    int m = A.map().ncols();
-    int k = A.map().nrows();
-    int n = B.map().ncols();
-    numeric_t* A_ptr = A.array().data();
-    numeric_t* B_ptr = B.array().data();
-    numeric_t* C_ptr = C.array().data();
-
-    if (A.array().stride(0) != 1 || B.array().stride(0) != 1 || C.array().stride(0) != 1) {
-      throw std::runtime_error("expecting column major layout");
-    }
-    int lda = A.array().stride(1);
-    int ldb = B.array().stride(1);
-    int ldc = C.array().stride(1);
-
-    // single rank inner product
-    cblas::gemm::call(CblasColMajor,
-                      cblas::gemm::H,
-                      CblasNoTrans,
-                      m,
-                      n,
-                      k,
-                      alpha,
-                      A_ptr,
-                      lda,
-                      B_ptr,
-                      ldb,
-                      beta,
-                      C_ptr,
-                      ldc);
-  } else {
-    throw std::runtime_error("not implemented.");
+  int m = A.map().ncols();
+  int k = A.map().nrows();
+  int n = B.map().ncols();
+  numeric_t* A_ptr = A.array().data();
+  numeric_t* B_ptr = B.array().data();
+  numeric_t* C_ptr = C.array().data();
+
+  if (A.array().stride(0) != 1 || B.array().stride(0) != 1 || C.array().stride(0) != 1) {
+    throw std::runtime_error("expecting column major layout");
   }
+  int lda = A.array().stride(1);
+  int ldb = B.array().stride(1);
+  int ldc = C.array().stride(1);
+
+  // single rank inner product
+  cblas::gemm::call(CblasColMajor,
+                    cblas::gemm::H,
+                    CblasNoTrans,
+                    m,
+                    n,
+                    k,
+                    alpha,
+                    A_ptr,
+                    lda,
+                    B_ptr,
+                    ldb,
+                    beta,
+                    C_ptr,
+                    ldc);
+  allreduce(C, A.map().comm());
 }
 
 /// Inner product: c = a^H * b, on CPU
@@ -220,7 +217,7 @@ outer(KokkosDVector& C,
       C_ptr,
       ldc);
   } else {
-    throw std::runtime_error("not implemented.");
+    throw std::runtime_error("outer: not implemented.");
   }
 }
 
@@ -257,7 +254,7 @@ transform(KokkosDVector& C,
                 "a,b not on same memory");
   static_assert(std::is_same::value, "matrix layout do not match");
 
-  if (A.map().is_local() && B.map().is_local() && C.map().is_local()) {
+  if (B.map().is_local()) {
     /* single rank */
     int m = A.map().nrows();
     int n = B.map().ncols();
@@ -267,7 +264,7 @@ transform(KokkosDVector& C,
     numeric_t* C_ptr = C.array().data();
 
     if (A.array().stride(0) != 1 || B.array().stride(0) != 1 || C.array().stride(0) != 1) {
-      throw std::runtime_error("expecting column major layout");
+      throw std::runtime_error("transform: expecting column major layout");
     }
     int lda = A.array().stride(1);
     int ldb = B.array().stride(1);
@@ -289,7 +286,7 @@ transform(KokkosDVector& C,
       C_ptr,
       ldc);
   } else {
-    throw std::runtime_error("not implemented.");
+    throw std::runtime_error("transform: not implemented.");
   }
 }
 
@@ -310,26 +307,22 @@ add(M0& C,
                 typename vector1_t::storage_t::memory_space>::value,
                 "c,a not on same memory");
 
-  if (A.map().is_local() && C.map().is_local()) {
-    /* single rank */
-    int m = A.map().nrows();
-    int n = C.map().ncols();
-    numeric_t* A_ptr = A.array().data();
-    numeric_t* C_ptr = C.array().data();
-
-    if (A.array().stride(0) != 1 || C.array().stride(0) != 1) {
-      throw
std::runtime_error("expecting column major layout"); - } - // assume there are no strides - int lda = A.array().stride(1); - int ldc = C.array().stride(1); - - using geam = cblas::geam; - geam::call( - CblasColMajor, geam::N, geam::N, m, n, alpha, A_ptr, lda, beta, C_ptr, ldc, C_ptr, ldc); - } else { - throw std::runtime_error("not implemented."); + if (A.array().stride(0) != 1 || C.array().stride(0) != 1) { + throw std::runtime_error("expecting column major layout"); } + // assume there are no strides + int lda = A.array().stride(1); + int ldc = C.array().stride(1); + + using geam = cblas::geam; + geam::call( + CblasColMajor, geam::N, geam::N, m, n, alpha, A_ptr, lda, beta, C_ptr, ldc, C_ptr, ldc); } } // namespace nlcglib diff --git a/src/la/lapack_cuda.hpp b/src/la/lapack_cuda.hpp index 65dfcc1..b1acf6c 100644 --- a/src/la/lapack_cuda.hpp +++ b/src/la/lapack_cuda.hpp @@ -34,7 +34,8 @@ eigh(KokkosDVector& U, /// stores result in RHS, after the call A will contain the cholesky factorization of a template -std::enable_if_t::storage_t::memory_space, Kokkos::CudaSpace>::value> +std::enable_if_t::storage_t::memory_space, + Kokkos::CudaSpace>::value> cholesky(KokkosDVector& A) { if (A.map().is_local()) { @@ -56,9 +57,9 @@ cholesky(KokkosDVector& A) /// stores result in RHS, after the call A will contain the cholesky factorization of a template -std::enable_if_t::storage_t::memory_space, Kokkos::CudaSpace>::value> -solve_sym(KokkosDVector& A, - KokkosDVector& RHS) +std::enable_if_t::storage_t::memory_space, + Kokkos::CudaSpace>::value> +solve_sym(KokkosDVector& A, KokkosDVector& RHS) { if (A.map().is_local() && RHS.map().is_local()) { typedef KokkosDVector vector_t; @@ -101,38 +102,30 @@ inner(M0& c, typename M2::storage_t::memory_space>::value, "a,b not on same memory"); // static_assert(std::is_same::value, "matrix layout do not match"); - if (a.map().is_local() && b.map().is_local() && c.map().is_local()) { - if (a.array().stride(0) != 1 || b.array().stride(0) != 1 || c.array().stride(0) != 1) { - throw std::runtime_error("expecting column major layout"); - } - int m = a.map().ncols(); - int k = a.map().nrows(); - int n = b.map().ncols(); - numeric_t* A_ptr = a.array().data(); - numeric_t* B_ptr = b.array().data(); - numeric_t* C_ptr = c.array().data(); + if (a.array().stride(0) != 1 || b.array().stride(0) != 1 || c.array().stride(0) != 1) { + throw std::runtime_error("expecting column major layout"); + } - int lda = a.array().stride(1); - int ldb = b.array().stride(1); - int ldc = c.array().stride(1); + int m = a.map().ncols(); + int k = a.map().nrows(); + int n = b.map().ncols(); + numeric_t* A_ptr = a.array().data(); + numeric_t* B_ptr = b.array().data(); + numeric_t* C_ptr = c.array().data(); - using gemm = cuda::gemm; - gemm::call(gemm::H, gemm::N, m, n, k, alpha, A_ptr, lda, B_ptr, ldb, beta, C_ptr, ldc); + int lda = a.array().stride(1); + int ldb = b.array().stride(1); + int ldc = c.array().stride(1); - } else { - throw std::runtime_error("distributed inner product not implemented."); - } + using gemm = cuda::gemm; + gemm::call(gemm::H, gemm::N, m, n, k, alpha, A_ptr, lda, B_ptr, ldb, beta, C_ptr, ldc); + allreduce(c, a.map().comm()); } /// Inner product c = a^H * b, on GPU -template -std::enable_if_t< - std::is_same::value, - void> +template +std::enable_if_t::value, void> outer(M0& c, const M1& a, const M2& b, @@ -178,11 +171,8 @@ outer(M0& c, /// C <- beta * C + alpha * A @ B template std::enable_if_t::value, void> -transform(M0& C, - typename M0::numeric_t beta, - typename 
M0::numeric_t alpha, - const M1& A, - const M2& B) +transform( + M0& C, typename M0::numeric_t beta, typename M0::numeric_t alpha, const M1& A, const M2& B) { typedef M0 vector0_t; typedef M1 vector1_t; @@ -195,9 +185,8 @@ transform(M0& C, static_assert(std::is_same::value, "a,b not on same memory"); - // static_assert(std::is_same::value, "matrix layout do not match"); - if (A.map().is_local() && B.map().is_local() && C.map().is_local()) { + if (B.map().is_local()) { /* single rank */ int m = A.map().nrows(); int n = B.map().ncols(); @@ -238,25 +227,21 @@ add(M0& C, typename vector1_t::storage_t::memory_space>::value, "c,a not on same memory"); - if (A.map().is_local() && C.map().is_local()) { - /* single rank */ - int m = A.map().nrows(); - int n = C.map().ncols(); - numeric_t* A_ptr = A.array().data(); - numeric_t* C_ptr = C.array().data(); + /* single rank */ + int m = A.map().nrows(); + int n = C.map().ncols(); + numeric_t* A_ptr = A.array().data(); + numeric_t* C_ptr = C.array().data(); - if (A.array().stride(0) != 1 || C.array().stride(0) != 1) { - throw std::runtime_error("expecting column major layout"); - } - // assume there are no strides - int lda = A.array().stride(1); - int ldc = C.array().stride(1); - - using geam = cuda::geam; - geam::call(geam::N, geam::N, m, n, alpha, A_ptr, lda, beta, C_ptr, ldc, C_ptr, ldc); - } else { - throw std::runtime_error("not implemented."); + if (A.array().stride(0) != 1 || C.array().stride(0) != 1) { + throw std::runtime_error("expecting column major layout"); } + // assume there are no strides + int lda = A.array().stride(1); + int ldc = C.array().stride(1); + + using geam = cuda::geam; + geam::call(geam::N, geam::N, m, n, alpha, A_ptr, lda, beta, C_ptr, ldc, C_ptr, ldc); } #endif } // namespace nlcglib diff --git a/src/la/lapack_rocm.hpp b/src/la/lapack_rocm.hpp index a4c6283..eff262a 100644 --- a/src/la/lapack_rocm.hpp +++ b/src/la/lapack_rocm.hpp @@ -4,12 +4,12 @@ // #include #include "la/cblas.hpp" -#include "rocm.hpp" -#include "rocblas.hpp" -#include "rocsolver.hpp" #include "la/dvector.hpp" -#include "la/utils.hpp" #include "la/lapack_cpu.hpp" +#include "la/utils.hpp" +#include "rocblas.hpp" +#include "rocm.hpp" +#include "rocsolver.hpp" #ifdef __NLCGLIB__MAGMA #include "magma.hpp" @@ -28,7 +28,6 @@ eigh(KokkosDVector& U, const KokkosDVector& S) { if (U.map().is_local() && S.map().is_local()) { - deep_copy(U, S); // int n = U.map().nrows(); @@ -46,12 +45,12 @@ eigh(KokkosDVector& U, int n = U_host.map().ncols(); Kokkos::deep_copy(U_host.array(), S_host.array()); LAPACKE_zheevd( - LAPACK_COL_MAJOR, /* matrix layout */ - 'V', /* jobz */ - 'U', /* uplot */ - n, /* matrix size */ + LAPACK_COL_MAJOR, /* matrix layout */ + 'V', /* jobz */ + 'U', /* uplot */ + n, /* matrix size */ reinterpret_cast(U_host.array().data()), /* Complex double */ - lda, /* lda */ + lda, /* lda */ w_host.data() /* eigenvalues */ ); } @@ -60,7 +59,8 @@ eigh(KokkosDVector& U, // zheevd_magma(n, U.array().data(), lda, w_host.data()); Kokkos::deep_copy(w, w_host); deep_copy(U, U_host); - // rocm::heevd(rocblas_evect::rocblas_evect_original, rocblas_fill::rocblas_fill_upper, n, U.array().data(), lda, w.data()); + // rocm::heevd(rocblas_evect::rocblas_evect_original, rocblas_fill::rocblas_fill_upper, n, + // U.array().data(), lda, w.data()); } else { throw std::runtime_error("distributed eigh not implemented"); } @@ -68,7 +68,8 @@ eigh(KokkosDVector& U, /// stores result in RHS, after the call A will contain the cholesky factorization of a template 
-std::enable_if_t::storage_t::memory_space, Kokkos::Experimental::HIPSpace>::value> +std::enable_if_t::storage_t::memory_space, + Kokkos::Experimental::HIPSpace>::value> cholesky(KokkosDVector& A) { if (A.map().is_local()) { @@ -89,9 +90,9 @@ cholesky(KokkosDVector& A) /// stores result in RHS, after the call A will contain the cholesky factorization of a template -std::enable_if_t::storage_t::memory_space, Kokkos::Experimental::HIPSpace>::value> -solve_sym(KokkosDVector& A, - KokkosDVector& RHS) +std::enable_if_t::storage_t::memory_space, + Kokkos::Experimental::HIPSpace>::value> +solve_sym(KokkosDVector& A, KokkosDVector& RHS) { auto A_host = create_mirror_view_and_copy(Kokkos::HostSpace(), A); auto RHS_host = create_mirror_view_and_copy(Kokkos::HostSpace(), RHS); @@ -125,7 +126,9 @@ solve_sym(KokkosDVector& A, /// Inner product c = a^H * b, on GPU template -std::enable_if_t::value, void> +std::enable_if_t< + std::is_same::value, + void> inner(M0& c, const M1& a, const M2& b, @@ -141,7 +144,7 @@ inner(M0& c, typename M2::storage_t::memory_space>::value, "a,b not on same memory"); // static_assert(std::is_same::value, "matrix layout do not match"); - if (a.map().is_local() && b.map().is_local() && c.map().is_local()) { + if (c.map().is_local()) { if (a.array().stride(0) != 1 || b.array().stride(0) != 1 || c.array().stride(0) != 1) { throw std::runtime_error("expecting column major layout"); } @@ -160,18 +163,16 @@ inner(M0& c, auto H = rocblas_operation::rocblas_operation_conjugate_transpose; auto N = rocblas_operation::rocblas_operation_none; rocm::gemm(H, N, m, n, k, alpha, A_ptr, lda, B_ptr, ldb, beta, C_ptr, ldc); + allreduce(c, a.map().comm()); } else { throw std::runtime_error("distributed inner product not implemented."); } } /// Inner product c = a^H * b, on GPU -template +template std::enable_if_t< - std::is_same::value, + std::is_same::value, void> outer(M0& c, const M1& a, @@ -218,12 +219,11 @@ outer(M0& c, /// C <- beta * C + alpha * A @ B template -std::enable_if_t::value, void> -transform(M0& C, - typename M0::numeric_t beta, - typename M0::numeric_t alpha, - const M1& A, - const M2& B) +std::enable_if_t< + std::is_same::value, + void> +transform( + M0& C, typename M0::numeric_t beta, typename M0::numeric_t alpha, const M1& A, const M2& B) { typedef M0 vector0_t; typedef M1 vector1_t; @@ -236,9 +236,7 @@ transform(M0& C, static_assert(std::is_same::value, "a,b not on same memory"); - // static_assert(std::is_same::value, "matrix layout do not match"); - - if (A.map().is_local() && B.map().is_local() && C.map().is_local()) { + if (B.map().is_local()) { /* single rank */ int m = A.map().nrows(); int n = B.map().ncols(); @@ -265,7 +263,9 @@ transform(M0& C, /// add C <- alpha * A + beta * C template -std::enable_if_t::value, void> +std::enable_if_t< + std::is_same::value, + void> add(M0& C, const M1& A, typename M0::numeric_t alpha, @@ -279,29 +279,25 @@ add(M0& C, typename vector1_t::storage_t::memory_space>::value, "c,a not on same memory"); - if (A.map().is_local() && C.map().is_local()) { - /* single rank */ - int m = A.map().nrows(); - int n = C.map().ncols(); - numeric_t* A_ptr = A.array().data(); - numeric_t* C_ptr = C.array().data(); - - if (A.array().stride(0) != 1 || C.array().stride(0) != 1) { - throw std::runtime_error("expecting column major layout"); - } - // assume there are no strides - int lda = A.array().stride(1); - int ldc = C.array().stride(1); + /* single rank */ + int m = A.map().nrows(); + int n = C.map().ncols(); + numeric_t* A_ptr = A.array().data(); + 
numeric_t* C_ptr = C.array().data(); - // using geam = rocm::geam; - auto N = rocblas_operation::rocblas_operation_none; - // rocm::geam(N, N, m, n, alpha, A_ptr, lda, beta, B_ptr, ldb, C, ldc); - rocm::geam(N, N, m, n, alpha, A_ptr, lda, beta, C_ptr, ldc, C_ptr, ldc); - } else { - throw std::runtime_error("not implemented."); + if (A.array().stride(0) != 1 || C.array().stride(0) != 1) { + throw std::runtime_error("expecting column major layout"); } + // assume there are no strides + int lda = A.array().stride(1); + int ldc = C.array().stride(1); + + // using geam = rocm::geam; + auto N = rocblas_operation::rocblas_operation_none; + // rocm::geam(N, N, m, n, alpha, A_ptr, lda, beta, B_ptr, ldb, C, ldc); + rocm::geam(N, N, m, n, alpha, A_ptr, lda, beta, C_ptr, ldc, C_ptr, ldc); } #endif -} // nlcglib +} // namespace nlcglib diff --git a/src/la/magma.cpp b/src/la/magma.cpp index 0c41aad..eff53e5 100644 --- a/src/la/magma.cpp +++ b/src/la/magma.cpp @@ -1,11 +1,11 @@ #ifdef __NLCGLIB__MAGMA +#include "la/magma.hpp" #include #include #include #include #include #include -#include "la/magma.hpp" #define TESTING_CHECK(err) \ do { \ @@ -23,14 +23,16 @@ } while (0) -void nlcg_init_magma() +void +nlcg_init_magma() { TESTING_CHECK(magma_init()); // magma_print_environment(); } -void nlcg_finalize_magma() +void +nlcg_finalize_magma() { TESTING_CHECK(magma_finalize()); } @@ -133,7 +135,9 @@ zpotrs_magma(int n, int nrhs, COMPLEX* dA, int lda, COMPLEX* dB, int ldb) } } -template void zpotrs_magma(int, int, Kokkos::complex*, int, Kokkos::complex*, int); -template void zpotrs_magma(int, int, std::complex*, int, std::complex*, int); +template void +zpotrs_magma(int, int, Kokkos::complex*, int, Kokkos::complex*, int); +template void +zpotrs_magma(int, int, std::complex*, int, std::complex*, int); #endif /*__NLCGLIB__MAGMA*/ diff --git a/src/la/magma.hpp b/src/la/magma.hpp index c86d445..a81fe9b 100644 --- a/src/la/magma.hpp +++ b/src/la/magma.hpp @@ -2,17 +2,22 @@ #ifdef __NLCGLIB__MAGMA #include -void nlcg_init_magma(); -void nlcg_finalize_magma(); +void +nlcg_init_magma(); +void +nlcg_finalize_magma(); -template -void zheevd_magma(int n, COMPLEX* dA, int lda, double* w); +template +void +zheevd_magma(int n, COMPLEX* dA, int lda, double* w); template -void zpotrf_magma(int n, COMPLEX* dA, int lda); +void +zpotrf_magma(int n, COMPLEX* dA, int lda); template -void zpotrs_magma(int n, int nrhs, COMPLEX* dA, int lda, COMPLEX* dB, int ldb); +void +zpotrs_magma(int n, int nrhs, COMPLEX* dA, int lda, COMPLEX* dB, int ldb); #endif /*__NLCGLIB__MAGMA*/ diff --git a/src/la/mvector.hpp b/src/la/mvector.hpp index c34610a..f9afb14 100644 --- a/src/la/mvector.hpp +++ b/src/la/mvector.hpp @@ -1,136 +1,108 @@ #pragma once -#include -#include -#include -#include -#include "mpi/communicator.hpp" -#include "traits.hpp" -#include "helper/funcs.hpp" #include +#include #include -#include #include -#include "nlcglib.hpp" +#include +#include +#include +#include +#include "gpu/acc.hpp" #include "la/dvector.hpp" +#include "mpi/communicator.hpp" +#include "traits.hpp" #include "utils.hpp" -#include "gpu/acc.hpp" namespace nlcglib { -template +template class mvector_base { }; -template -class mvector : public mvector_base, T> { +template +class mvector : public mvector_base, T> +{ static_assert(std::is_same>::value, "must have ownership"); + public: using value_type = T; using key_t = std::pair; using container_t = std::map; -public : - mvector(Communicator comm) : comm_(comm) {} +public: + mvector(Communicator comm) + : comm_(comm) + 
{ + } mvector() = default; mvector(const mvector&) = default; mvector(mvector&&) = default; mvector& operator=(const mvector&) = default; - mvector(const container_t& data) : data_(data) {} - - T& operator[] (key_t k) + mvector(const container_t& data) + : data_(data) { - return data_[k]; } - const T& operator[] (key_t k) const - { - return data_.at(k); - } + T& operator[](key_t k) { return data_[k]; } - T& at(key_t k) - { - return data_.at(k); - } + const T& operator[](key_t k) const { return data_.at(k); } - const T& at(key_t k) const - { - return data_.at(k); - } + T& at(key_t k) { return data_.at(k); } + const T& at(key_t k) const { return data_.at(k); } - auto begin() - { - return data_.begin(); - } + auto begin() { return data_.begin(); } - auto end() - { - return data_.end(); - } + auto end() { return data_.end(); } auto begin() const { return data_.begin(); } auto end() const { return data_.end(); } - auto& data() - { - return this->data_; - } + auto& data() { return this->data_; } - auto& data() const - { - return this->data_; - } + auto& data() const { return this->data_; } - auto find() - { - return data_.find(); - } + auto find() { return data_.find(); } auto size() { return data_.size(); } auto size() const { return data_.size(); } mvector empty_like(); - mvector& operator=(std::map&& data) - { - data_ = std::forward>(data); - } + mvector& operator=(std::map&& data) { data_ = std::forward>(data); } - template + template mvector& operator=(mvector& other) { // iterate over keys and call assign - for(auto& elem : data_) { + for (auto& elem : data_) { auto key = elem.first; data_[key] = eval(other.at(key)); } return *this; } - const Communicator& commk() const - { - return comm_; - } + const Communicator& commk() const { return comm_; } - template - std::enable_if_t::value, mvector> - allgather(Communicator comm = Communicator{MPI_COMM_NULL}) const; + template + std::enable_if_t::value, mvector> allgather(Communicator comm = Communicator{ + MPI_COMM_NULL}) const; - template - std::enable_if_t::value, mvector> - allgather(Communicator comm = Communicator{MPI_COMM_NULL}) const; + template + std::enable_if_t::value, mvector> allgather(Communicator comm = Communicator{ + MPI_COMM_NULL}) const; private: container_t data_; Communicator comm_; }; -template -template +template +template std::enable_if_t::value, mvector> mvector::allgather(Communicator comm) const { @@ -163,8 +135,9 @@ mvector::allgather(Communicator comm) const } -template -std::vector> _allgather(const std::vector& values, const Communicator& comm) +template +std::vector> +_allgather(const std::vector& values, const Communicator& comm) { int nranks = comm.size(); std::vector nelems(nranks); @@ -180,20 +153,20 @@ std::vector> _allgather(const std::vector& values, const C std::vector> result(nranks); for (int i = 0; i < nranks; ++i) { - result[i] = std::vector(sendrecv_buffer.data() + scan[i], sendrecv_buffer.data() + scan[i+1]); + result[i] = + std::vector(sendrecv_buffer.data() + scan[i], sendrecv_buffer.data() + scan[i + 1]); } return result; } /// allgather, copy to host -> communicate -> copy to original memory (if needed) -template -template +template +template std::enable_if_t::value, mvector> mvector::allgather(Communicator comm) const { - if (comm == Communicator{MPI_COMM_NULL}) - comm = comm_; + if (comm == Communicator{MPI_COMM_NULL}) comm = comm_; if (comm < comm_) { throw std::runtime_error("mvector::allgather: most likely gave unintended communicator"); } @@ -213,8 +186,9 @@ mvector::allgather(Communicator 
comm) const // collect total size std::vector local_number_of_elements(data_.size()); - std::transform(data_.begin(), data_.end(), local_number_of_elements.data(), - [](auto& elem) {return elem.second.size();}); + std::transform(data_.begin(), data_.end(), local_number_of_elements.data(), [](auto& elem) { + return elem.second.size(); + }); auto global_number_of_elements = _allgather(local_number_of_elements, comm); // collect the offsets as a list [every mpi rank] of list [every block] @@ -239,30 +213,31 @@ mvector::allgather(Communicator comm) const auto host_view = Kokkos::create_mirror_view(arr); Kokkos::deep_copy(host_view, arr); assert(offsets[rank][i] < send_recv_buffer.size()); - std::copy(host_view.data(), host_view.data() + host_view.size(), + std::copy(host_view.data(), + host_view.data() + host_view.size(), send_recv_buffer.data() + offsets[rank][i]); ++i; } } std::vector recv_counts(comm.size()); - std::transform(global_number_of_elements.begin(), global_number_of_elements.end(), + std::transform(global_number_of_elements.begin(), + global_number_of_elements.end(), recv_counts.begin(), [](auto& in) { return std::accumulate(in.begin(), in.end(), 0); }); comm.allgather(send_recv_buffer.data(), recv_counts); // copy into results and issue memory transfer if needed. mvector result(Communicator{MPI_COMM_SELF}); - for(int rank = 0; rank < nranks; ++rank) { + for (int rank = 0; rank < nranks; ++rank) { for (auto block_id = 0ul; block_id < offsets[rank].size(); ++block_id) { int lsize = global_number_of_elements[rank][block_id]; int offset = offsets[rank][block_id]; Kokkos::View tmp( - Kokkos::view_alloc(Kokkos::WithoutInitializing, ""), - lsize); - std::copy(send_recv_buffer.data() + offset, send_recv_buffer.data() + offset + lsize, tmp.data()); - T dst(Kokkos::view_alloc(Kokkos::WithoutInitializing, ""), - lsize); + Kokkos::view_alloc(Kokkos::WithoutInitializing, ""), lsize); + std::copy( + send_recv_buffer.data() + offset, send_recv_buffer.data() + offset + lsize, tmp.data()); + T dst(Kokkos::view_alloc(Kokkos::WithoutInitializing, ""), lsize); Kokkos::deep_copy(dst, tmp); auto key = global_keys[rank][block_id]; result[key] = dst; @@ -272,9 +247,10 @@ mvector::allgather(Communicator comm) const } -template +template struct make_mmatrix_return_type -{}; +{ +}; template struct make_mmatrix_return_type::value>> @@ -289,10 +265,7 @@ struct make_mmatrix_return_type struct make_mmatrix_return_type::value>> { - using type = KokkosDVector**, - SlabLayoutV, - Kokkos::LayoutLeft, - xspc>; + using type = KokkosDVector**, SlabLayoutV, Kokkos::LayoutLeft, xspc>; using view_type = KokkosDVector**, SlabLayoutV, Kokkos::LayoutStride, @@ -304,11 +277,12 @@ struct make_mmatrix_return_type +template mvector::type> -make_mmatrix(std::shared_ptr matrix_base, std::enable_if_t::value>* _ = nullptr) +make_mmatrix(std::shared_ptr matrix_base, + std::enable_if_t::value>* _ = nullptr) { - static_assert(std::is_same::value, "invalid template parameters"); + static_assert(std::is_same::value, "invalid template parameters"); using memspace = T; typedef typename make_mmatrix_return_type::type matrix_t; mvector mvector(Communicator(matrix_base->mpicomm())); @@ -328,21 +302,20 @@ make_mmatrix(std::shared_ptr matrix_base, std::enable_if_t::accessible) { // make sure is memory type device - if (buffer.memtype != memory_type::host) - throw std::runtime_error("expected host memory"); + if (buffer.memtype != memory_type::host) throw std::runtime_error("expected host memory"); } mvector[kindex] = - matrix_t(Map<>(comm, 
SlabLayoutV({{0, 0, buffer.size[0], buffer.size[1]}})), buffer); - + matrix_t(Map<>(comm, SlabLayoutV({{0, 0, buffer.size[0], buffer.size[1]}})), buffer); } return mvector; } /// copy implementation -template +template mvector::type> -make_mmatrix(std::shared_ptr matrix_base, std::enable_if_t::value>* _ =nullptr) +make_mmatrix(std::shared_ptr matrix_base, + std::enable_if_t::value>* _ = nullptr) { static_assert(!std::is_same::value, "invalid template parameters"); using memspace = T; @@ -364,21 +337,21 @@ make_mmatrix(std::shared_ptr matrix_base, std::enable_if_t::accessible) { // make sure is memory type device - if (buffer.memtype != memory_type::host) - throw std::runtime_error("expected host memory"); + if (buffer.memtype != memory_type::host) throw std::runtime_error("expected host memory"); } // copy view T, using cuda memcpy ... matrix_t mat(Map<>(comm, SlabLayoutV({{0, 0, buffer.size[0], buffer.size[1]}}))); // issue memcpy - acc::copy(mat.array().data(), buffer.data, buffer.size[0]*buffer.size[1]); + acc::copy(mat.array().data(), buffer.data, buffer.size[0] * buffer.size[1]); mvector[kindex] = mat; } return mvector; } -template -auto make_mmvector(std::shared_ptr vector_base) +template +auto +make_mmvector(std::shared_ptr vector_base) { using memspace = T; typedef Kokkos::View vector_t; @@ -389,7 +362,8 @@ auto make_mmvector(std::shared_ptr vector_base) auto buffer = vector_base->get(i); if (buffer.memtype == memory_type::device) { #ifdef __NLCGLIB__CUDA - Kokkos::View src(buffer.data, buffer.size[0]); + Kokkos::View src(buffer.data, + buffer.size[0]); vector_t dst("vector", buffer.size[0]); Kokkos::deep_copy(dst, src); auto kindex = vector_base->kpoint_index(i); @@ -410,11 +384,12 @@ auto make_mmvector(std::shared_ptr vector_base) } -inline auto make_mmscalar(std::shared_ptr scalar_base) +inline auto +make_mmscalar(std::shared_ptr scalar_base) { mvector mvector(Communicator(scalar_base->mpicomm())); int num_vec = scalar_base->size(); - for(int i = 0; i < num_vec; ++i) { + for (int i = 0; i < num_vec; ++i) { auto v = scalar_base->get(i); auto key = scalar_base->kpoint_index(i); mvector[key] = v; @@ -437,7 +412,8 @@ eval_threaded(const mvector& input) template -void execute(const mvector& input) +void +execute(const mvector& input) { for (auto& elem : input) { eval(elem.second); @@ -446,29 +422,23 @@ void execute(const mvector& input) template -auto sum(const Kokkos::View& x) +auto +sum(const Kokkos::View& x) { using view_type = Kokkos::View; - static_assert(view_type::dimension::rank == 1, - "KokkosView"); + static_assert(view_type::dimension::rank == 1, "KokkosView"); auto host_mirror = Kokkos::create_mirror_view(x); Kokkos::deep_copy(host_mirror, x); - return std::accumulate(host_mirror.data(), host_mirror.data()+ host_mirror.size(), 0.0); + return std::accumulate(host_mirror.data(), host_mirror.data() + host_mirror.size(), 0.0); } template -std::enable_if_t>::value, mvector > > -sum(const mvector& x) -{ - return tapply([](auto xi) { return(sum(eval(xi))); }, x); -} - - -template -std::enable_if_t>::value || std::is_same, T>::value, eval_t> +std::enable_if_t>::value || + std::is_same, T>::value, + eval_t> sum(const mvector& x, Communicator comm = Communicator{MPI_COMM_NULL}) { if (comm == Communicator{MPI_COMM_NULL}) comm = x.commk(); @@ -478,22 +448,23 @@ sum(const mvector& x, Communicator comm = Communicator{MPI_COMM_NULL}) } eval_t sum = 0; - for (auto& elem: x) { + for (auto& elem : x) { sum += eval(elem.second); } return comm.allreduce(sum, mpi_op::sum); } -template -auto 
operator*(const mvector& a, const mvector& b) +template +auto +operator*(const mvector& a, const mvector& b) { - return tapply([](auto x, auto y) { return eval(x)*eval(y); }, a, b); + return tapply([](auto x, auto y) { return eval(x) * eval(y); }, a, b); } -template -std::enable_if_t::dimension::rank == 1> +template +std::enable_if_t::dimension::rank == 1> print(const mvector>& vec) { for (auto& elem : vec) { @@ -509,8 +480,9 @@ print(const mvector>& vec) } -struct do_copy { - template +struct do_copy +{ + template to_layout_left_t> operator()(X&& x) { auto copy = empty_like()(x); @@ -521,14 +493,17 @@ struct do_copy { template -auto copy(const mvector& x) +auto +copy(const mvector& x) { return eval_threaded(tapply(do_copy(), x)); } template -auto unzip(const mvector>& V) { +auto +unzip(const mvector>& V) +{ std::tuple...> U; for (auto& elem : V) { @@ -583,7 +558,7 @@ auto unzip(const std::tuple& src, std::tuple...>& dst, const key_t& key) { using tuple_t = std::tuple; - unzip_impl::value-1>::apply(src, dst , key); + unzip_impl::value - 1>::apply(src, dst, key); } @@ -594,8 +569,9 @@ print(const mvector& vec) for (auto& elem : vec) { auto key = elem.first; auto& val = elem.second; - std::cout << "kindex (" << key.first << ", " << key.second << "): " << std::setprecision(10) << val << "\n"; + std::cout << "kindex (" << key.first << ", " << key.second << "): " << std::setprecision(10) + << val << "\n"; } } -} // nlcglib +} // namespace nlcglib diff --git a/src/la/rocblas.hpp b/src/la/rocblas.hpp index 6decd54..e87a763 100644 --- a/src/la/rocblas.hpp +++ b/src/la/rocblas.hpp @@ -28,8 +28,7 @@ namespace nlcglib { namespace rocm { +} // namespace rocm -} // rocm - -} // nlcglib +} // namespace nlcglib diff --git a/src/la/utils.hpp b/src/la/utils.hpp index c37423b..b58d1a4 100644 --- a/src/la/utils.hpp +++ b/src/la/utils.hpp @@ -1,12 +1,12 @@ #pragma once #include +#include #include #include -#include #include -#include #include +#include namespace nlcglib { @@ -54,7 +54,10 @@ struct to_layout_left&&> // const ref template -struct to_layout_left&> : to_layout_left> {}; +struct to_layout_left&> + : to_layout_left> +{ +}; // ref template @@ -119,13 +122,13 @@ Kokkos::View _empty_like(const Kokkos::View& other) { // return Kokkos::View("tmp", other.size()); - auto ret = Kokkos::View(Kokkos::view_alloc(Kokkos::WithoutInitializing, "tmp"), other.size()); + auto ret = Kokkos::View(Kokkos::view_alloc(Kokkos::WithoutInitializing, "tmp"), + other.size()); #ifdef DEBUG // initialize with NAN to throw an error immediately if not overwritten using memspc = typename Kokkos::View::memory_space; - Kokkos::parallel_for(Kokkos::RangePolicy>(0, other.size()), KOKKOS_LAMBDA(int i) { - ret(i) = NAN; - }); + Kokkos::parallel_for( + Kokkos::RangePolicy>(0, other.size()), KOKKOS_LAMBDA(int i) { ret(i) = NAN; }); #endif return ret; } @@ -205,11 +208,6 @@ tapply(FUNCTOR&& fun, const ARG& arg0, const ARGS&... args) return result; } -template -class find_type -{ - using t = typename T::fjsadlfjasf; -}; /// tapply for packed operators template @@ -223,10 +221,9 @@ tapply_op(OP&& op, const ARG0& arg0, const ARGS&... 
args) mvector> result(arg0.commk()); for (auto& elem : arg0) { auto key = elem.first; - // find_type::t; auto get_key = [key](auto container) { return container.at(key); }; auto fun = op.at(key); - result[key] = [=](){ return fun(eval(get_key(arg0)), eval(get_key(args))...); }; + result[key] = [=]() { return fun(eval(get_key(arg0)), eval(get_key(args))...); }; } return result; } diff --git a/src/linesearch/linesearch.hpp b/src/linesearch/linesearch.hpp index 3edc424..23a27ee 100644 --- a/src/linesearch/linesearch.hpp +++ b/src/linesearch/linesearch.hpp @@ -1,16 +1,16 @@ #pragma once -#include "exceptions.hpp" -#include "utils/logger.hpp" #include #include +#include "exceptions.hpp" +#include "utils/logger.hpp" namespace nlcglib { struct line_search_info { - std::string type; // the ls-type used + std::string type; // the ls-type used }; class line_search @@ -34,11 +34,13 @@ class line_search Logger::GetInstance() << "line search t_trial = " << std::scientific << t_trial << "\n"; double F0 = FE.get_F(); try { - return std::tuple_cat(qline(G, FE, slope, force_restart), std::make_tuple(line_search_info{"qline"})); + return std::tuple_cat(qline(G, FE, slope, force_restart), + std::make_tuple(line_search_info{"qline"})); } catch (StepError& step_error) { Logger::GetInstance() << "\t" << "quadratic line search failed -> backtracking search\n"; - return std::tuple_cat(bt_search(G, FE, F0, force_restart), std::make_tuple(line_search_info{"btsearch"})); + return std::tuple_cat(bt_search(G, FE, F0, force_restart), + std::make_tuple(line_search_info{"btsearch"})); } } @@ -56,7 +58,6 @@ template auto line_search::bt_search(GEODESIC& G, FREE_ENERGY& FE, double F0, bool& force_restart) { - if (tau >= 1) { throw std::runtime_error("invalid value"); } @@ -65,19 +66,22 @@ line_search::bt_search(GEODESIC& G, FREE_ENERGY& FE, double F0, bool& force_rest while (t > 1e-8) { auto ek_ul = G(t); double Fp = FE.get_F(); - Logger::GetInstance() << "fd slope: " << std::scientific << std::setprecision(3) << (Fp - F0) / t << " t: " << t - << " F:" << std::fixed << std::setprecision(13) << Fp << "\n"; + Logger::GetInstance() << "fd slope: " << std::scientific << std::setprecision(3) + << (Fp - F0) / t << " t: " << t << " F:" << std::fixed + << std::setprecision(13) << Fp << "\n"; if (Fp < F0) { - Logger::GetInstance() << "fd slope: " << std::scientific << std::setprecision(3) << (Fp - F0)/t << "\n"; + Logger::GetInstance() << "fd slope: " << std::scientific << std::setprecision(3) + << (Fp - F0) / t << "\n"; force_restart = false; return ek_ul; } t *= tau; - Logger::GetInstance() << "\tbacktracking search tau = " << std::scientific << std::setprecision(5) << t << "\n"; + Logger::GetInstance() << "\tbacktracking search tau = " << std::scientific + << std::setprecision(5) << t << "\n"; } // TODO: let logger print state Logger::GetInstance().flush(); - if (force_restart) { + if (force_restart) { throw DescentError(); } else { force_restart = true; @@ -102,7 +106,8 @@ line_search::qline(GEODESIC& G, FREE_ENERGY& FE, double slope, bool& force_resta // G(dt); // double F1 = FE.get_F(); // double fd_slope = (F1-F0)/dt; - // Logger::GetInstance() << "\t DEBUG qline slope = " << std::setprecision(6) << slope << ", fd_slope = " << fd_slope << "\n"; + // Logger::GetInstance() << "\t DEBUG qline slope = " << std::setprecision(6) << slope << ", + // fd_slope = " << fd_slope << "\n"; // } // (END) DEBUG check slope @@ -135,13 +140,12 @@ line_search::qline(GEODESIC& G, FREE_ENERGY& FE, double slope, bool& force_resta // evaluate FE 
at predicted minimum auto ek_ul = G(t_min); double F_min = FE.get_F(); - Logger::GetInstance() << "\t t_min = " << t_min - << " q line prediction error: " << std::scientific << std::setprecision(8) << (F_pred - F_min) - << " dE: " << std::scientific << std::setprecision(8) << (F0 - F_min) << "\n"; + Logger::GetInstance() << "\t t_min = " << t_min << " q line prediction error: " << std::scientific + << std::setprecision(8) << (F_pred - F_min) << " dE: " << std::scientific + << std::setprecision(8) << (F0 - F_min) << "\n"; if (F_min > F0) { - Logger::GetInstance() << std::setprecision(13) - << "\t quadratic line search failed:\n" + Logger::GetInstance() << std::setprecision(13) << "\t quadratic line search failed:\n" << "\t - F_min: " << F_min << "\n" << "\t - F0: " << F0 << "\n\n"; throw StepError(); @@ -153,21 +157,4 @@ line_search::qline(GEODESIC& G, FREE_ENERGY& FE, double slope, bool& force_resta return ek_ul; } - - -// template -// auto -// line_search(GEODESIC&& G_base, FREE_ENERGY&& FE, double slope, double t_trial = 0.2) -// { -// double F0 = FE.get_F(); -// try { -// return qline(G_base, FE, slope, t_trial); -// } catch (StepError& step_error) { -// Logger::GetInstance() << "\t" -// << "quadratic line search failed -> backtracking search\n"; -// return bt_search(G_base, FE, F0, t_trial, 0.1); -// } -// } - - } // namespace nlcglib diff --git a/src/mpi/communicator.hpp b/src/mpi/communicator.hpp index a3e7ff3..ffc3bf3 100644 --- a/src/mpi/communicator.hpp +++ b/src/mpi/communicator.hpp @@ -110,6 +110,10 @@ class Communicator template T allreduce(T val, enum mpi_op op) const; + template + void allreduce(T* buffer, int count, enum mpi_op op) const; + + void barrier() const { CALL_MPI(MPI_Barrier, (mpicomm_)); } ~Communicator() @@ -224,4 +228,25 @@ Communicator::allreduce(T val, enum mpi_op op) const return result; } +template +void +Communicator::allreduce(T* buffer, int count, enum mpi_op op) const +{ + switch (op) { + case mpi_op::sum: { + CALL_MPI(MPI_Allreduce, + (MPI_IN_PLACE, + buffer, + count, + mpi_type::type(), + mpi_op_::value(), + mpicomm_)); + break; + } + default: { + throw std::runtime_error("Error: invalid MPI_Op given."); + } + } +} + } // namespace nlcglib diff --git a/src/mpi/mpi_type.hpp b/src/mpi/mpi_type.hpp index 8fdf03d..6c292db 100644 --- a/src/mpi/mpi_type.hpp +++ b/src/mpi/mpi_type.hpp @@ -1,9 +1,9 @@ #pragma once #include -#include -#include #include +#include +#include namespace nlcglib { @@ -23,9 +23,11 @@ enum class mpi_op }; template -struct mpi_op_{}; +struct mpi_op_ +{ +}; -template<> +template <> struct mpi_op_ { static MPI_Op value() { return MPI_SUM; } @@ -88,12 +90,8 @@ struct mpi_type> int array_of_block_lengths[2] = {1, 1}; MPI_Aint array_of_displacements[2] = {0, sizeof(T1)}; MPI_Datatype types[2] = {mpi_type::type(), mpi_type::type()}; - CALL_MPI( - MPI_Type_create_struct, (2, - array_of_block_lengths, - array_of_displacements, - types, - &result)); + CALL_MPI(MPI_Type_create_struct, + (2, array_of_block_lengths, array_of_displacements, types, &result)); CALL_MPI(MPI_Type_commit, (&result)); return result; } diff --git a/src/mvp2/descent_direction_impl.hpp b/src/mvp2/descent_direction_impl.hpp index 0dade0c..5ff82de 100644 --- a/src/mvp2/descent_direction_impl.hpp +++ b/src/mvp2/descent_direction_impl.hpp @@ -2,10 +2,8 @@ #include #include "la/dvector.hpp" -#include "la/mvector.hpp" #include "mvp2.hpp" #include "pseudo_hamiltonian/grad_eta.hpp" -#include "utils/logger.hpp" namespace nlcglib { diff --git a/src/nlcglib.cpp b/src/nlcglib.cpp index 
fb1e9cf..9a6cfa9 100644 --- a/src/nlcglib.cpp +++ b/src/nlcglib.cpp @@ -1,31 +1,20 @@ #include -// #include -#include -#include #include #include #include #include -#include "exec_space.hpp" #include "free_energy.hpp" #include "geodesic.hpp" #include "interface.hpp" -#include "la/dvector.hpp" #include "la/lapack.hpp" -#include "la/layout.hpp" -#include "la/magma.hpp" -#include "la/map.hpp" #include "la/mvector.hpp" #include "la/utils.hpp" #include "linesearch/linesearch.hpp" +#include "mpi/communicator.hpp" #include "mvp2/descent_direction.hpp" #include "overlap.hpp" -#include "preconditioner.hpp" -#include "pseudo_hamiltonian/grad_eta.hpp" #include "smearing.hpp" -#include "traits.hpp" #include "ultrasoft_precond.hpp" -#include "utils/format.hpp" #include "utils/logger.hpp" #include "utils/step_logger.hpp" #include "utils/timer.hpp" @@ -37,16 +26,8 @@ namespace nlcglib { void initialize() { -#if KOKKOS_VERSION < 30700 - Kokkos::InitArguments args; - args.disable_warnings = true; -#ifdef USE_OPENMP - args.num_threads = omp_get_max_threads(); -#endif /* endif USE_OPENMP */ -#else /* KOKKOS_VERSION >= 3.7.00 */ Kokkos::InitializationSettings args; args.set_disable_warnings(true); -#endif /* endif KOKKOS VERSION */ #ifdef USE_OPENMP args.num_threads = omp_get_max_threads(); #endif @@ -228,6 +209,8 @@ nlcg_us(EnergyBase& energy_base, // ~FE_UNDERFLOW); // Enable all floating point exceptions but FE_INEXACT nlcg_info info; + Communicator comm_world(energy_base.comm_world()); + auto S = Overlap(overlap_base); auto P = USPreconditioner(us_precond_base); @@ -296,6 +279,7 @@ nlcg_us(EnergyBase& energy_base, auto eta = eval_threaded(tapply(make_diag(), ek)); auto slope_zx_zeta = dd.restarted(xspace(), X, ek, fn, Hx, wk, mu, S, P, free_energy); double slope = std::get<0>(slope_zx_zeta); + auto z_x = std::get<1>(slope_zx_zeta); auto z_eta = std::get<2>(slope_zx_zeta); // allocate rotation matrices @@ -323,7 +307,7 @@ nlcg_us(EnergyBase& energy_base, ek, fn, free_energy.ks_energy_components(), - commk, + comm_world, cg_iter); free_energy.ehandle().print_info(); // print magnetization @@ -363,7 +347,7 @@ nlcg_us(EnergyBase& energy_base, ek, fn, free_energy.ks_energy_components(), - commk, + comm_world, cg_iter); timer.start(); diff --git a/src/operator.hpp b/src/operator.hpp index e7807ff..65007dd 100644 --- a/src/operator.hpp +++ b/src/operator.hpp @@ -1,7 +1,6 @@ #pragma once #include "la/dvector.hpp" -#include "la/mvector.hpp" namespace nlcglib { diff --git a/src/overlap.hpp b/src/overlap.hpp index c63e9a6..2928584 100644 --- a/src/overlap.hpp +++ b/src/overlap.hpp @@ -1,11 +1,8 @@ #pragma once -#include #include "interface.hpp" -#include "la/mvector.hpp" -#include "la/dvector.hpp" -#include "operator.hpp" #include "mpi/communicator.hpp" +#include "operator.hpp" namespace nlcglib { @@ -27,14 +24,18 @@ class Overlap auto at(const key_t& key) const -> value_type; - auto begin() { return local::op_iterator (overlap_base.get_keys(), *this, false); } + auto begin() { return local::op_iterator(overlap_base.get_keys(), *this, false); } auto end() { return local::op_iterator(overlap_base.get_keys(), *this, true); } - auto begin() const { return local::op_iterator(overlap_base.get_keys(), *this, false); } - auto end() const { return local::op_iterator(overlap_base.get_keys(), *this, true); } - - Communicator commk() const { - throw std::runtime_error("not implemented"); + auto begin() const + { + return local::op_iterator(overlap_base.get_keys(), *this, false); } + auto end() const + { + return 
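
A caveat on the initialize() hunk above: with the legacy KOKKOS_VERSION branch removed, the surviving `args.num_threads = omp_get_max_threads();` appears to assign to a field that `Kokkos::InitializationSettings` does not expose as a public member; the post-3.7 settings object is configured through setters. A corrected sketch of the intended initialization (assuming the USE_OPENMP build option):

    #include <omp.h>
    #include <Kokkos_Core.hpp>

    void init_kokkos()
    {
      Kokkos::InitializationSettings args;
      args.set_disable_warnings(true);
    #ifdef USE_OPENMP
      args.set_num_threads(omp_get_max_threads());  // setter instead of member assignment
    #endif
      Kokkos::initialize(args);
    }
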
local::op_iterator(overlap_base.get_keys(), *this, true); + } + + Communicator commk() const { throw std::runtime_error("not implemented"); } private: diff --git a/src/preconditioner.hpp b/src/preconditioner.hpp index 2966701..7bffd94 100644 --- a/src/preconditioner.hpp +++ b/src/preconditioner.hpp @@ -2,10 +2,10 @@ #include +#include "exec_space.hpp" #include "la/dvector.hpp" -#include "la/mvector.hpp" #include "la/lapack.hpp" -#include "exec_space.hpp" +#include "la/mvector.hpp" namespace nlcglib { @@ -19,14 +19,11 @@ class diagonal_preconditioner public: diagonal_preconditioner(const view_t& entries) : entries(entries) - {} - - template - static void apply(M1& dst, - const M2& src, - const Kokkos::View& entries) + { + } + + template + static void apply(M1& dst, const M2& src, const Kokkos::View& entries) { typedef Kokkos::MDRangePolicy, exec_t> mdrange_policy; int m = dst.array().extent(0); @@ -40,7 +37,7 @@ class diagonal_preconditioner }); } - template + template auto operator()(const X& x) { auto y = empty_like()(x); @@ -48,7 +45,7 @@ class diagonal_preconditioner return y; } - template + template void apply_in_place(X& x) { diagonal_preconditioner::apply(x, x, entries); @@ -65,11 +62,13 @@ class diagonal_preconditioner * http://dx.doi.org/10.1103/RevModPhys.64.1045 */ template -class PreconditionerTeter //: public mvector_base, diagonal_preconditioner> +class PreconditionerTeter //: public mvector_base, + //: diagonal_preconditioner> { public: using memspace = SPACE; using value_type = diagonal_preconditioner; + public: PreconditionerTeter(std::shared_ptr ekin) { @@ -94,13 +93,13 @@ class PreconditionerTeter //: public mvector_base, di } - template + template auto operator[](const key_t& key) const { return diagonal_preconditioner(kinetic_diag_precond.at(key)); } - template + template auto at(const key_t& key) const { return this->operator[](key); diff --git a/src/pseudo_hamiltonian/grad_eta.hpp b/src/pseudo_hamiltonian/grad_eta.hpp index 3e07e62..c7cd556 100644 --- a/src/pseudo_hamiltonian/grad_eta.hpp +++ b/src/pseudo_hamiltonian/grad_eta.hpp @@ -39,7 +39,7 @@ struct GradEtaHelper "dFdmu", Kokkos::RangePolicy(0, nbands), KOKKOS_LAMBDA(int i, Kokkos::complex& result) { - double delta = smearing::delta((en_loc(i) - mu) / kT, mo); + double delta = smearing::delta((mu - en_loc(i)) / kT, mo); result += (hii(i) - en_loc(i)) * (delta); }, v); @@ -51,7 +51,7 @@ struct GradEtaHelper /** w_k * delta((μ - e_{n,k'})/kT) summed over all k-points \f[ - \sum_{n, k'} w_{k'} f_n (mo-f_n) + \sum_{n, k'} w_{k'} delta( (μ - e_{n,k'}) / kT ) \f] \$mo\$ is 1 for spin-polarized calculations and 2 otherwise. 
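
The argument flip in smearing::delta above (and in the remaining hunks of this file) fixes the orientation of the smearing variable to x = (mu - e_n)/kT. For kernels that are even in x (Gaussian, Fermi-Dirac) the flip changes nothing; the cold-smearing kernel further down in this patch is asymmetric, so there the orientation matters. Schematically (smearing_t stands for one of the smearing structs):

    template <class smearing_t>
    double occupation_delta(double e_n, double mu, double kT, double mo)
    {
      // new convention: the argument is positive for states below the chemical potential
      return smearing_t::delta((mu - e_n) / kT, mo);
    }
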
*/ @@ -72,7 +72,7 @@ struct GradEtaHelper int nbands = en[key].size(); auto en_loc = en[key]; for (int i = 0; i < nbands; ++i) { - double delta = smearing::delta((en_loc(i) - mu) / kT, mo); + double delta = smearing::delta((mu - en_loc(i)) / kT, mo); v += delta * w_k; } } @@ -151,7 +151,7 @@ class GradEta double kT_loc = kT; Kokkos::parallel_for( "gEta (1)", Kokkos::RangePolicy(0, nbands), KOKKOS_LAMBDA(int i) { - double delta = smearing::delta((ek(i) - mu) / kT_loc, mo); + double delta = smearing::delta((mu - ek(i)) / kT_loc, mo); mgETA(i, i) = -1 / kT_loc * (mHij(i, i) - wk * ek(i)) * (delta); }); @@ -161,7 +161,7 @@ class GradEta Kokkos::parallel_for( "gEta (2)", Kokkos::RangePolicy(0, nbands), KOKKOS_LAMBDA(int i) { // sumfn dmuFn - double delta = smearing::delta((ek(i) - mu) / kT_loc, mo); + double delta = smearing::delta((mu - ek(i)) / kT_loc, mo); mgETA(i, i) += wk * (delta) / dmu_deta * (dFdmu / kT_loc); }); } @@ -196,7 +196,6 @@ class GradEta const mvector& ek, const mvector& wk) { - // delta_eta = kappa * (hij - diag(ek)) return tapply_async(_delta_eta(kappa), Hij, ek, wk); } diff --git a/src/smearing.hpp b/src/smearing.hpp index 40ed9f3..4876be1 100644 --- a/src/smearing.hpp +++ b/src/smearing.hpp @@ -3,7 +3,6 @@ #include #include #include -#include #include "constants.hpp" #include "dft/newton_minimization_smearing.hpp" #include "interface.hpp" @@ -11,7 +10,6 @@ #include "la/utils.hpp" #include "utils/env.hpp" #include "utils/logger.hpp" -#include "utils/timer.hpp" namespace nlcglib { @@ -244,9 +242,8 @@ struct cold_smearing : summed, non_monotonous if (x < -8) return 0; if (x > 10) return 0; double sqrt2 = std::sqrt(2.0); - double z = (x - 1/sqrt2); - return mo * std::exp(-z*z) * (sqrt2 - 6 * x + 2 * sqrt2 * x * x) / - std::sqrt(constants::pi); + double z = (x - 1 / sqrt2); + return mo * std::exp(-z * z) * (sqrt2 - 6 * x + 2 * sqrt2 * x * x) / std::sqrt(constants::pi); } KOKKOS_INLINE_FUNCTION static double entropy(double x, double mo) @@ -255,8 +252,8 @@ struct cold_smearing : summed, non_monotonous if (x > 10) return 0; double sqrtpi = std::sqrt(constants::pi); double sqrt2 = std::sqrt(2.0); - double z = (x - 1/sqrt2); - return mo * std::exp(-z*z) * (1 - sqrt2 * x) / 2 / sqrtpi; + double z = (x - 1 / sqrt2); + return mo * std::exp(-z * z) * (1 - sqrt2 * x) / 2 / sqrtpi; } }; diff --git a/src/traits.hpp b/src/traits.hpp index 85d0866..1d899fe 100644 --- a/src/traits.hpp +++ b/src/traits.hpp @@ -1,10 +1,10 @@ #pragma once #include -#include #include -#include #include +#include +#include namespace nlcglib { @@ -36,7 +36,8 @@ struct is_kokkos_view : _is_kokkos_view struct _is_future : std::false_type -{}; +{ +}; template struct _is_future> : std::true_type @@ -79,14 +80,16 @@ class mvector; template constexpr auto&& -eval(X&& x, std::enable_if_t<(!is_callable::value && !is_future::value) || is_kokkos_view::value> * = nullptr) +eval(X&& x, + std::enable_if_t<(!is_callable::value && !is_future::value) || + is_kokkos_view::value>* = nullptr) { return std::forward(x); } template constexpr auto -eval(X &&x, std::enable_if_t::value && !is_kokkos_view::value> * = nullptr) +eval(X&& x, std::enable_if_t::value && !is_kokkos_view::value>* = nullptr) { return x(); } @@ -105,8 +108,9 @@ eval(const std::shared_future& x) return x.get(); } -template -struct eval_type { +template +struct eval_type +{ using type = std::remove_reference_t()))>; }; @@ -114,13 +118,13 @@ template using eval_t = typename eval_type::type; -template +template struct result_of { using type = 
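
For reference, the reformatted cold-smearing kernels above implement, with z = x - 1/sqrt(2),

\[
\delta(x) = \frac{m_o \, e^{-z^2} \left(\sqrt{2} - 6x + 2\sqrt{2}\,x^2\right)}{\sqrt{\pi}},
\qquad
S(x) = \frac{m_o \, e^{-z^2} \left(1 - \sqrt{2}\,x\right)}{2\sqrt{\pi}},
\]

both cut off to zero outside -8 < x < 10, exactly as in the code.
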
decltype(eval(std::declval())); }; -template +template struct result_of> { using type = typename result_of::type; @@ -132,9 +136,11 @@ using result_of_t = typename result_of::type; /// add const if argument is true template -struct conditional_add_const {}; +struct conditional_add_const +{ +}; -template +template struct conditional_add_const { using type = typename std::add_const_t; @@ -146,7 +152,7 @@ struct conditional_add_const using type = T; }; -template +template using conditional_add_const_t = typename conditional_add_const::type; diff --git a/src/ultrasoft_precond.hpp b/src/ultrasoft_precond.hpp index c1086e2..cac78d1 100644 --- a/src/ultrasoft_precond.hpp +++ b/src/ultrasoft_precond.hpp @@ -10,7 +10,6 @@ namespace nlcglib { /// Wrapper for overlap operation computed by sirius, behaves like mvector in an expression. class USPreconditioner { - public: // TODO: rename to k_index using key_t = std::pair; @@ -25,10 +24,22 @@ class USPreconditioner auto at(const key_t& key) const; - auto begin() { return local::op_iterator (us_precond_base.get_keys(), *this, false); } - auto end() { return local::op_iterator(us_precond_base.get_keys(), *this, true); } - auto begin() const { return local::op_iterator(us_precond_base.get_keys(), *this, false); } - auto end() const { return local::op_iterator(us_precond_base.get_keys(), *this, true); } + auto begin() + { + return local::op_iterator(us_precond_base.get_keys(), *this, false); + } + auto end() + { + return local::op_iterator(us_precond_base.get_keys(), *this, true); + } + auto begin() const + { + return local::op_iterator(us_precond_base.get_keys(), *this, false); + } + auto end() const + { + return local::op_iterator(us_precond_base.get_keys(), *this, true); + } private: const UltrasoftPrecondBase& us_precond_base; @@ -41,4 +52,4 @@ USPreconditioner::at(const key_t& key) const } -} // nlcglib +} // namespace nlcglib diff --git a/src/utils/format.hpp b/src/utils/format.hpp index b14e9d6..5df6491 100644 --- a/src/utils/format.hpp +++ b/src/utils/format.hpp @@ -1,12 +1,13 @@ #pragma once -#include #include +#include namespace nlcglib { -template -std::string format(std::string format_string, ARGS&&... args) +template +std::string +format(std::string format_string, ARGS&&... args) { char buf[format_string.size()]; std::sprintf(buf, format_string.c_str(), args...); @@ -14,4 +15,4 @@ std::string format(std::string format_string, ARGS&&... 
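
The eval() overload set reformatted above normalizes lazy and eager values: callables are invoked, futures are get(), and everything else (including Kokkos views, which are excluded from the callable path explicitly) is forwarded unchanged. A small sketch of the three paths:

    #include <future>
    #include "traits.hpp"

    int eval_paths()
    {
      auto lazy = [] { return 42; };  // callable -> invoked
      int plain = 7;                  // plain value -> forwarded as-is
      std::shared_future<int> fut =
          std::async(std::launch::deferred, [] { return 1; }).share();

      int a = nlcglib::eval(lazy);   // 42
      int b = nlcglib::eval(plain);  // 7
      int c = nlcglib::eval(fut);    // 1, via .get()
      return a + b + c;              // 50
    }
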
args) return std::string(buf); } -} // nlcglib +} // namespace nlcglib diff --git a/src/utils/logger.hpp b/src/utils/logger.hpp index 4f581d7..b491ea0 100644 --- a/src/utils/logger.hpp +++ b/src/utils/logger.hpp @@ -2,8 +2,8 @@ #include #include -#include #include +#include #include #include #include @@ -13,14 +13,14 @@ namespace nlcglib { -const static struct to_stdout_trigger {} TO_STDOUT; +const static struct to_stdout_trigger +{ +} TO_STDOUT; class Logger : public CSingleton { public: - Logger() { - MPI_Comm_rank(MPI_COMM_WORLD, &pid_); - } + Logger() { MPI_Comm_rank(MPI_COMM_WORLD, &pid_); } Logger(Logger&&) = default; @@ -34,8 +34,7 @@ class Logger : public CSingleton void attach_file_master(const std::string& fname = "nlcg.out") { MPI_Comm_rank(MPI_COMM_WORLD, &pid_); - if (pid_ == 0) - stream_ptr_ = std::make_shared(fname); + if (pid_ == 0) stream_ptr_ = std::make_shared(fname); } template @@ -87,7 +86,7 @@ class Logger : public CSingleton void flush() { - if(stream_ptr_) { + if (stream_ptr_) { std::mutex mutex; std::lock_guard lock(mutex); auto& out = *(stream_ptr_.get()); diff --git a/src/utils/step_logger.hpp b/src/utils/step_logger.hpp index daf99d0..8edf179 100644 --- a/src/utils/step_logger.hpp +++ b/src/utils/step_logger.hpp @@ -1,12 +1,11 @@ #pragma once -#include #include -#include #include +#include +#include #include #include -#include #include "la/mvector.hpp" namespace Kokkos { @@ -26,25 +25,28 @@ namespace nlcglib { class StepLogger { public: - StepLogger(int i, std::string fname = "nlcg.json", bool active=true) - : i(i), fname(fname), active(active) + StepLogger(int i, std::string fname = "nlcg.json", bool active = true) + : i(i) + , fname(fname) + , active(active) { dict["type"] = "cg_iteration"; dict["step"] = i; } - template - std::enable_if_t>::value> log(const std::string& key, X&& x); + template + std::enable_if_t>::value> log(const std::string& key, + X&& x); - template + template void log(const std::string& key, const std::map& v); - template + template void log(const std::string& key, const mvector& x); ~StepLogger() { - if(active) { + if (active) { std::ofstream fout(std::string("nlcg") + ".json", std::ios_base::app); fout << dict; fout.flush(); @@ -62,14 +64,15 @@ template std::enable_if_t>::value> StepLogger::log(const std::string& key, X&& x) { - if(!active) return; + if (!active) return; dict[key] = x; } -template -void StepLogger::log(const std::string& key, const std::map& v) +template +void +StepLogger::log(const std::string& key, const std::map& v) { - if(!active) return; + if (!active) return; dict[key] = v; } @@ -77,7 +80,7 @@ template void StepLogger::log(const std::string& key, const mvector& x) { - if(!active) return; + if (!active) return; // assuming V is a 1-d kokkos array for (auto& elem : x) { auto x_key = elem.first; diff --git a/src/utils/timer.hpp b/src/utils/timer.hpp index 513dd21..8bf2d37 100644 --- a/src/utils/timer.hpp +++ b/src/utils/timer.hpp @@ -7,8 +7,10 @@ namespace nlcglib { -template -struct duration_string {}; +template +struct duration_string +{ +}; template <> struct duration_string @@ -56,4 +58,4 @@ Timer::stop() return std::chrono::duration_cast>(now - this->t).count(); } -} // nlcglib +} // namespace nlcglib diff --git a/test/test.cpp b/test/test.cpp index b910a41..48cf9a9 100644 --- a/test/test.cpp +++ b/test/test.cpp @@ -13,11 +13,13 @@ using namespace nlcglib; typedef std::complex complex_double; -void run() { - KokkosDVector X( +void +run() +{ + KokkosDVector X( Map<>(Communicator(), SlabLayoutV({{0, 0, 200, 
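
A caution on the format() helper finalized above: it sizes its stack buffer from the format string itself, so any expansion longer than the format string overruns the buffer (and the variable-length array is non-standard C++). A bounds-safe variant with the same interface (illustrative name, not part of the patch):

    #include <cstdio>
    #include <string>
    #include <vector>

    template <typename... ARGS>
    std::string
    format_checked(const std::string& fmt, ARGS&&... args)
    {
      int n = std::snprintf(nullptr, 0, fmt.c_str(), args...);  // measuring pass
      if (n < 0) return {};
      std::vector<char> buf(n + 1);
      std::snprintf(buf.data(), buf.size(), fmt.c_str(), args...);
      return std::string(buf.data(), n);
    }
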
20}}))); - KokkosDVector H( + KokkosDVector H( Map<>(Communicator(), SlabLayoutV({{0, 0, 20, 20}}))); auto kokkos_array = X.array(); @@ -33,11 +35,10 @@ run_unmanaged() KokkosDVector X( Map<>(Communicator(), SlabLayoutV({{0, 0, 200, 20}}))); - std::vector yptr(200*20); + std::vector yptr(200 * 20); KokkosDVector Y( Map<>(Communicator(), SlabLayoutV({{0, 0, 200, 20}})), - buffer_protocol({1, 200}, {200, 20}, yptr.data(), memory_type::host) - ); + buffer_protocol({1, 200}, {200, 20}, yptr.data(), memory_type::host)); KokkosDVector H( Map<>(Communicator(), SlabLayoutV({{0, 0, 20, 20}}))); @@ -48,7 +49,7 @@ run_unmanaged() auto S = H.copy(); - Kokkos::View eigvals("eigvals", 20); + Kokkos::View eigvals("eigvals", 20); eigh(H, eigvals, S); } @@ -61,10 +62,10 @@ run_unmanaged_cuda() complex_double yptr[200 * 20]; KokkosDVector + SlabLayoutV, + Kokkos::LayoutStride, + Kokkos::CudaSpace, + Kokkos::MemoryUnmanaged> Y(Map<>(Communicator(), SlabLayoutV({{0, 0, 200, 20}})), buffer_protocol({1, 200}, {200, 20}, yptr, memory_type::host)); @@ -77,10 +78,9 @@ run_unmanaged_cuda() auto S = H.copy(); - Kokkos::View eigvals("eigvals", 20); + Kokkos::View eigvals("eigvals", 20); eigh(S, eigvals, H); - } #endif @@ -102,7 +102,8 @@ run_stacked_vector() } -int main(int argc, char *argv[]) +int +main(int argc, char *argv[]) { Kokkos::initialize(); Communicator::init(argc, argv); @@ -117,9 +118,9 @@ int main(int argc, char *argv[]) run(); - #ifdef __NLCGLIB__CUDA +#ifdef __NLCGLIB__CUDA run_unmanaged_cuda(); - #endif +#endif Communicator::finalize(); Kokkos::finalize(); diff --git a/test/test3.cpp b/test/test3.cpp index a53358a..ab60f1a 100644 --- a/test/test3.cpp +++ b/test/test3.cpp @@ -1,15 +1,15 @@ #include #include #include -#include "smearing.hpp" #include "la/mvector.hpp" +#include "smearing.hpp" auto run() { int n = 10; - using vector_t = Kokkos::View; + using vector_t = Kokkos::View; vector_t a_view("test", n); for (int i = 0; i < n; ++i) { a_view(i) = i; diff --git a/test/test_buffer.cpp b/test/test_buffer.cpp index 8351f65..8dd9e8a 100644 --- a/test/test_buffer.cpp +++ b/test/test_buffer.cpp @@ -16,7 +16,8 @@ using typeX = KokkosDVector X( Map<>(Communicator(), SlabLayoutV({{0, 0, 200, 20}}))); diff --git a/test/test_deep_copy.cpp b/test/test_deep_copy.cpp index b403ed4..61264aa 100644 --- a/test/test_deep_copy.cpp +++ b/test/test_deep_copy.cpp @@ -4,39 +4,37 @@ #include "traits.hpp" #ifdef __NLCGLIB__CUDA -void run() +void +run() { int n = 10; double arr[10]; - Kokkos::View - c(arr, n); - Kokkos::View - a("A", n); - Kokkos::View - a_device("A", n); - Kokkos::View - b("b", n); + Kokkos::View c(arr, n); + Kokkos::View a("A", n); + Kokkos::View a_device("A", n); + Kokkos::View b("b", n); for (int i = 0; i < n; ++i) { a(i) = i; } - Kokkos::parallel_for(Kokkos::RangePolicy(0, n), KOKKOS_LAMBDA(int i) { - a_device[i] = i; - }); + Kokkos::parallel_for( + Kokkos::RangePolicy(0, n), KOKKOS_LAMBDA(int i) { a_device[i] = i; }); Kokkos::deep_copy(b, a); Kokkos::deep_copy(c, a_device); std::cout << "a.data: " << a.data() << "\n"; std::cout << "b.data: " << b.data() << "\n"; - std::cout << "b" << "\n"; + std::cout << "b" + << "\n"; for (int i = 0; i < n; ++i) { std::cout << b(i) << ", "; } std::cout << "\n"; - std::cout << "c (should be same as b)" << "\n"; + std::cout << "c (should be same as b)" + << "\n"; for (int i = 0; i < n; ++i) { std::cout << c(i) << ", "; } @@ -48,7 +46,7 @@ void run_2d() { int n = 3; - double arr[n*n]; + double arr[n * n]; Kokkos::View c(arr, n, n); Kokkos::View a("A", n, n); diff --git 
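
test_deep_copy.cpp above exercises the host/device transfer pattern used throughout the library; stripped to its core it is (CUDA build assumed, as in the test's guard):

    #include <Kokkos_Core.hpp>

    void roundtrip()
    {
      const int n = 10;
      Kokkos::View<double*, Kokkos::CudaSpace> a_device("A", n);
      Kokkos::parallel_for(
          Kokkos::RangePolicy<Kokkos::Cuda>(0, n), KOKKOS_LAMBDA(int i) { a_device(i) = i; });
      Kokkos::View<double*, Kokkos::HostSpace> b("b", n);
      Kokkos::deep_copy(b, a_device);  // device -> host; afterwards b(i) == i
    }
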
a/test/test_dvector_mirror_view_and_copy.cpp b/test/test_dvector_mirror_view_and_copy.cpp index 315d5cc..922f09f 100644 --- a/test/test_dvector_mirror_view_and_copy.cpp +++ b/test/test_dvector_mirror_view_and_copy.cpp @@ -15,8 +15,10 @@ using namespace nlcglib; typedef Kokkos::complex complex_double; -template -struct print_type {}; +template +struct print_type +{ +}; void run() @@ -29,13 +31,10 @@ run() auto Y = create_mirror_view_and_copy(device_space_t(), X); - if (Y.array().data() == X.array().data()) - { + if (Y.array().data() == X.array().data()) { std::cout << "create_mirror_view_and_copy(SPACE, KokkosDVectors) works" << "\n"; - } - else - { + } else { std::cout << "create_mirror_view_and_copy(SPACE, KokkosDVectors) broken" << "\n"; } @@ -58,7 +57,7 @@ run_copy_to_host() void test2() { - using matrix_t = Kokkos::View; + using matrix_t = Kokkos::View; int n = 10; matrix_t A("foo", n, n); @@ -69,8 +68,9 @@ test2() std::cout << "A.ptr = " << A.data() << "\n"; std::cout << "B.ptr = " << B.data() << "\n"; - if ( A.data() != B.data()) { - std::cout << "A and B are _distinct_ arrays!" << "\n"; + if (A.data() != B.data()) { + std::cout << "A and B are _distinct_ arrays!" + << "\n"; } auto C = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A); diff --git a/test/test_gpu.cpp b/test/test_gpu.cpp index b73c43b..417c84b 100644 --- a/test/test_gpu.cpp +++ b/test/test_gpu.cpp @@ -1,6 +1,6 @@ +#include #include "la/dvector.hpp" #include "la/lapack.hpp" -#include #include #include @@ -29,7 +29,9 @@ std::mt19937 gen(0); typedef Kokkos::complex complex_double; -void run() { +void +run() +{ int n = 4000; int m = 400; @@ -51,8 +53,6 @@ void run() { Kokkos::deep_copy(X.array(), host); - - // eigh(X); auto H = inner_()(X, X); auto HH = empty_like()(H); // auto Z = transform_alloc(X, H, 1.0); @@ -63,10 +63,10 @@ void run() { // Kokkos::parallel_for("init_H", // mdrange_policy({0, 0}, {m, m}), // MDFunctor(M)); - } -int main(int argc, char *argv[]) +int +main(int argc, char *argv[]) { Kokkos::initialize(); Communicator::init(argc, argv); diff --git a/test/test_kokkos.cpp b/test/test_kokkos.cpp index b3c9797..c73b8a1 100644 --- a/test/test_kokkos.cpp +++ b/test/test_kokkos.cpp @@ -1,24 +1,25 @@ +#include #include -#include "hip/hip_space.hpp" #include -#include +#include "hip/hip_space.hpp" /// to have exp available on device #include -auto unmanaged() +auto +unmanaged() { std::cout << "\nunmanaged\n"; int n = 10; - double* A = new double[n*n]; + double* A = new double[n * n]; for (int i = 0; i < n; ++i) { for (int j = 0; j < n; ++j) { - A[n*i + j] = n * i + j; + A[n * i + j] = n * i + j; } } - Kokkos::View > a_view( + Kokkos::View> a_view( A, n, n); for (int i = 0; i < n; ++i) { @@ -32,21 +33,25 @@ auto unmanaged() } -auto unmanaged_strided() +auto +unmanaged_strided() { std::cout << "\nunmanaged_strided\n"; - int n = 10; // rows - int m = 10; // cols + int n = 10; // rows + int m = 10; // cols int lda = 12; double* A; - posix_memalign(reinterpret_cast(&A), 256, lda*m*sizeof(double)); + posix_memalign(reinterpret_cast(&A), 256, lda * m * sizeof(double)); for (int i = 0; i < n; ++i) { for (int j = 0; j < m; ++j) { - A[j*lda + i] = i*n + j; + A[j * lda + i] = i * n + j; } } - typedef Kokkos::View> + typedef Kokkos::View> vector_t; vector_t a_view(A, Kokkos::LayoutStride(n, 1, m, lda)); @@ -60,27 +65,26 @@ auto unmanaged_strided() } - -template -void kokkos_reduction() +template +void +kokkos_reduction() { using space = Kokkos::HostSpace; using exec_space = Kokkos::Serial; // using space = 
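
The mirror test above relies on a create_mirror_view_and_copy property worth spelling out: when source and destination live in the same memory space it allocates nothing and returns an alias of the input; only across spaces does it allocate and deep-copy (Kokkos::create_mirror, by contrast, always allocates). Sketch:

    #include <Kokkos_Core.hpp>
    #include <cassert>

    void mirror_aliases()
    {
      Kokkos::View<double**, Kokkos::HostSpace> A("foo", 10, 10);
      auto B = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A);
      assert(A.data() == B.data());  // same space: B aliases A, no copy made
    }
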
Kokkos::CudaSpace; // using exec_space = Kokkos::Cuda; - int n = 100; + int n = 100; Kokkos::View view("", n); - Kokkos::parallel_for("foo", Kokkos::RangePolicy(0, n), - KOKKOS_LAMBDA (int i) - { - view(i) = i; - } - ); + Kokkos::parallel_for( + "foo", Kokkos::RangePolicy(0, n), KOKKOS_LAMBDA(int i) { view(i) = i; }); numeric_t sum = 0; - Kokkos::parallel_reduce("summation", Kokkos::RangePolicy(0, view.size()), - KOKKOS_LAMBDA (int i, numeric_t& loc_sum) { loc_sum += view(i); }, sum); + Kokkos::parallel_reduce( + "summation", + Kokkos::RangePolicy(0, view.size()), + KOKKOS_LAMBDA(int i, numeric_t& loc_sum) { loc_sum += view(i); }, + sum); std::cout << "Sum is: " << sum << "\n"; } @@ -88,10 +92,10 @@ template void kokkos_reduction_device() { - #ifdef __NLCGLIB__CUDA +#ifdef __NLCGLIB__CUDA using space = Kokkos::CudaSpace; using exec_space = Kokkos::Cuda; - #elif defined __NLCGLIB__ROCM +#elif defined __NLCGLIB__ROCM using space = Kokkos::Experimental::HIPSpace; using exec_space = Kokkos::Experimental::HIP; #endif @@ -116,16 +120,19 @@ struct fun __device__ __host__ double operator()(double x) const { return 1 / (1 + exp(x)); } }; -int main(int argc, char *argv[]) +int +main(int argc, char* argv[]) { Kokkos::initialize(); auto x = unmanaged(); auto x2 = unmanaged(); unmanaged_strided(); - std::cout << "trying reduction on cpu: " << "\n"; + std::cout << "trying reduction on cpu: " + << "\n"; kokkos_reduction(); - std::cout << "trying reduction on gpu: " << "\n"; + std::cout << "trying reduction on gpu: " + << "\n"; kokkos_reduction_device(); // // test // kokkos_view_stuff(); diff --git a/test/test_mpi_wrapper.cpp b/test/test_mpi_wrapper.cpp index 9516d07..f3934ca 100644 --- a/test/test_mpi_wrapper.cpp +++ b/test/test_mpi_wrapper.cpp @@ -48,7 +48,8 @@ run_pair() comm.allgather(buffer.data(), 2); for (auto i = 0ul; i < buffer.size(); ++i) { - std::cout << "rank " << comm.rank() << ": " << buffer[i].first << " , " << buffer[i].second << "\n"; + std::cout << "rank " << comm.rank() << ": " << buffer[i].first << " , " << buffer[i].second + << "\n"; } } @@ -72,10 +73,8 @@ run_pair2() for (auto i = 0ul; i < buffer.size(); ++i) { auto p1 = buffer[i]; - std::cout << "rank " << comm.rank() << ": (" - << p1.first.first << " , " - << p1.first.second << ") , " - << p1.second << "\n"; + std::cout << "rank " << comm.rank() << ": (" << p1.first.first << " , " << p1.first.second + << ") , " << p1.second << "\n"; } } diff --git a/test/test_mvector_cuda.cpp b/test/test_mvector_cuda.cpp index f80db38..045b01c 100644 --- a/test/test_mvector_cuda.cpp +++ b/test/test_mvector_cuda.cpp @@ -1,21 +1,22 @@ #include #include #include -#include "smearing.hpp" #include "la/mvector.hpp" +#include "smearing.hpp" #ifdef __NLCGLIB__CUDA -void run() +void +run() { int n = 10; - using vector_t = Kokkos::View; + using vector_t = Kokkos::View; vector_t a_view("test", n); auto host_view = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), a_view); for (int i = 0; i < n; ++i) { - host_view(i) = i+1; + host_view(i) = i + 1; } Kokkos::deep_copy(a_view, host_view); diff --git a/test/test_smearing.cpp b/test/test_smearing.cpp index 75a3b62..8ffa5d4 100644 --- a/test/test_smearing.cpp +++ b/test/test_smearing.cpp @@ -1,4 +1,6 @@ #include + + #include "smearing.hpp" using namespace nlcglib; @@ -24,7 +26,7 @@ run(smearing_type smearing_t) int nk_loc = nk / comm.size(); - if (pid == nranks-1) { + if (pid == nranks - 1) { nk_loc = nk - (nranks - 1) * nk_loc; } @@ -38,13 +40,13 @@ run(smearing_type smearing_t) } double check = sum(wk, 
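
The pair round-trips in test_mpi_wrapper.cpp above run through the MPI_Type_create_struct datatype committed in src/mpi/mpi_type.hpp earlier in this patch. Usage, schematically (two entries per rank, values illustrative):

    #include <utility>
    #include <vector>
    #include "mpi/communicator.hpp"

    void gather_pairs(const nlcglib::Communicator& comm)
    {
      std::vector<std::pair<int, double>> buffer(2 * comm.size());
      buffer[2 * comm.rank()]     = {comm.rank(), 0.5};  // local contributions
      buffer[2 * comm.rank() + 1] = {comm.rank(), 1.5};
      comm.allgather(buffer.data(), 2);  // each rank contributes two elements
    }
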
comm); - if (std::abs(check-1) > 1e-10) { - std::cout << sum(wk,comm) << "\n"; + if (std::abs(check - 1) > 1e-10) { + std::cout << sum(wk, comm) << "\n"; throw std::runtime_error("wrong weights"); } } - if(pid == nranks-1) { + if (pid == nranks - 1) { print(wk); } @@ -63,30 +65,29 @@ run(smearing_type smearing_t) mvector ek; for (int i = 0; i < nk_loc; ++i) { - auto key = std::make_pair(pid * nk / nranks + i, 0); double lb = -10; vec_t eki("ek" + std::to_string(i), num_bands); eki(0) = lb; for (int ib = 1; ib < num_bands; ++ib) { - eki(ib) = eki(ib-1) + std::exp(-0.05*ib); + eki(ib) = eki(ib - 1) + std::exp(-0.05 * ib); } ek[key] = eki; - if (i == 0) - print(ek); + if (i == 0) print(ek); } auto mu_fn = smearing.fn(ek); double S = smearing.entropy(std::get<1>(mu_fn), ek, std::get<0>(mu_fn)); double smax = comm.allreduce(S, mpi_op::max); - if ( S != smax) { + if (S != smax) { throw std::runtime_error("entropy differs"); } std::cout << "entropy is " << S << "\n"; } -int main(int argc, char *argv[]) +int +main(int argc, char *argv[]) { MPI_Init(&argc, &argv); Kokkos::initialize(); diff --git a/test/test_solver.cpp b/test/test_solver.cpp index 2050dc4..abf44e4 100644 --- a/test/test_solver.cpp +++ b/test/test_solver.cpp @@ -21,7 +21,7 @@ std::uniform_real_distribution unif01(0, 1); std::mt19937 gen(0); -template +template void run_unmanaged() { @@ -39,8 +39,7 @@ run_unmanaged() Kokkos::deep_copy(arr, host_view); using matrix_t = KokkosDVector; - matrix_t H( - Map<>(Communicator(), SlabLayoutV({{0, 0, ncols, ncols}}))); + matrix_t H(Map<>(Communicator(), SlabLayoutV({{0, 0, ncols, ncols}}))); inner(H, X, X); auto S = H.copy(); @@ -48,22 +47,23 @@ run_unmanaged() solve_sym(H /* will be overwritten by Cholesky factorization */, S /* will be overwritten by solution */); - std::cout << "extent " << S.array().extent(0) << "\n"; - std::cout << "extent " << S.array().extent(1) << "\n"; + std::cout << "extent " << S.array().extent(0) << "\n"; + std::cout << "extent " << S.array().extent(1) << "\n"; auto S_host = Kokkos::create_mirror(S.array()); Kokkos::deep_copy(S_host, S.array()); // print result for (int i = 0; i < ncols; ++i) { for (int j = 0; j < ncols; ++j) { - std::cout << S_host(i,j) << " "; + std::cout << S_host(i, j) << " "; } std::cout << "\n"; } } -int main(int argc, char *argv[]) +int +main(int argc, char *argv[]) { Kokkos::initialize(); Communicator::init(argc, argv); @@ -76,7 +76,8 @@ int main(int argc, char *argv[]) } std::cout << "thread_count: " << threads_count; - std::cout << "run on HOST" << "\n"; + std::cout << "run on HOST" + << "\n"; run_unmanaged(); std::cout << "run on DEVICE" diff --git a/test/test_stride_dvector.cpp b/test/test_stride_dvector.cpp index d3e689c..67165e8 100644 --- a/test/test_stride_dvector.cpp +++ b/test/test_stride_dvector.cpp @@ -1,22 +1,23 @@ +#include #include #include -#include #include "la/dvector.hpp" -auto unmanaged() +auto +unmanaged() { std::cout << "\nunmanaged\n"; int n = 10; - double* A = new double[n*n]; + double* A = new double[n * n]; for (int i = 0; i < n; ++i) { for (int j = 0; j < n; ++j) { - A[n*i + j] = n * i + j; + A[n * i + j] = n * i + j; } } - Kokkos::View > a_view( + Kokkos::View> a_view( A, n, n); for (int i = 0; i < n; ++i) { @@ -30,31 +31,33 @@ auto unmanaged() } -auto unmanaged_strided() +auto +unmanaged_strided() { std::cout << "\nunmanaged_strided\n"; - int n = 10; // rows - int m = 10; // cols + int n = 10; // rows + int m = 10; // cols int lda = 12; double* A; - int res = posix_memalign(reinterpret_cast(&A), 256, 
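
The entropy check in test_smearing.cpp above encodes an invariant worth keeping in mind: the entropy is summed over all k-points, so after the k-point reduction every rank must hold the identical value. A sketch of the pattern, using the scalar allreduce overload:

    #include <stdexcept>
    #include "mpi/communicator.hpp"

    void check_entropy_consistent(const nlcglib::Communicator& comm, double S)
    {
      double smax = comm.allreduce(S, nlcglib::mpi_op::max);
      if (S != smax) throw std::runtime_error("entropy differs");
    }
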
lda*m*sizeof(double)); + int res = posix_memalign(reinterpret_cast(&A), 256, lda * m * sizeof(double)); std::cout << "A: " << A << "\n"; std::cout << "posix_memalign: " << res << "\n"; for (int i = 0; i < n; ++i) { for (int j = 0; j < m; ++j) { - A[j*lda + i] = i*n + j; + A[j * lda + i] = i * n + j; } } - nlcglib::buffer_protocol buf{std::array{1, lda}, std::array{n, m}, A, nlcglib::memory_type::host}; + nlcglib::buffer_protocol buf{ + std::array{1, lda}, std::array{n, m}, A, nlcglib::memory_type::host}; nlcglib::Map<> map(nlcglib::Communicator(), nlcglib::SlabLayoutV({{0, 0, n, m}})); nlcglib::KokkosDVector> - dvector(map, buf); + dvector(map, buf); // check data bool passed = true; @@ -64,14 +67,17 @@ auto unmanaged_strided() passed = passed && is_same; } } - if (passed) std::cout << "worked!\n"; - else std::cout << "failed!\n"; + if (passed) + std::cout << "worked!\n"; + else + std::cout << "failed!\n"; return dvector; } -int main(int argc, char *argv[]) +int +main(int argc, char* argv[]) { Kokkos::initialize(); auto x = unmanaged(); @@ -84,5 +90,4 @@ int main(int argc, char *argv[]) std::cout << x2.data() << "\n"; Kokkos::finalize(); return 0; - } diff --git a/test/test_threaded.cpp b/test/test_threaded.cpp index e9a213a..ae5789f 100644 --- a/test/test_threaded.cpp +++ b/test/test_threaded.cpp @@ -2,8 +2,8 @@ #include "la/lapack.hpp" #include -#include #include +#include using namespace nlcglib; @@ -32,7 +32,8 @@ run_stacked_vector() } -int main(int argc, char *argv[]) +int +main(int argc, char *argv[]) { Kokkos::initialize(); MPI_Init(&argc, &argv); diff --git a/test/test_traits.cpp b/test/test_traits.cpp index 67905e4..2a531f3 100644 --- a/test/test_traits.cpp +++ b/test/test_traits.cpp @@ -7,15 +7,21 @@ class Foo { public: Foo() = default; - Foo(const Foo& ) { std::cout << "copy ctr" << "\n";} + Foo(const Foo&) + { + std::cout << "copy ctr" + << "\n"; + } Foo(Foo&&) { - std::cout << "move ctr" << "\n"; + std::cout << "move ctr" + << "\n"; } }; -int main(int argc, char *argv[]) +int +main(int argc, char* argv[]) { auto x = eval(Foo()); return 0; diff --git a/test/test_utils.cpp b/test/test_utils.cpp index 3e4a593..110c1c6 100644 --- a/test/test_utils.cpp +++ b/test/test_utils.cpp @@ -3,10 +3,14 @@ using namespace nlcglib; -template -struct print_type {}; +template +struct print_type +{ +}; -void test_unzip() { +void +test_unzip() +{ mvector> Z; auto X = unzip(Z); auto x0 = std::get<0>(X); @@ -14,7 +18,8 @@ void test_unzip() { } -int main(int argc, char *argv[]) +int +main(int argc, char *argv[]) { test_unzip(); return 0; diff --git a/unit_tests/CMakeLists.txt b/unit_tests/CMakeLists.txt index 564a067..9a81468 100644 --- a/unit_tests/CMakeLists.txt +++ b/unit_tests/CMakeLists.txt @@ -7,5 +7,5 @@ endif() if(BUILD_TESTS) add_executable(gtest local/test_la_wrappers.cpp local/test_solver_wrappers.cpp) target_link_libraries(gtest PUBLIC nlcglib_core) - target_link_libraries(gtest PRIVATE GTest::GTest GTest::Main) + target_link_libraries(gtest PRIVATE GTest::gtest GTest::gtest_main) endif() diff --git a/unit_tests/local/test_la_wrappers.cpp b/unit_tests/local/test_la_wrappers.cpp index 347caaa..29a4d79 100644 --- a/unit_tests/local/test_la_wrappers.cpp +++ b/unit_tests/local/test_la_wrappers.cpp @@ -1,10 +1,10 @@ #include -#include "hip/hip_space.hpp" +#include #include +#include "hip/hip_space.hpp" #include "la/dvector.hpp" #include "la/lapack.hpp" #include "la/magma.hpp" -#include using namespace nlcglib; @@ -51,8 +51,7 @@ class CPUKokkosVectors : public 
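
Both unmanaged_strided() tests in this patch wrap a padded, column-major allocation (leading dimension lda larger than the row count) without copying. A self-contained sketch of the same construction in plain Kokkos terms:

    #include <Kokkos_Core.hpp>
    #include <cstdlib>

    void wrap_padded()
    {
      const int n = 10, m = 10, lda = 12;
      double* A = nullptr;
      posix_memalign(reinterpret_cast<void**>(&A), 256, lda * m * sizeof(double));  // POSIX
      // dimension/stride pairs: extent n with stride 1, extent m with stride lda
      Kokkos::LayoutStride layout(n, 1, m, lda);
      Kokkos::View<double**, Kokkos::LayoutStride, Kokkos::HostSpace,
                   Kokkos::MemoryTraits<Kokkos::Unmanaged>>
          v(A, layout);
      v(2, 3) = 1.0;  // touches A[3 * lda + 2]
      std::free(A);
    }
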
::testing::Test } } - typedef KokkosDVector - vector_t; + typedef KokkosDVector vector_t; vector_t a_; vector_t b_; @@ -62,8 +61,7 @@ class CPUKokkosVectors : public ::testing::Test TEST_F(CPUKokkosVectors, InnerProductCPU) { - typedef KokkosDVector - vector_t; + typedef KokkosDVector vector_t; int n = 5; vector_t c(Map<>(Communicator(), SlabLayoutV({{0, 0, n, n}}))); inner(c, a_, b_); @@ -83,8 +81,7 @@ TEST_F(CPUKokkosVectors, InnerProductCPU) TEST_F(CPUKokkosVectors, TransformCPU) { - typedef KokkosDVector - vector_t; + typedef KokkosDVector vector_t; int n = a_.map().ncols(); int m = a_.map().nrows(); vector_t c(Map<>(Communicator(), SlabLayoutV({{0, 0, m, n}}))); @@ -153,15 +150,16 @@ class GPUKokkosVectors : public ::testing::Test auto U = u_.array(); Kokkos::parallel_for( "init", mdrange_policy({0, 0}, {5, 5}), KOKKOS_LAMBDA(int i, int j) { - M(i, j) = c_arr[i * n + j]; - if (i == j) U(i, j) = 1; - else U(i, j) = 0; + M(i, j) = c_arr[i * n + j]; + if (i == j) + U(i, j) = 1; + else + U(i, j) = 0; }); } protected: - typedef KokkosDVector - vector_t; + typedef KokkosDVector vector_t; vector_t a_; vector_t b_; @@ -172,14 +170,12 @@ class GPUKokkosVectors : public ::testing::Test TEST_F(GPUKokkosVectors, InnerProductGPU) { - typedef KokkosDVector - vector_t; + typedef KokkosDVector vector_t; int n = 5; vector_t c(Map<>(Communicator(), SlabLayoutV({{0, 0, n, n}}))); inner(c, a_, b_); // host vector type - typedef KokkosDVector - h_vector_t; + typedef KokkosDVector h_vector_t; h_vector_t h_c(c.map()); h_vector_t h_cRef(c.map()); @@ -199,15 +195,13 @@ TEST_F(GPUKokkosVectors, InnerProductGPU) TEST_F(GPUKokkosVectors, TransformGPU) { - typedef KokkosDVector - vector_t; + typedef KokkosDVector vector_t; int n = a_.map().ncols(); int m = a_.map().nrows(); vector_t c(Map<>(Communicator(), SlabLayoutV({{0, 0, m, n}}))); transform(c, 0., 1., a_, u_); // host vector type - typedef KokkosDVector - h_vector_t; + typedef KokkosDVector h_vector_t; h_vector_t h_c(c.map()); h_vector_t h_cRef(c.map()); @@ -226,8 +220,7 @@ TEST(EigenValues, EigHermitian) { // Poisson matrix: n =5, ones on diagonal, -2 on first off-diagonals typedef std::complex numeric_t; - typedef KokkosDVector - vector_t; + typedef KokkosDVector vector_t; const std::vector _Varr = { 2.88675135e-01, 5.00000000e-01, 5.77350269e-01, 5.00000000e-01, 2.88675135e-01, @@ -250,7 +243,7 @@ TEST(EigenValues, EigHermitian) Kokkos::parallel_for( "init", mdrange_policy({0, 0}, {5, 5}), KOKKOS_LAMBDA(int i, int j) { A_array(i, i) = 1; - if (std::abs(i-j) == 1) A_array(i, j) = -2; + if (std::abs(i - j) == 1) A_array(i, j) = -2; }); // @@ -271,7 +264,8 @@ TEST(EigenValues, EigHermitian) auto wh = Kokkos::create_mirror(w); Kokkos::deep_copy(wh, w); - std::cout << "eigenvalues" << "\n"; + std::cout << "eigenvalues" + << "\n"; for (int i = 0; i < wh.extent(0); ++i) { // double err = eigs[i] - wh(i); EXPECT_NEAR(wh(i), eigs[i], 1e-8); @@ -289,17 +283,17 @@ main(int argc, char *argv[]) Kokkos::initialize(); - #ifdef __NLCGLIB__MAGMA +#ifdef __NLCGLIB__MAGMA nlcg_init_magma(); - #endif +#endif result = RUN_ALL_TESTS(); Kokkos::finalize(); - #ifdef __NLCGLIB__MAGMA +#ifdef __NLCGLIB__MAGMA nlcg_finalize_magma(); - #endif +#endif Communicator::finalize(); diff --git a/unit_tests/local/test_solver_wrappers.cpp b/unit_tests/local/test_solver_wrappers.cpp index 5c1c14d..5f7cf59 100644 --- a/unit_tests/local/test_solver_wrappers.cpp +++ b/unit_tests/local/test_solver_wrappers.cpp @@ -1,11 +1,11 @@ #include -#include "hip/hip_space.hpp" #include #include +#include 
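
The reference data in the EigHermitian test above follows from the closed form for a symmetric tridiagonal Toeplitz matrix (diagonal a = 1, off-diagonals b = -2, n = 5):

\[
\lambda_k = 1 - 4\cos\frac{k\pi}{6}, \qquad
v_k(j) = \frac{1}{\sqrt{3}} \sin\frac{j k \pi}{6}, \qquad k, j = 1, \dots, 5,
\]

so e.g. \(\lambda_2 = -1\) and \(v_1(1) = \sin(\pi/6)/\sqrt{3} \approx 2.88675 \times 10^{-1}\), the first entry of _Varr.
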
#include "gtest/gtest.h" +#include "hip/hip_space.hpp" #include "la/dvector.hpp" #include "la/lapack.hpp" -#include using namespace nlcglib; @@ -15,7 +15,7 @@ using complex_double = Kokkos::complex; int nrows = 200; int ncols = 20; -template +template class TestSymSolve : public ::testing::Test { public: @@ -23,7 +23,9 @@ class TestSymSolve : public ::testing::Test TestSymSolve() : X(Map<>(Communicator(), SlabLayoutV({{0, 0, nrows, ncols}}))) , H(Map<>(Communicator(), SlabLayoutV({{0, 0, ncols, ncols}}))) - {} + { + } + protected: using vector_t = KokkosDVector; // @@ -33,7 +35,8 @@ class TestSymSolve : public ::testing::Test template -void TestSymSolve::SetUp() +void +TestSymSolve::SetUp() { std::uniform_real_distribution unif01(0, 1); std::mt19937 gen(0); @@ -77,11 +80,10 @@ TYPED_TEST(TestSymSolve, PotrfPotrs) // check result for (int i = 0; i < ncols; ++i) { for (int j = 0; j < ncols; ++j) { - if ( i == j ) { + if (i == j) { EXPECT_NEAR(S_host(i, j).real(), 1, 1e-8); EXPECT_NEAR(S_host(i, j).imag(), 0, 1e-8); - } - else { + } else { EXPECT_NEAR(S_host(i, j).imag(), 0, 1e-8); EXPECT_NEAR(S_host(i, j).real(), 0, 1e-8); }