diff --git a/.github/workflows/bdw.yml b/.github/workflows/bdw.yml index f60008ab72..450a0975ab 100644 --- a/.github/workflows/bdw.yml +++ b/.github/workflows/bdw.yml @@ -188,12 +188,12 @@ jobs: steps: - name: checkout_kokkos_kernels - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 + uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 with: path: kokkos-kernels - name: checkout_kokkos - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 + uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 with: repository: kokkos/kokkos ref: ${{ github.base_ref }} diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 06328c83c1..e1f8aa51f8 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -38,13 +38,13 @@ jobs: egress-policy: audit - name: checkout_kokkos_kernels - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 + uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 with: path: kokkos-kernels # Initializes the CodeQL tools for scanning. - name: Initialize CodeQL - uses: github/codeql-action/init@2e230e8fe0ad3a14a340ad0815ddb96d599d2aff # v3.25.8 + uses: github/codeql-action/init@b611370bb5703a7efb587f9d136a52ea24c5c38c # v3.25.11 with: languages: c-cpp # If you wish to specify custom queries, you can do so here or in a config file. @@ -52,7 +52,7 @@ jobs: # Prefix the list here with "+" to use these queries and those in the config file. - name: checkout_kokkos - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 + uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 with: repository: 'kokkos/kokkos' path: 'kokkos' @@ -100,6 +100,6 @@ jobs: run: make -j2 - name: Perform CodeQL Analysis - uses: github/codeql-action/analyze@2e230e8fe0ad3a14a340ad0815ddb96d599d2aff # v3.25.8 + uses: github/codeql-action/analyze@b611370bb5703a7efb587f9d136a52ea24c5c38c # v3.25.11 with: category: "/language:c-cpp" diff --git a/.github/workflows/dependency-review.yml b/.github/workflows/dependency-review.yml index b911317970..1792f0181c 100644 --- a/.github/workflows/dependency-review.yml +++ b/.github/workflows/dependency-review.yml @@ -22,6 +22,6 @@ jobs: egress-policy: audit - name: 'Checkout Repository' - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 + uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 - name: 'Dependency Review' uses: actions/dependency-review-action@72eb03d02c7872a771aacd928f3123ac62ad6d3a # v4.3.3 diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 901e218fdc..9690446a4f 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -25,12 +25,12 @@ jobs: doxygen --version - name: checkout_kokkos_kernels - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 + uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 with: path: kokkos-kernels - name: checkout_kokkos - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 + uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 with: repository: kokkos/kokkos ref: 4.3.00 diff --git a/.github/workflows/format.yml b/.github/workflows/format.yml index 5517b68dbb..08b541587f 100644 --- a/.github/workflows/format.yml +++ b/.github/workflows/format.yml @@ -13,7 +13,7 @@ jobs: clang-format-check: runs-on: ubuntu-20.04 steps: - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 + - uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 - name: Install Dependencies run: sudo apt install clang-format-8 diff --git a/.github/workflows/h100.yml b/.github/workflows/h100.yml index 5fd01d972b..0d20177b96 100644 --- a/.github/workflows/h100.yml +++ b/.github/workflows/h100.yml @@ -26,12 +26,12 @@ jobs: steps: - name: checkout_kokkos_kernels - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 + uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 with: path: kokkos-kernels - name: checkout_kokkos - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 + uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 with: repository: kokkos/kokkos ref: ${{ github.base_ref }} diff --git a/.github/workflows/mi210.yml b/.github/workflows/mi210.yml index 7b55f065bf..9735e405f1 100644 --- a/.github/workflows/mi210.yml +++ b/.github/workflows/mi210.yml @@ -107,12 +107,12 @@ jobs: steps: - name: checkout_kokkos_kernels - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 + uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 with: path: kokkos-kernels - name: checkout_kokkos - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 + uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 with: repository: kokkos/kokkos ref: ${{ github.base_ref }} diff --git a/.github/workflows/osx.yml b/.github/workflows/osx.yml index 082467d614..fa23b5dd72 100644 --- a/.github/workflows/osx.yml +++ b/.github/workflows/osx.yml @@ -50,12 +50,12 @@ jobs: steps: - name: checkout_kokkos_kernels - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 + uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 with: path: kokkos-kernels - name: checkout_kokkos - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 + uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 with: repository: kokkos/kokkos ref: 4.3.00 diff --git a/.github/workflows/scorecards.yml b/.github/workflows/scorecards.yml index 2885ca7fae..bf06213b40 100644 --- a/.github/workflows/scorecards.yml +++ b/.github/workflows/scorecards.yml @@ -38,7 +38,7 @@ jobs: egress-policy: audit - name: "Checkout code" - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 + uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 with: persist-credentials: false @@ -65,7 +65,7 @@ jobs: # Upload the results as artifacts (optional). Commenting out will disable uploads of run results in SARIF # format to the repository Actions tab. - name: "Upload artifact" - uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808 # v4.3.3 + uses: actions/upload-artifact@0b2256b8c012f0828dc542b3febcab082c67f72b # v4.3.4 with: name: SARIF file path: results.sarif @@ -73,6 +73,6 @@ jobs: # Upload the results to GitHub's code scanning dashboard. - name: "Upload to code-scanning" - uses: github/codeql-action/upload-sarif@2e230e8fe0ad3a14a340ad0815ddb96d599d2aff # v3.25.8 + uses: github/codeql-action/upload-sarif@b611370bb5703a7efb587f9d136a52ea24c5c38c # v3.25.11 with: sarif_file: results.sarif diff --git a/.github/workflows/spr.yml b/.github/workflows/spr.yml index 8fe8053f5b..c38d04ac8d 100644 --- a/.github/workflows/spr.yml +++ b/.github/workflows/spr.yml @@ -26,12 +26,12 @@ jobs: steps: - name: checkout_kokkos_kernels - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 + uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 with: path: kokkos-kernels - name: checkout_kokkos - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 + uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 with: repository: kokkos/kokkos ref: ${{ github.base_ref }} diff --git a/batched/dense/impl/KokkosBatched_Pttrf_Serial_Impl.hpp b/batched/dense/impl/KokkosBatched_Pttrf_Serial_Impl.hpp new file mode 100644 index 0000000000..b0ea39fa3f --- /dev/null +++ b/batched/dense/impl/KokkosBatched_Pttrf_Serial_Impl.hpp @@ -0,0 +1,73 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#ifndef KOKKOSBATCHED_PTTRF_SERIAL_IMPL_HPP_ +#define KOKKOSBATCHED_PTTRF_SERIAL_IMPL_HPP_ + +#include +#include "KokkosBatched_Pttrf_Serial_Internal.hpp" + +/// \author Yuuichi Asahi (yuuichi.asahi@cea.fr) + +namespace KokkosBatched { + +template +KOKKOS_INLINE_FUNCTION static int checkPttrfInput( + [[maybe_unused]] const DViewType &d, [[maybe_unused]] const EViewType &e) { + static_assert(Kokkos::is_view::value, + "KokkosBatched::pttrf: DViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, + "KokkosBatched::pttrf: EViewType is not a Kokkos::View."); + + static_assert(DViewType::rank == 1, + "KokkosBatched::pttrf: DViewType must have rank 1."); + static_assert(EViewType::rank == 1, + "KokkosBatched::pttrf: EViewType must have rank 1."); + +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) + const int nd = d.extent(0); + const int ne = e.extent(0); + + if (ne + 1 != nd) { + Kokkos::printf( + "KokkosBatched::pttrf: Dimensions of d and e do not match: d: %d, e: " + "%d \n" + "e.extent(0) must be equal to d.extent(0) - 1\n", + nd, ne); + return 1; + } +#endif + return 0; +} + +template <> +struct SerialPttrf { + template + KOKKOS_INLINE_FUNCTION static int invoke(const DViewType &d, + const EViewType &e) { + // Quick return if possible + if (d.extent(0) == 0) return 0; + if (d.extent(0) == 1) return (d(0) < 0 ? 1 : 0); + + auto info = checkPttrfInput(d, e); + if (info) return info; + + return SerialPttrfInternal::invoke( + d.extent(0), d.data(), d.stride(0), e.data(), e.stride(0)); + } +}; +} // namespace KokkosBatched + +#endif // KOKKOSBATCHED_PTTRF_SERIAL_IMPL_HPP_ diff --git a/batched/dense/impl/KokkosBatched_Pttrf_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_Pttrf_Serial_Internal.hpp new file mode 100644 index 0000000000..5b4d3fb182 --- /dev/null +++ b/batched/dense/impl/KokkosBatched_Pttrf_Serial_Internal.hpp @@ -0,0 +1,211 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#ifndef KOKKOSBATCHED_PTTRF_SERIAL_INTERNAL_HPP_ +#define KOKKOSBATCHED_PTTRF_SERIAL_INTERNAL_HPP_ + +#include + +/// \author Yuuichi Asahi (yuuichi.asahi@cea.fr) + +namespace KokkosBatched { + +template +struct SerialPttrfInternal { + template + KOKKOS_INLINE_FUNCTION static int invoke(const int n, + ValueType *KOKKOS_RESTRICT d, + const int ds0, + ValueType *KOKKOS_RESTRICT e, + const int es0); + + template + KOKKOS_INLINE_FUNCTION static int invoke( + const int n, ValueType *KOKKOS_RESTRICT d, const int ds0, + Kokkos::complex *KOKKOS_RESTRICT e, const int es0); +}; + +/// +/// Real matrix +/// + +template <> +template +KOKKOS_INLINE_FUNCTION int SerialPttrfInternal::invoke( + const int n, ValueType *KOKKOS_RESTRICT d, const int ds0, + ValueType *KOKKOS_RESTRICT e, const int es0) { + int info = 0; + + auto update = [&](const int i) { + auto ei_tmp = e[i * es0]; + e[i * es0] = ei_tmp / d[i * ds0]; + d[(i + 1) * ds0] -= e[i * es0] * ei_tmp; + }; + + auto check_positive_definitiveness = [&](const int i) { + return (d[i] <= 0.0) ? (i + 1) : 0; + }; + + // Compute the L*D*L' (or U'*D*U) factorization of A. + const int i4 = (n - 1) % 4; + for (int i = 0; i < i4; i++) { +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) + info = check_positive_definitiveness(i); + if (info) { + return info; + } +#endif + + update(i); + } // for (int i = 0; i < i4; i++) + + for (int i = i4; i < n - 4; i += 4) { +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) + info = check_positive_definitiveness(i); + if (info) { + return info; + } +#endif + + update(i); + +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) + info = check_positive_definitiveness(i + 1); + if (info) { + return info; + } +#endif + + update(i + 1); + +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) + info = check_positive_definitiveness(i + 2); + if (info) { + return info; + } +#endif + + update(i + 2); + +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) + info = check_positive_definitiveness(i + 3); + if (info) { + return info; + } +#endif + + update(i + 3); + + } // for (int i = i4; i < n-4; 4) + +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) + info = check_positive_definitiveness(n - 1); + if (info) { + return info; + } +#endif + + return 0; +} + +/// +/// Complex matrix +/// + +template <> +template +KOKKOS_INLINE_FUNCTION int SerialPttrfInternal::invoke( + const int n, ValueType *KOKKOS_RESTRICT d, const int ds0, + Kokkos::complex *KOKKOS_RESTRICT e, const int es0) { + int info = 0; + + auto update = [&](const int i) { + auto eir_tmp = e[i * es0].real(); + auto eii_tmp = e[i * es0].imag(); + auto f_tmp = eir_tmp / d[i * ds0]; + auto g_tmp = eii_tmp / d[i * ds0]; + e[i * es0] = Kokkos::complex(f_tmp, g_tmp); + d[(i + 1) * ds0] = d[(i + 1) * ds0] - f_tmp * eir_tmp - g_tmp * eii_tmp; + }; + + auto check_positive_definitiveness = [&](const int i) { + return (d[i] <= 0.0) ? (i + 1) : 0; + }; + + // Compute the L*D*L' (or U'*D*U) factorization of A. + const int i4 = (n - 1) % 4; + for (int i = 0; i < i4; i++) { +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) + info = check_positive_definitiveness(i); + if (info) { + return info; + } +#endif + + update(i); + } // for (int i = 0; i < i4; i++) + + for (int i = i4; i < n - 4; i += 4) { +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) + info = check_positive_definitiveness(i); + if (info) { + return info; + } +#endif + + update(i); + +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) + info = check_positive_definitiveness(i + 1); + if (info) { + return info; + } +#endif + + update(i + 1); + +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) + info = check_positive_definitiveness(i + 2); + if (info) { + return info; + } +#endif + + update(i + 2); + +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) + info = check_positive_definitiveness(i + 3); + if (info) { + return info; + } +#endif + + update(i + 3); + + } // for (int i = i4; i < n-4; 4) + +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) + info = check_positive_definitiveness(n - 1); + if (info) { + return info; + } +#endif + + return 0; +} + +} // namespace KokkosBatched + +#endif // KOKKOSBATCHED_PTTRF_SERIAL_INTERNAL_HPP_ diff --git a/batched/dense/impl/KokkosBatched_Tbsv_Serial_Impl.hpp b/batched/dense/impl/KokkosBatched_Tbsv_Serial_Impl.hpp new file mode 100644 index 0000000000..675e73f744 --- /dev/null +++ b/batched/dense/impl/KokkosBatched_Tbsv_Serial_Impl.hpp @@ -0,0 +1,169 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBATCHED_TBSV_SERIAL_IMPL_HPP_ +#define KOKKOSBATCHED_TBSV_SERIAL_IMPL_HPP_ + +/// \author Yuuichi Asahi (yuuichi.asahi@cea.fr) + +#include "KokkosBatched_Util.hpp" +#include "KokkosBatched_Tbsv_Serial_Internal.hpp" + +namespace KokkosBatched { + +template +KOKKOS_INLINE_FUNCTION static int checkTbsvInput( + [[maybe_unused]] const AViewType &A, [[maybe_unused]] const XViewType &x, + [[maybe_unused]] const int k) { + static_assert(Kokkos::is_view::value, + "KokkosBatched::tbsv: AViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, + "KokkosBatched::tbsv: XViewType is not a Kokkos::View."); + static_assert(AViewType::rank == 2, + "KokkosBatched::tbsv: AViewType must have rank 2."); + static_assert(XViewType::rank == 1, + "KokkosBatched::tbsv: XViewType must have rank 1."); + +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) + if (k < 0) { + Kokkos::printf( + "KokkosBatched::tbsv: input parameter k must not be less than 0: k = " + "%d\n", + k); + return 1; + } + + const int lda = A.extent(0), n = A.extent(1); + if (lda < (k + 1)) { + Kokkos::printf( + "KokkosBatched::tbsv: leading dimension of A must be smaller than k+1: " + "lda = %d, k = %d\n", + lda, k); + return 1; + } + + const int nx = x.extent(0); + if (nx != n) { + Kokkos::printf( + "KokkosBatched::tbsv: Dimensions of x and A do not match: X: %d, A: %d " + "x %d\n" + "x.extent(0) must be equal to A.extent(1)\n", + nx, lda, n); + return 1; + } +#endif + return 0; +} + +//// Lower non-transpose //// +template +struct SerialTbsv { + template + KOKKOS_INLINE_FUNCTION static int invoke(const AViewType &A, + const XViewType &x, const int k) { + auto info = checkTbsvInput(A, x, k); + if (info) return info; + + return SerialTbsvInternalLower::invoke( + ArgDiag::use_unit_diag, A.extent(1), A.data(), A.stride_0(), + A.stride_1(), x.data(), x.stride_0(), k); + } +}; + +//// Lower transpose //// +template +struct SerialTbsv { + template + KOKKOS_INLINE_FUNCTION static int invoke(const AViewType &A, + const XViewType &x, const int k) { + auto info = checkTbsvInput(A, x, k); + if (info) return info; + + return SerialTbsvInternalLowerTranspose::invoke( + ArgDiag::use_unit_diag, false, A.extent(1), A.data(), A.stride_0(), + A.stride_1(), x.data(), x.stride_0(), k); + } +}; + +//// Lower conjugate-transpose //// +template +struct SerialTbsv { + template + KOKKOS_INLINE_FUNCTION static int invoke(const AViewType &A, + const XViewType &x, const int k) { + auto info = checkTbsvInput(A, x, k); + if (info) return info; + + return SerialTbsvInternalLowerTranspose::invoke( + ArgDiag::use_unit_diag, true, A.extent(1), A.data(), A.stride_0(), + A.stride_1(), x.data(), x.stride_0(), k); + } +}; + +//// Upper non-transpose //// +template +struct SerialTbsv { + template + KOKKOS_INLINE_FUNCTION static int invoke(const AViewType &A, + const XViewType &x, const int k) { + auto info = checkTbsvInput(A, x, k); + if (info) return info; + + return SerialTbsvInternalUpper::invoke( + ArgDiag::use_unit_diag, A.extent(1), A.data(), A.stride_0(), + A.stride_1(), x.data(), x.stride_0(), k); + } +}; + +//// Upper transpose //// +template +struct SerialTbsv { + template + KOKKOS_INLINE_FUNCTION static int invoke(const AViewType &A, + const XViewType &x, const int k) { + auto info = checkTbsvInput(A, x, k); + if (info) return info; + + return SerialTbsvInternalUpperTranspose::invoke( + ArgDiag::use_unit_diag, false, A.extent(1), A.data(), A.stride_0(), + A.stride_1(), x.data(), x.stride_0(), k); + } +}; + +//// Upper conjugate-transpose //// +template +struct SerialTbsv { + template + KOKKOS_INLINE_FUNCTION static int invoke(const AViewType &A, + const XViewType &x, const int k) { + auto info = checkTbsvInput(A, x, k); + if (info) return info; + + return SerialTbsvInternalUpperTranspose::invoke( + ArgDiag::use_unit_diag, true, A.extent(1), A.data(), A.stride_0(), + A.stride_1(), x.data(), x.stride_0(), k); + } +}; + +} // namespace KokkosBatched + +#endif // KOKKOSBATCHED_TBSV_SERIAL_IMPL_HPP_ diff --git a/batched/dense/impl/KokkosBatched_Tbsv_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_Tbsv_Serial_Internal.hpp new file mode 100644 index 0000000000..d2f5df4649 --- /dev/null +++ b/batched/dense/impl/KokkosBatched_Tbsv_Serial_Internal.hpp @@ -0,0 +1,224 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBATCHED_TBSV_SERIAL_INTERNAL_HPP_ +#define KOKKOSBATCHED_TBSV_SERIAL_INTERNAL_HPP_ + +/// \author Yuuichi Asahi (yuuichi.asahi@cea.fr) + +#include "KokkosBatched_Util.hpp" + +namespace KokkosBatched { + +/// +/// Serial Internal Impl +/// ==================== + +/// +/// Lower, Non-Transpose +/// + +template +struct SerialTbsvInternalLower { + template + KOKKOS_INLINE_FUNCTION static int invoke(const bool use_unit_diag, + const int an, + const ValueType *KOKKOS_RESTRICT A, + const int as0, const int as1, + /**/ ValueType *KOKKOS_RESTRICT x, + const int xs0, const int k); +}; + +template <> +template +KOKKOS_INLINE_FUNCTION int +SerialTbsvInternalLower::invoke( + const bool use_unit_diag, const int an, const ValueType *KOKKOS_RESTRICT A, + const int as0, const int as1, + /**/ ValueType *KOKKOS_RESTRICT x, const int xs0, const int k) { +#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) +#pragma unroll +#endif + for (int j = 0; j < an; ++j) { + if (x[j * xs0] != static_cast(0)) { + if (!use_unit_diag) x[j * xs0] = x[j * xs0] / A[0 + j * as1]; + + auto temp = x[j * xs0]; +#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) +#pragma unroll +#endif + for (int i = j + 1; i < Kokkos::min(an, j + k + 1); ++i) { + x[i * xs0] = x[i * xs0] - temp * A[(i - j) * as0 + j * as1]; + } + } + } + + return 0; +} + +/// +/// Lower, Transpose +/// + +template +struct SerialTbsvInternalLowerTranspose { + template + KOKKOS_INLINE_FUNCTION static int invoke(const bool use_unit_diag, + const bool do_conj, const int an, + const ValueType *KOKKOS_RESTRICT A, + const int as0, const int as1, + /**/ ValueType *KOKKOS_RESTRICT x, + const int xs0, const int k); +}; + +template <> +template +KOKKOS_INLINE_FUNCTION int +SerialTbsvInternalLowerTranspose::invoke( + const bool use_unit_diag, const bool do_conj, const int an, + const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, + /**/ ValueType *KOKKOS_RESTRICT x, const int xs0, const int k) { +#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) +#pragma unroll +#endif + for (int j = an - 1; j >= 0; --j) { + auto temp = x[j * xs0]; + + if (do_conj) { +#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) +#pragma unroll +#endif + for (int i = Kokkos::min(an - 1, j + k); i > j; --i) { + temp -= + Kokkos::ArithTraits::conj(A[(i - j) * as0 + j * as1]) * + x[i * xs0]; + } + if (!use_unit_diag) + temp = temp / Kokkos::ArithTraits::conj(A[0 + j * as1]); + } else { +#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) +#pragma unroll +#endif + for (int i = Kokkos::min(an - 1, j + k); i > j; --i) { + temp -= A[(i - j) * as0 + j * as1] * x[i * xs0]; + } + if (!use_unit_diag) temp = temp / A[0 + j * as1]; + } + x[j * xs0] = temp; + } + + return 0; +} + +/// +/// Upper, Non-Transpose +/// + +template +struct SerialTbsvInternalUpper { + template + KOKKOS_INLINE_FUNCTION static int invoke(const bool use_unit_diag, + const int an, + const ValueType *KOKKOS_RESTRICT A, + const int as0, const int as1, + /**/ ValueType *KOKKOS_RESTRICT x, + const int xs0, const int k); +}; + +template <> +template +KOKKOS_INLINE_FUNCTION int +SerialTbsvInternalUpper::invoke( + const bool use_unit_diag, const int an, const ValueType *KOKKOS_RESTRICT A, + const int as0, const int as1, + /**/ ValueType *KOKKOS_RESTRICT x, const int xs0, const int k) { +#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) +#pragma unroll +#endif + for (int j = an - 1; j >= 0; --j) { + if (x[j * xs0] != 0) { + if (!use_unit_diag) x[j * xs0] = x[j * xs0] / A[k * as0 + j * as1]; + + auto temp = x[j * xs0]; +#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) +#pragma unroll +#endif + for (int i = j - 1; i >= Kokkos::max(0, j - k); --i) { + x[i * xs0] = x[i * xs0] - temp * A[(k - j + i) * as0 + j * as1]; + } + } + } + + return 0; +} + +/// +/// Upper, Transpose +/// + +template +struct SerialTbsvInternalUpperTranspose { + template + KOKKOS_INLINE_FUNCTION static int invoke(const bool use_unit_diag, + const bool do_conj, const int an, + const ValueType *KOKKOS_RESTRICT A, + const int as0, const int as1, + /**/ ValueType *KOKKOS_RESTRICT x, + const int xs0, const int k); +}; + +template <> +template +KOKKOS_INLINE_FUNCTION int +SerialTbsvInternalUpperTranspose::invoke( + const bool use_unit_diag, const bool do_conj, const int an, + const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, + /**/ ValueType *KOKKOS_RESTRICT x, const int xs0, const int k) { +#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) +#pragma unroll +#endif + for (int j = 0; j < an; j++) { + auto temp = x[j * xs0]; + if (do_conj) { +#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) +#pragma unroll +#endif + for (int i = Kokkos::max(0, j - k); i < j; ++i) { + temp -= Kokkos::ArithTraits::conj( + A[(i + k - j) * as0 + j * as1]) * + x[i * xs0]; + } + if (!use_unit_diag) + temp = + temp / Kokkos::ArithTraits::conj(A[k * as0 + j * as1]); + } else { +#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) +#pragma unroll +#endif + for (int i = Kokkos::max(0, j - k); i < j; ++i) { + temp -= A[(i + k - j) * as0 + j * as1] * x[i * xs0]; + } + if (!use_unit_diag) temp = temp / A[k * as0 + j * as1]; + } + x[j * xs0] = temp; + } + + return 0; +} + +} // namespace KokkosBatched + +#endif // KOKKOSBATCHED_TBSV_SERIAL_INTERNAL_HPP_ diff --git a/batched/dense/src/KokkosBatched_Pttrf.hpp b/batched/dense/src/KokkosBatched_Pttrf.hpp new file mode 100644 index 0000000000..4fcc944dc8 --- /dev/null +++ b/batched/dense/src/KokkosBatched_Pttrf.hpp @@ -0,0 +1,52 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#ifndef KOKKOSBATCHED_PTTRF_HPP_ +#define KOKKOSBATCHED_PTTRF_HPP_ + +#include + +/// \author Yuuichi Asahi (yuuichi.asahi@cea.fr) + +namespace KokkosBatched { + +/// \brief Serial Batched Pttrf: +/// Compute the Cholesky factorization L*D*L**T (or L*D*L**H) of a real +/// symmetric (or complex Hermitian) positive definite tridiagonal matrix A_l +/// for all l = 0, ..., N +/// +/// \tparam DViewType: Input type for the a diagonal matrix, needs to be a 1D +/// view +/// \tparam EViewType: Input type for the a upper/lower diagonal matrix, +/// needs to be a 1D view +/// +/// \param d [inout]: n diagonal elements of the diagonal matrix D +/// \param e [inout]: n-1 upper/lower diagonal elements of the diagonal matrix E +/// +/// No nested parallel_for is used inside of the function. +/// + +template +struct SerialPttrf { + template + KOKKOS_INLINE_FUNCTION static int invoke(const DViewType &d, + const EViewType &e); +}; + +} // namespace KokkosBatched + +#include "KokkosBatched_Pttrf_Serial_Impl.hpp" + +#endif // KOKKOSBATCHED_PTTRF_HPP_ diff --git a/batched/dense/src/KokkosBatched_Tbsv.hpp b/batched/dense/src/KokkosBatched_Tbsv.hpp new file mode 100644 index 0000000000..7510c07969 --- /dev/null +++ b/batched/dense/src/KokkosBatched_Tbsv.hpp @@ -0,0 +1,56 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#ifndef KOKKOSBATCHED_TBSV_HPP_ +#define KOKKOSBATCHED_TBSV_HPP_ + +#include + +/// \author Yuuichi Asahi (yuuichi.asahi@cea.fr) + +namespace KokkosBatched { + +/// \brief Serial Batched Tbsv: +/// +/// Solve Ab_l x_l = b_l for all l = 0, ..., N +/// using the triangular solve algorithm Tbsv. Ab is an n by n unit, or +/// non-unit, upper or lower triangular band matrix, with ( k + 1 ) +/// diagonals. +/// +/// \tparam AViewType: Input type for the matrix, needs to be a 2D view +/// \tparam XViewType: Input type for the right-hand side and the solution, +/// needs to be a 1D view +/// +/// \param A [in]: A is a lda by n banded matrix, with ( k + 1 ) diagonals +/// \param X [inout]: right-hand side and the solution, a rank 1 view +/// \param k [in]: k specifies the number of superdiagonals or subdiagonals of +/// matrix A. k >= 0 +/// +/// No nested parallel_for is used inside of the function. +/// + +template +struct SerialTbsv { + template + KOKKOS_INLINE_FUNCTION static int invoke(const AViewType &A, + const XViewType &X, const int k); +}; + +} // namespace KokkosBatched + +#include "KokkosBatched_Tbsv_Serial_Impl.hpp" + +#endif // KOKKOSBATCHED_TBSV_HPP_ diff --git a/batched/dense/unit_test/Test_Batched_Dense.hpp b/batched/dense/unit_test/Test_Batched_Dense.hpp index cf9b3c23f4..76215b58f8 100644 --- a/batched/dense/unit_test/Test_Batched_Dense.hpp +++ b/batched/dense/unit_test/Test_Batched_Dense.hpp @@ -42,10 +42,16 @@ #include "Test_Batched_SerialTrsv.hpp" #include "Test_Batched_SerialTrsv_Real.hpp" #include "Test_Batched_SerialTrsv_Complex.hpp" +#include "Test_Batched_SerialTbsv.hpp" +#include "Test_Batched_SerialTbsv_Real.hpp" +#include "Test_Batched_SerialTbsv_Complex.hpp" #include "Test_Batched_SerialTrtri.hpp" #include "Test_Batched_SerialTrtri_Real.hpp" #include "Test_Batched_SerialTrtri_Complex.hpp" #include "Test_Batched_SerialSVD.hpp" +#include "Test_Batched_SerialPttrf.hpp" +#include "Test_Batched_SerialPttrf_Real.hpp" +#include "Test_Batched_SerialPttrf_Complex.hpp" // Team Kernels #include "Test_Batched_TeamAxpy.hpp" diff --git a/batched/dense/unit_test/Test_Batched_DenseUtils.hpp b/batched/dense/unit_test/Test_Batched_DenseUtils.hpp index 6a96bd193a..c1328291fb 100644 --- a/batched/dense/unit_test/Test_Batched_DenseUtils.hpp +++ b/batched/dense/unit_test/Test_Batched_DenseUtils.hpp @@ -16,10 +16,12 @@ #ifndef TEST_BATCHED_DENSE_HELPER_HPP #define TEST_BATCHED_DENSE_HELPER_HPP +#include "KokkosBatched_Util.hpp" + namespace KokkosBatched { template -void create_tridiagonal_batched_matrices(const MatrixViewType &A, - const VectorViewType &B) { +void create_tridiagonal_batched_matrices(const MatrixViewType& A, + const VectorViewType& B) { Kokkos::Random_XorShift64_Pool< typename VectorViewType::device_type::execution_space> random(13718); @@ -54,6 +56,101 @@ void create_tridiagonal_batched_matrices(const MatrixViewType &A, Kokkos::fence(); } + +template +void create_banded_triangular_matrix(InViewType& in, OutViewType& out, + int k = 1, bool band_storage = true) { + auto h_in = Kokkos::create_mirror_view(in); + auto h_out = Kokkos::create_mirror_view(out); + const int N = in.extent(0), BlkSize = in.extent(1); + + Kokkos::deep_copy(h_in, in); + if (band_storage) { + assert(out.extent(0) == in.extent(0)); + assert(out.extent(1) == static_cast(k + 1)); + assert(out.extent(2) == in.extent(2)); + if constexpr (std::is_same_v) { + for (int i0 = 0; i0 < N; i0++) { + for (int i1 = 0; i1 < k + 1; i1++) { + for (int i2 = i1; i2 < BlkSize; i2++) { + h_out(i0, k - i1, i2) = h_in(i0, i2 - i1, i2); + } + } + } + } else { + for (int i0 = 0; i0 < N; i0++) { + for (int i1 = 0; i1 < k + 1; i1++) { + for (int i2 = 0; i2 < BlkSize - i1; i2++) { + h_out(i0, i1, i2) = h_in(i0, i2 + i1, i2); + } + } + } + } + } else { + for (std::size_t i = 0; i < InViewType::rank(); i++) { + assert(out.extent(i) == in.extent(i)); + } + + if constexpr (std::is_same_v) { + for (int i0 = 0; i0 < N; i0++) { + for (int i1 = 0; i1 < BlkSize; i1++) { + for (int i2 = i1; i2 < Kokkos::min(i1 + k + 1, BlkSize); i2++) { + h_out(i0, i1, i2) = h_in(i0, i1, i2); + } + } + } + } else { + for (int i0 = 0; i0 < N; i0++) { + for (int i1 = 0; i1 < BlkSize; i1++) { + for (int i2 = Kokkos::max(0, i1 - k); i2 <= i1; i2++) { + h_out(i0, i1, i2) = h_in(i0, i1, i2); + } + } + } + } + } + Kokkos::deep_copy(out, h_out); +} + +/// \brief Create a diagonal matrix from an input vector: +/// Copies the input vector into the diagonal of the output matrix specified +/// by the parameter k. k > 0 means that the matrix is upper-diagonal and +/// k < 0 means the lower-diagonal. k = 0 means the diagonal. +/// +/// \tparam InViewType: Input type for the vector, needs to be a 2D view +/// \tparam OutViewType: Output type for the matrix, needs to be a 3D view +/// +/// \param in [in]: Input batched vector, a rank 2 view +/// \param out [out]: Output batched matrix, where the diagonal compnent +/// specified by k is filled with the input vector, a rank 3 view +/// \param k [in]: The diagonal offset to be filled (default is 0). +/// +template +void create_diagonal_matrix(InViewType& in, OutViewType& out, int k = 0) { + auto h_in = Kokkos::create_mirror_view(in); + auto h_out = Kokkos::create_mirror_view(out); + const int N = in.extent(0), BlkSize = in.extent(1); + + assert(out.extent(0) == in.extent(0)); + assert(out.extent(1) == in.extent(1) + abs(k)); + + int i1_start = k >= 0 ? 0 : -k; + int i2_start = k >= 0 ? k : 0; + + // Zero clear the output matrix + using ScalarType = typename OutViewType::non_const_value_type; + Kokkos::deep_copy(h_out, ScalarType(0.0)); + + Kokkos::deep_copy(h_in, in); + for (int i0 = 0; i0 < N; i0++) { + for (int i1 = 0; i1 < BlkSize; i1++) { + h_out(i0, i1 + i1_start, i1 + i2_start) = h_in(i0, i1); + } + } + + Kokkos::deep_copy(out, h_out); +} + } // namespace KokkosBatched #endif // TEST_BATCHED_DENSE_HELPER_HPP diff --git a/batched/dense/unit_test/Test_Batched_SerialPttrf.hpp b/batched/dense/unit_test/Test_Batched_SerialPttrf.hpp new file mode 100644 index 0000000000..6ee7818ddc --- /dev/null +++ b/batched/dense/unit_test/Test_Batched_SerialPttrf.hpp @@ -0,0 +1,467 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +/// \author Yuuichi Asahi (yuuichi.asahi@cea.fr) +#include +#include +#include + +#include "KokkosBatched_Util.hpp" +#include "KokkosBatched_Pttrf.hpp" +#include "Test_Batched_DenseUtils.hpp" + +using namespace KokkosBatched; + +namespace Test { +namespace Pttrf { + +template +struct Functor_BatchedSerialPttrf { + using execution_space = typename DeviceType::execution_space; + DViewType _d; + EViewType _e; + + KOKKOS_INLINE_FUNCTION + Functor_BatchedSerialPttrf(const DViewType &d, const EViewType &e) + : _d(d), _e(e) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const int k, int &info) const { + auto dd = Kokkos::subview(_d, k, Kokkos::ALL()); + auto ee = Kokkos::subview(_e, k, Kokkos::ALL()); + + info += KokkosBatched::SerialPttrf::invoke(dd, ee); + } + + inline int run() { + using value_type = typename DViewType::non_const_value_type; + std::string name_region("KokkosBatched::Test::SerialPttrf"); + const std::string name_value_type = Test::value_type_name(); + std::string name = name_region + name_value_type; + int info_sum = 0; + Kokkos::Profiling::pushRegion(name.c_str()); + Kokkos::RangePolicy policy(0, _d.extent(0)); + Kokkos::parallel_reduce(name.c_str(), policy, *this, info_sum); + Kokkos::Profiling::popRegion(); + return info_sum; + } +}; + +template +struct Functor_BatchedSerialGemm { + using execution_space = typename DeviceType::execution_space; + AViewType _a; + BViewType _b; + CViewType _c; + ScalarType _alpha, _beta; + + KOKKOS_INLINE_FUNCTION + Functor_BatchedSerialGemm(const ScalarType alpha, const AViewType &a, + const BViewType &b, const ScalarType beta, + const CViewType &c) + : _a(a), _b(b), _c(c), _alpha(alpha), _beta(beta) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const int k) const { + auto aa = Kokkos::subview(_a, k, Kokkos::ALL(), Kokkos::ALL()); + auto bb = Kokkos::subview(_b, k, Kokkos::ALL(), Kokkos::ALL()); + auto cc = Kokkos::subview(_c, k, Kokkos::ALL(), Kokkos::ALL()); + + KokkosBatched::SerialGemm::invoke(_alpha, aa, bb, + _beta, cc); + } + + inline void run() { + using value_type = typename AViewType::non_const_value_type; + std::string name_region("KokkosBatched::Test::SerialPttrf"); + const std::string name_value_type = Test::value_type_name(); + std::string name = name_region + name_value_type; + Kokkos::RangePolicy policy(0, _a.extent(0)); + Kokkos::parallel_for(name.c_str(), policy, *this); + } +}; + +template +/// \brief Implementation details of batched pttrf test for random matrix +/// +/// \param N [in] Batch size of matrix A +/// \param BlkSize [in] Block size of matrix A +void impl_test_batched_pttrf(const int N, const int BlkSize) { + using ats = typename Kokkos::ArithTraits; + using RealType = typename ats::mag_type; + using RealView2DType = Kokkos::View; + using View2DType = Kokkos::View; + using View3DType = Kokkos::View; + + View3DType A("A", N, BlkSize, BlkSize), + A_reconst("A_reconst", N, BlkSize, BlkSize); + View3DType EL("EL", N, BlkSize, BlkSize), EU("EU", N, BlkSize, BlkSize), + D("D", N, BlkSize, BlkSize), LD("LD", N, BlkSize, BlkSize), + L("L", N, BlkSize, BlkSize), I("I", N, BlkSize, BlkSize); + RealView2DType d("d", N, BlkSize), // Diagonal components + ones(Kokkos::view_alloc("ones", Kokkos::WithoutInitializing), N, BlkSize); + View2DType e_upper("e_upper", N, BlkSize - 1), + e_lower("e_lower", N, + BlkSize - 1); // upper and lower diagonal components + + using execution_space = typename DeviceType::execution_space; + Kokkos::Random_XorShift64_Pool rand_pool(13718); + RealType realRandStart, realRandEnd; + ScalarType randStart, randEnd; + + KokkosKernels::Impl::getRandomBounds(1.0, realRandStart, realRandEnd); + KokkosKernels::Impl::getRandomBounds(1.0, randStart, randEnd); + + // Add BlkSize to ensure positive definiteness + Kokkos::fill_random(d, rand_pool, realRandStart + BlkSize, + realRandEnd + BlkSize); + Kokkos::fill_random(e_upper, rand_pool, randStart, randEnd); + + auto h_e_upper = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), e_upper); + auto h_e_lower = Kokkos::create_mirror_view(e_lower); + + for (int ib = 0; ib < N; ib++) { + for (int i = 0; i < BlkSize - 1; i++) { + // Fill the lower diagonal with conjugate of the upper diagonal + h_e_lower(ib, i) = + Kokkos::ArithTraits::conj(h_e_upper(ib, i)); + } + } + + Kokkos::deep_copy(e_lower, h_e_lower); + Kokkos::deep_copy(ones, RealType(1.0)); + + // Reconstruct Tridiagonal matrix A + // A = D + EL + EU + create_diagonal_matrix(e_lower, EL, -1); + create_diagonal_matrix(e_upper, EU, 1); + create_diagonal_matrix(d, D); + create_diagonal_matrix(ones, I); + + // Matrix matrix addition by Gemm + // D + EU by D * I + EU (result stored in EU) + Functor_BatchedSerialGemm(1.0, D, I, 1.0, EU) + .run(); + + // Copy EL to A + Kokkos::deep_copy(A, EL); + + // EU + EL by EU * I + A (result stored in A) + Functor_BatchedSerialGemm(1.0, EU, I, 1.0, A) + .run(); + + // Factorize matrix A -> L * D * L**H + // d and e are updated by pttrf + auto info = Functor_BatchedSerialPttrf(d, e_lower) + .run(); + + Kokkos::fence(); + +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) + EXPECT_EQ(info, 0); +#endif + + // Reconstruct L and D from factorized matrix + create_diagonal_matrix(e_lower, EL, -1); + create_diagonal_matrix(d, D); + + // Copy I to L + Kokkos::deep_copy(L, I); + + // EL + I by EL * I + L (result stored in L) + Functor_BatchedSerialGemm(1.0, EL, I, 1.0, L) + .run(); + + // Reconstruct A by L*D*L**H + // Gemm to compute L*D -> LD + Functor_BatchedSerialGemm(1.0, L, D, 0.0, LD) + .run(); + + // FIXME: We should use SerialGemm Trans::ConjTranspose. + // For the moment, we compute the complex conjugate of L and + // then use Trans::Transpose. + // Gemm to compute (L*D)*L**H -> A_reconst + // Functor_BatchedSerialGemm(1.0, LD, L, 0.0, + // A_reconst) + // .run(); + + // Compute the complex conjugate of L + // L -> conj(L) + auto h_L = Kokkos::create_mirror_view(L); + Kokkos::deep_copy(h_L, L); + for (int ib = 0; ib < N; ib++) { + for (int i = 0; i < BlkSize; i++) { + for (int j = 0; j < BlkSize; j++) { + h_L(ib, i, j) = Kokkos::ArithTraits::conj(h_L(ib, i, j)); + } + } + } + Kokkos::deep_copy(L, h_L); + + // Gemm to compute (L*D)*(conj(L))**T -> A_reconst + Functor_BatchedSerialGemm(1.0, LD, L, 0.0, + A_reconst) + .run(); + + Kokkos::fence(); + + // this eps is about 10^-14 + RealType eps = 1.0e3 * ats::epsilon(); + + auto h_A = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A); + auto h_A_reconst = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A_reconst); + + // Check A = L*D*L**H + for (int ib = 0; ib < N; ib++) { + for (int i = 0; i < BlkSize; i++) { + for (int j = 0; j < BlkSize; j++) { + EXPECT_NEAR_KK(h_A_reconst(ib, i, j), h_A(ib, i, j), eps); + } + } + } +} + +template +/// \brief Implementation details of batched pttrf test for early return +/// BlkSize must be 0 or 1 +/// +/// \param N [in] Batch size of matrix A +/// \param BlkSize [in] Block size of matrix A +void impl_test_batched_pttrf_quick_return(const int N, const int BlkSize) { + using ats = typename Kokkos::ArithTraits; + using RealType = typename ats::mag_type; + using RealView2DType = Kokkos::View; + using View2DType = Kokkos::View; + + if (BlkSize > 1) return; + + const int BlkSize_minus_1 = BlkSize > 0 ? BlkSize - 1 : 0; + + RealView2DType d("d", N, BlkSize), + d2("d2", N, BlkSize); // Diagonal components + View2DType e("e", N, + BlkSize_minus_1); // lower diagonal components + + const RealType reference_value = 4.0; + + Kokkos::deep_copy(d, reference_value); + Kokkos::deep_copy(d2, -reference_value); + Kokkos::deep_copy(e, ScalarType(1.0)); + + // Factorize matrix A -> L * D * L**H + // d and e are updated by pttrf + // Early return if BlkSize is 0 or 1 + auto info = Functor_BatchedSerialPttrf(d, e) + .run(); + + // For negative values, info should be 1 for BlkSize = 1 + auto info2 = Functor_BatchedSerialPttrf(d2, e) + .run(); + + Kokkos::fence(); + + int expected_info2 = BlkSize == 0 ? 0 : N; + EXPECT_EQ(info, 0); + EXPECT_EQ(info2, expected_info2); + + // this eps is about 10^-14 + RealType eps = 1.0e3 * ats::epsilon(); + + auto h_d = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), d); + auto h_d2 = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), d2); + + // Check if d is unchanged + for (int ib = 0; ib < N; ib++) { + for (int i = 0; i < BlkSize; i++) { + EXPECT_NEAR_KK(h_d(ib, i), reference_value, eps); + EXPECT_NEAR_KK(h_d2(ib, i), -reference_value, eps); + } + } +} + +template +/// \brief Implementation details of batched pttrf test +/// +/// \param N [in] Batch size of matrix A +/// \param BlkSize [in] Block size of matrix A +void impl_test_batched_pttrf_analytical(const int N, const int BlkSize) { + using ats = typename Kokkos::ArithTraits; + using RealType = typename ats::mag_type; + using RealView2DType = Kokkos::View; + using View2DType = Kokkos::View; + using View3DType = Kokkos::View; + + View3DType A("A", N, BlkSize, BlkSize), + A_reconst("A_reconst", N, BlkSize, BlkSize); + View3DType EL("EL", N, BlkSize, BlkSize), EU("EU", N, BlkSize, BlkSize), + D("D", N, BlkSize, BlkSize), LD("LD", N, BlkSize, BlkSize), + L("L", N, BlkSize, BlkSize), I("I", N, BlkSize, BlkSize); + RealView2DType d(Kokkos::view_alloc("d", Kokkos::WithoutInitializing), N, + BlkSize), // Diagonal components + ones(Kokkos::view_alloc("ones", Kokkos::WithoutInitializing), N, BlkSize); + View2DType e(Kokkos::view_alloc("e", Kokkos::WithoutInitializing), N, + BlkSize - 1); // Upper and lower diagonal components (identical) + + Kokkos::deep_copy(d, RealType(4.0)); + Kokkos::deep_copy(e, ScalarType(1.0)); + Kokkos::deep_copy(ones, RealType(1.0)); + + // Reconstruct Tridiaonal matrix A + // A = D + EL + EU + create_diagonal_matrix(e, EL, -1); + create_diagonal_matrix(e, EU, 1); + create_diagonal_matrix(d, D); + create_diagonal_matrix(ones, I); + + // Matrix matrix addition by Gemm + // D + EU by D * I + EU (result stored in EU) + Functor_BatchedSerialGemm(1.0, D, I, 1.0, EU) + .run(); + + // Copy EL to A + Kokkos::deep_copy(A, EL); + + // EU + EL by EU * I + A (result stored in A) + Functor_BatchedSerialGemm(1.0, EU, I, 1.0, A) + .run(); + + // Factorize matrix A -> L * D * L**T + // d and e are updated by pttrf + auto info = Functor_BatchedSerialPttrf(d, e) + .run(); + + Kokkos::fence(); + +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) + EXPECT_EQ(info, 0); +#endif + + // Reconstruct L and D from factorized matrix + create_diagonal_matrix(e, EL, -1); + create_diagonal_matrix(d, D); + + // Copy I to L + Kokkos::deep_copy(L, I); + + // EL + I by EL * I + L (result stored in L) + Functor_BatchedSerialGemm(1.0, EL, I, 1.0, L) + .run(); + + // Reconstruct A by L*D*L**T + // Gemm to compute L*D -> LD + Functor_BatchedSerialGemm(1.0, L, D, 0.0, LD) + .run(); + + // Gemm to compute (L*D)*L**T -> A_reconst + Functor_BatchedSerialGemm(1.0, LD, L, 0.0, + A_reconst) + .run(); + + Kokkos::fence(); + + // this eps is about 10^-14 + RealType eps = 1.0e3 * ats::epsilon(); + + auto h_A = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A); + auto h_A_reconst = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A_reconst); + + // Check A = L*D*L.T + for (int ib = 0; ib < N; ib++) { + for (int i = 0; i < BlkSize; i++) { + for (int j = 0; j < BlkSize; j++) { + EXPECT_NEAR_KK(h_A_reconst(ib, i, j), h_A(ib, i, j), eps); + } + } + } +} + +} // namespace Pttrf +} // namespace Test + +template +int test_batched_pttrf() { +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) + { + using LayoutType = Kokkos::LayoutLeft; + for (int i = 0; i < 2; i++) { + Test::Pttrf::impl_test_batched_pttrf_quick_return< + DeviceType, ScalarType, LayoutType, AlgoTagType>(1, i); + Test::Pttrf::impl_test_batched_pttrf_quick_return< + DeviceType, ScalarType, LayoutType, AlgoTagType>(2, i); + } + for (int i = 2; i < 10; i++) { + Test::Pttrf::impl_test_batched_pttrf(1, i); + Test::Pttrf::impl_test_batched_pttrf(2, i); + Test::Pttrf::impl_test_batched_pttrf_analytical( + 1, i); + Test::Pttrf::impl_test_batched_pttrf_analytical( + 2, i); + } + } +#endif +#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) + { + using LayoutType = Kokkos::LayoutRight; + for (int i = 0; i < 2; i++) { + Test::Pttrf::impl_test_batched_pttrf_quick_return< + DeviceType, ScalarType, LayoutType, AlgoTagType>(1, i); + Test::Pttrf::impl_test_batched_pttrf_quick_return< + DeviceType, ScalarType, LayoutType, AlgoTagType>(2, i); + } + for (int i = 2; i < 10; i++) { + Test::Pttrf::impl_test_batched_pttrf(1, i); + Test::Pttrf::impl_test_batched_pttrf(2, i); + Test::Pttrf::impl_test_batched_pttrf_analytical( + 1, i); + Test::Pttrf::impl_test_batched_pttrf_analytical( + 2, i); + } + } +#endif + + return 0; +} diff --git a/batched/dense/unit_test/Test_Batched_SerialPttrf_Complex.hpp b/batched/dense/unit_test/Test_Batched_SerialPttrf_Complex.hpp new file mode 100644 index 0000000000..febccc5cb3 --- /dev/null +++ b/batched/dense/unit_test/Test_Batched_SerialPttrf_Complex.hpp @@ -0,0 +1,31 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#if defined(KOKKOSKERNELS_INST_COMPLEX_FLOAT) +TEST_F(TestCategory, test_batched_pttrf_fcomplex) { + using algo_tag_type = typename Algo::Pttrf::Unblocked; + + test_batched_pttrf(); +} +#endif + +#if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) +TEST_F(TestCategory, test_batched_pttrf_dcomplex) { + using algo_tag_type = typename Algo::Pttrf::Unblocked; + + test_batched_pttrf(); +} +#endif diff --git a/batched/dense/unit_test/Test_Batched_SerialPttrf_Real.hpp b/batched/dense/unit_test/Test_Batched_SerialPttrf_Real.hpp new file mode 100644 index 0000000000..8b0fb658fe --- /dev/null +++ b/batched/dense/unit_test/Test_Batched_SerialPttrf_Real.hpp @@ -0,0 +1,31 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#if defined(KOKKOSKERNELS_INST_FLOAT) +TEST_F(TestCategory, test_batched_pttrf_float) { + using algo_tag_type = typename Algo::Pttrf::Unblocked; + + test_batched_pttrf(); +} +#endif + +#if defined(KOKKOSKERNELS_INST_DOUBLE) +TEST_F(TestCategory, test_batched_pttrf_double) { + using algo_tag_type = typename Algo::Pttrf::Unblocked; + + test_batched_pttrf(); +} +#endif diff --git a/batched/dense/unit_test/Test_Batched_SerialTbsv.hpp b/batched/dense/unit_test/Test_Batched_SerialTbsv.hpp new file mode 100644 index 0000000000..572e02053b --- /dev/null +++ b/batched/dense/unit_test/Test_Batched_SerialTbsv.hpp @@ -0,0 +1,349 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +/// \author Yuuichi Asahi (yuuichi.asahi@cea.fr) +#include +#include +#include + +#include "KokkosBatched_Util.hpp" +#include "KokkosBatched_Tbsv.hpp" +#include "Test_Batched_DenseUtils.hpp" + +using namespace KokkosBatched; + +namespace Test { +namespace Tbsv { + +template +struct ParamTag { + using uplo = U; + using trans = T; + using diag = D; +}; + +template +struct Functor_BatchedSerialTrsv { + using execution_space = typename DeviceType::execution_space; + AViewType _a; + BViewType _b; + + ScalarType _alpha; + + KOKKOS_INLINE_FUNCTION + Functor_BatchedSerialTrsv(const ScalarType alpha, const AViewType &a, + const BViewType &b) + : _a(a), _b(b), _alpha(alpha) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const ParamTagType &, const int k) const { + auto aa = Kokkos::subview(_a, k, Kokkos::ALL(), Kokkos::ALL()); + auto bb = Kokkos::subview(_b, k, Kokkos::ALL()); + + KokkosBatched::SerialTrsv< + typename ParamTagType::uplo, typename ParamTagType::trans, + typename ParamTagType::diag, AlgoTagType>::invoke(_alpha, aa, bb); + } + + inline void run() { + using value_type = typename AViewType::non_const_value_type; + std::string name_region("KokkosBatched::Test::SerialTbsv"); + const std::string name_value_type = Test::value_type_name(); + std::string name = name_region + name_value_type; + Kokkos::RangePolicy policy(0, _b.extent(0)); + Kokkos::parallel_for(name.c_str(), policy, *this); + } +}; + +template +struct Functor_BatchedSerialTbsv { + using execution_space = typename DeviceType::execution_space; + AViewType _a; + BViewType _b; + int _k; + + KOKKOS_INLINE_FUNCTION + Functor_BatchedSerialTbsv(const AViewType &a, const BViewType &b, const int k) + : _a(a), _b(b), _k(k) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const ParamTagType &, const int k) const { + auto aa = Kokkos::subview(_a, k, Kokkos::ALL(), Kokkos::ALL()); + auto bb = Kokkos::subview(_b, k, Kokkos::ALL()); + + KokkosBatched::SerialTbsv< + typename ParamTagType::uplo, typename ParamTagType::trans, + typename ParamTagType::diag, AlgoTagType>::invoke(aa, bb, _k); + } + + inline void run() { + using value_type = typename AViewType::non_const_value_type; + std::string name_region("KokkosBatched::Test::SerialTbsv"); + const std::string name_value_type = Test::value_type_name(); + std::string name = name_region + name_value_type; + Kokkos::Profiling::pushRegion(name.c_str()); + Kokkos::RangePolicy policy(0, _b.extent(0)); + Kokkos::parallel_for(name.c_str(), policy, *this); + Kokkos::Profiling::popRegion(); + } +}; + +template +/// \brief Implementation details of batched tbsv test +/// +/// \param N [in] Batch size of RHS (banded matrix can also be batched matrix) +/// \param k [in] Number of superdiagonals or subdiagonals of matrix A +/// \param BlkSize [in] Block size of matrix A +void impl_test_batched_tbsv(const int N, const int k, const int BlkSize) { + using execution_space = typename DeviceType::execution_space; + using View2DType = Kokkos::View; + using View3DType = Kokkos::View; + + // Reference is created by trsv from triangular matrix + View3DType A("A", N, BlkSize, BlkSize), Ref("Ref", N, BlkSize, BlkSize); + View3DType Ab("Ab", N, k + 1, BlkSize); // Banded matrix + View2DType x0("x0", N, BlkSize), x1("x1", N, BlkSize); // Solutions + + Kokkos::Random_XorShift64_Pool rand_pool(13718); + ScalarType randStart, randEnd; + Test::getRandomBounds(1.0, randStart, randEnd); + Kokkos::fill_random(Ref, rand_pool, randStart, randEnd); + Kokkos::fill_random(x0, rand_pool, randStart, randEnd); + + Kokkos::deep_copy(x1, x0); + + // Create triangluar or banded matrix + create_banded_triangular_matrix(Ref, A, k, + false); + create_banded_triangular_matrix(Ref, Ab, k, + true); + + // Reference trsv + Functor_BatchedSerialTrsv(1.0, A, x0) + .run(); + + // tbsv + Functor_BatchedSerialTbsv(Ab, x1, k) + .run(); + + Kokkos::fence(); + + // this eps is about 10^-14 + using ats = typename Kokkos::ArithTraits; + using mag_type = typename ats::mag_type; + mag_type eps = 1.0e3 * ats::epsilon(); + + // Check x0 = x1 + auto h_x0 = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), x0); + auto h_x1 = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), x1); + for (int i = 0; i < N; i++) { + for (int j = 0; j < BlkSize; j++) { + EXPECT_NEAR_KK(h_x0(i, j), h_x1(i, j), eps); + } + } +} + +template +/// \brief Implementation details of batched tbsv test +/// +/// \param N [in] Batch size of RHS (banded matrix can also be batched matrix) +void impl_test_batched_tbsv_analytical(const std::size_t N) { + using execution_space = typename DeviceType::execution_space; + using View2DType = Kokkos::View; + using StridedView2DType = + Kokkos::View; + using View3DType = Kokkos::View; + + // Reference is created by trsv from triangular matrix + constexpr std::size_t BlkSize = 3, k = 2, incx = 2; + + View3DType A("A", N, BlkSize, BlkSize), ref("Ref", N, BlkSize, BlkSize); + View3DType Ab("Ab", N, k + 1, BlkSize); // Banded matrix + View2DType x0("x0", N, BlkSize), x_ref("x_ref", N, BlkSize); // Solutions + + // Testing incx argument with strided Views + Kokkos::LayoutStride layout{N, incx, BlkSize, N * incx}; + StridedView2DType x1("x1", layout); // Solutions + + Kokkos::RangePolicy policy(0, N); + Kokkos::parallel_for( + "KokkosBatched::Test::SerialTbsv::Initialize", policy, + KOKKOS_LAMBDA(const std::size_t ib) { + for (std::size_t i = 0; i < BlkSize; i++) { + for (std::size_t j = 0; j < BlkSize; j++) { + ref(ib, i, j) = i + 1; + } + } + for (std::size_t j = 0; j < BlkSize; j++) { + x0(ib, j) = 1; + x1(ib, j) = 1; + } + + if (std::is_same_v) { + if (std::is_same_v) { + if (std::is_same_v) { + x_ref(ib, 0) = 1.0 / 2.0; + x_ref(ib, 1) = 1.0 / 6.0; + x_ref(ib, 2) = 1.0 / 3.0; + } else { + x_ref(ib, 0) = 1.0; + x_ref(ib, 1) = -1.0; + x_ref(ib, 2) = 1.0; + } + } else { + if (std::is_same_v) { + x_ref(ib, 0) = 1.0; + x_ref(ib, 1) = 0.0; + x_ref(ib, 2) = 0.0; + } else { + x_ref(ib, 0) = 1.0; + x_ref(ib, 1) = 0.0; + x_ref(ib, 2) = 0.0; + } + } + } else { + if (std::is_same_v) { + if (std::is_same_v) { + x_ref(ib, 0) = 1.0; + x_ref(ib, 1) = -1.0 / 2.0; + x_ref(ib, 2) = -1.0 / 6.0; + } else { + x_ref(ib, 0) = 1.0; + x_ref(ib, 1) = -1.0; + x_ref(ib, 2) = 1.0; + } + } else { + if (std::is_same_v) { + x_ref(ib, 0) = 0.0; + x_ref(ib, 1) = 0.0; + x_ref(ib, 2) = 1.0 / 3.0; + } else { + x_ref(ib, 0) = 2.0; + x_ref(ib, 1) = -2.0; + x_ref(ib, 2) = 1.0; + } + } + } + }); + + Kokkos::fence(); + + // Create triangluar or banded matrix + create_banded_triangular_matrix(ref, A, k, + false); + create_banded_triangular_matrix(ref, Ab, k, + true); + + // tbsv + Functor_BatchedSerialTbsv(Ab, x0, k) + .run(); + + // tbsv with incx == 2 + Functor_BatchedSerialTbsv(Ab, x1, k) + .run(); + + Kokkos::fence(); + + // Check x0 = x_ref and x1 = x_ref + // Firstly, prepare contiguous views on host + auto h_x0 = Kokkos::create_mirror_view(x0); + auto h_x1 = Kokkos::create_mirror_view(x0); + + Kokkos::deep_copy(h_x0, x0); + + // Pack x1 into x0 for contiguous storage + Kokkos::parallel_for( + "KokkosBatched::Test::SerialTbsv::Copy", policy, + KOKKOS_LAMBDA(const std::size_t ib) { + for (std::size_t j = 0; j < BlkSize; j++) { + x0(ib, j) = x1(ib, j); + } + }); + + Kokkos::fence(); + Kokkos::deep_copy(h_x1, x0); + + // this eps is about 10^-14 + using ats = typename Kokkos::ArithTraits; + using mag_type = typename ats::mag_type; + mag_type eps = 1.0e3 * ats::epsilon(); + + auto h_x_ref = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), x_ref); + for (std::size_t ib = 0; ib < N; ib++) { + for (std::size_t j = 0; j < BlkSize; j++) { + // Check x0 = x_ref + EXPECT_NEAR_KK(h_x0(ib, j), h_x_ref(ib, j), eps); + + // Check x1 = x_ref + EXPECT_NEAR_KK(h_x1(ib, j), h_x_ref(ib, j), eps); + } + } +} + +} // namespace Tbsv +} // namespace Test + +template +int test_batched_tbsv() { +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) + { + using LayoutType = Kokkos::LayoutLeft; + Test::Tbsv::impl_test_batched_tbsv_analytical< + DeviceType, ScalarType, LayoutType, ParamTagType, AlgoTagType>(0); + Test::Tbsv::impl_test_batched_tbsv_analytical< + DeviceType, ScalarType, LayoutType, ParamTagType, AlgoTagType>(1); + Test::Tbsv::impl_test_batched_tbsv(0, 1, 10); + for (int i = 0; i < 10; i++) { + Test::Tbsv::impl_test_batched_tbsv(1, 1, i); + } + } +#endif +#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) + { + using LayoutType = Kokkos::LayoutRight; + Test::Tbsv::impl_test_batched_tbsv_analytical< + DeviceType, ScalarType, LayoutType, ParamTagType, AlgoTagType>(0); + Test::Tbsv::impl_test_batched_tbsv_analytical< + DeviceType, ScalarType, LayoutType, ParamTagType, AlgoTagType>(1); + Test::Tbsv::impl_test_batched_tbsv(0, 1, 10); + for (int i = 0; i < 10; i++) { + Test::Tbsv::impl_test_batched_tbsv(1, 1, i); + } + } +#endif + + return 0; +} diff --git a/batched/dense/unit_test/Test_Batched_SerialTbsv_Complex.hpp b/batched/dense/unit_test/Test_Batched_SerialTbsv_Complex.hpp new file mode 100644 index 0000000000..8789cc6931 --- /dev/null +++ b/batched/dense/unit_test/Test_Batched_SerialTbsv_Complex.hpp @@ -0,0 +1,120 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) +// NO TRANSPOSE +TEST_F(TestCategory, batched_serial_tbsv_l_nt_u_dcomplex) { + using param_tag_type = + ::Test::Tbsv::ParamTag; + using algo_tag_type = typename Algo::Tbsv::Unblocked; + + test_batched_tbsv, param_tag_type, + algo_tag_type>(); +} +TEST_F(TestCategory, batched_serial_tbsv_l_nt_n_dcomplex) { + using param_tag_type = + ::Test::Tbsv::ParamTag; + using algo_tag_type = typename Algo::Tbsv::Unblocked; + + test_batched_tbsv, param_tag_type, + algo_tag_type>(); +} +TEST_F(TestCategory, batched_serial_tbsv_u_nt_u_dcomplex) { + using param_tag_type = + ::Test::Tbsv::ParamTag; + using algo_tag_type = typename Algo::Tbsv::Unblocked; + + test_batched_tbsv, param_tag_type, + algo_tag_type>(); +} +TEST_F(TestCategory, batched_serial_tbsv_u_nt_n_dcomplex) { + using param_tag_type = + ::Test::Tbsv::ParamTag; + using algo_tag_type = typename Algo::Tbsv::Unblocked; + + test_batched_tbsv, param_tag_type, + algo_tag_type>(); +} +// TRANSPOSE +TEST_F(TestCategory, batched_serial_tbsv_l_t_u_dcomplex) { + using param_tag_type = + ::Test::Tbsv::ParamTag; + using algo_tag_type = typename Algo::Tbsv::Unblocked; + + test_batched_tbsv, param_tag_type, + algo_tag_type>(); +} +TEST_F(TestCategory, batched_serial_tbsv_l_t_n_dcomplex) { + using param_tag_type = + ::Test::Tbsv::ParamTag; + using algo_tag_type = typename Algo::Tbsv::Unblocked; + + test_batched_tbsv, param_tag_type, + algo_tag_type>(); +} +TEST_F(TestCategory, batched_serial_tbsv_u_t_u_dcomplex) { + using param_tag_type = + ::Test::Tbsv::ParamTag; + using algo_tag_type = typename Algo::Tbsv::Unblocked; + + test_batched_tbsv, param_tag_type, + algo_tag_type>(); +} +TEST_F(TestCategory, batched_serial_tbsv_u_t_n_dcomplex) { + using param_tag_type = + ::Test::Tbsv::ParamTag; + using algo_tag_type = typename Algo::Tbsv::Unblocked; + + test_batched_tbsv, param_tag_type, + algo_tag_type>(); +} + +/* [FIXME] These tests need Trans::ConjTranspose in trsv. +// CONJUGATE TRANSPOSE +TEST_F(TestCategory, batched_serial_tbsv_l_ct_u_dcomplex) { + using param_tag_type = + ::Test::Tbsv::ParamTag; + using algo_tag_type = typename Algo::Tbsv::Unblocked; + + test_batched_tbsv, param_tag_type, + algo_tag_type>(); +} +TEST_F(TestCategory, batched_serial_tbsv_l_ct_n_dcomplex) { + using param_tag_type = + ::Test::Tbsv::ParamTag; + using algo_tag_type = typename Algo::Tbsv::Unblocked; + + test_batched_tbsv, param_tag_type, + algo_tag_type>(); +} +TEST_F(TestCategory, batched_serial_tbsv_u_ct_u_dcomplex) { + using param_tag_type = + ::Test::Tbsv::ParamTag; + using algo_tag_type = typename Algo::Tbsv::Unblocked; + + test_batched_tbsv, param_tag_type, + algo_tag_type>(); +} +TEST_F(TestCategory, batched_serial_tbsv_u_ct_n_dcomplex) { + using param_tag_type = + ::Test::Tbsv::ParamTag; + using algo_tag_type = typename Algo::Tbsv::Unblocked; + + test_batched_tbsv, param_tag_type, + algo_tag_type>(); +} +*/ +#endif diff --git a/batched/dense/unit_test/Test_Batched_SerialTbsv_Real.hpp b/batched/dense/unit_test/Test_Batched_SerialTbsv_Real.hpp new file mode 100644 index 0000000000..8915b4ad05 --- /dev/null +++ b/batched/dense/unit_test/Test_Batched_SerialTbsv_Real.hpp @@ -0,0 +1,137 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#if defined(KOKKOSKERNELS_INST_FLOAT) +// NO TRANSPOSE +TEST_F(TestCategory, batched_serial_tbsv_l_nt_u_float) { + using param_tag_type = + ::Test::Tbsv::ParamTag; + using algo_tag_type = typename Algo::Tbsv::Unblocked; + + test_batched_tbsv(); +} +TEST_F(TestCategory, batched_serial_tbsv_l_nt_n_float) { + using param_tag_type = + ::Test::Tbsv::ParamTag; + using algo_tag_type = typename Algo::Tbsv::Unblocked; + + test_batched_tbsv(); +} +TEST_F(TestCategory, batched_serial_tbsv_u_nt_u_float) { + using param_tag_type = + ::Test::Tbsv::ParamTag; + using algo_tag_type = typename Algo::Tbsv::Unblocked; + + test_batched_tbsv(); +} +TEST_F(TestCategory, batched_serial_tbsv_u_nt_n_float) { + using param_tag_type = + ::Test::Tbsv::ParamTag; + using algo_tag_type = typename Algo::Tbsv::Unblocked; + + test_batched_tbsv(); +} +// TRANSPOSE +TEST_F(TestCategory, batched_serial_tbsv_l_t_u_float) { + using param_tag_type = + ::Test::Tbsv::ParamTag; + using algo_tag_type = typename Algo::Tbsv::Unblocked; + + test_batched_tbsv(); +} +TEST_F(TestCategory, batched_serial_tbsv_l_t_n_float) { + using param_tag_type = + ::Test::Tbsv::ParamTag; + using algo_tag_type = typename Algo::Tbsv::Unblocked; + + test_batched_tbsv(); +} +TEST_F(TestCategory, batched_serial_tbsv_u_t_u_float) { + using param_tag_type = + ::Test::Tbsv::ParamTag; + using algo_tag_type = typename Algo::Tbsv::Unblocked; + + test_batched_tbsv(); +} +TEST_F(TestCategory, batched_serial_tbsv_u_t_n_float) { + using param_tag_type = + ::Test::Tbsv::ParamTag; + using algo_tag_type = typename Algo::Tbsv::Unblocked; + + test_batched_tbsv(); +} +#endif + +#if defined(KOKKOSKERNELS_INST_DOUBLE) +// NO TRANSPOSE +TEST_F(TestCategory, batched_serial_tbsv_l_nt_u_double) { + using param_tag_type = + ::Test::Tbsv::ParamTag; + using algo_tag_type = typename Algo::Tbsv::Unblocked; + + test_batched_tbsv(); +} +TEST_F(TestCategory, batched_serial_tbsv_l_nt_n_double) { + using param_tag_type = + ::Test::Tbsv::ParamTag; + using algo_tag_type = typename Algo::Tbsv::Unblocked; + + test_batched_tbsv(); +} +TEST_F(TestCategory, batched_serial_tbsv_u_nt_u_double) { + using param_tag_type = + ::Test::Tbsv::ParamTag; + using algo_tag_type = typename Algo::Tbsv::Unblocked; + + test_batched_tbsv(); +} +TEST_F(TestCategory, batched_serial_tbsv_u_nt_n_double) { + using param_tag_type = + ::Test::Tbsv::ParamTag; + using algo_tag_type = typename Algo::Tbsv::Unblocked; + + test_batched_tbsv(); +} +// TRANSPOSE +TEST_F(TestCategory, batched_serial_tbsv_l_t_u_double) { + using param_tag_type = + ::Test::Tbsv::ParamTag; + using algo_tag_type = typename Algo::Tbsv::Unblocked; + + test_batched_tbsv(); +} +TEST_F(TestCategory, batched_serial_tbsv_l_t_n_double) { + using param_tag_type = + ::Test::Tbsv::ParamTag; + using algo_tag_type = typename Algo::Tbsv::Unblocked; + + test_batched_tbsv(); +} +TEST_F(TestCategory, batched_serial_tbsv_u_t_u_double) { + using param_tag_type = + ::Test::Tbsv::ParamTag; + using algo_tag_type = typename Algo::Tbsv::Unblocked; + + test_batched_tbsv(); +} +TEST_F(TestCategory, batched_serial_tbsv_u_t_n_double) { + using param_tag_type = + ::Test::Tbsv::ParamTag; + using algo_tag_type = typename Algo::Tbsv::Unblocked; + + test_batched_tbsv(); +} +#endif diff --git a/blas/impl/KokkosBlas_util.hpp b/blas/impl/KokkosBlas_util.hpp index 50173538fb..1fc6b7d480 100644 --- a/blas/impl/KokkosBlas_util.hpp +++ b/blas/impl/KokkosBlas_util.hpp @@ -85,6 +85,7 @@ struct Algo { using SolveLU = Level3; using QR = Level3; using UTV = Level3; + using Pttrf = Level3; struct Level2 { struct Unblocked {}; @@ -116,6 +117,7 @@ struct Algo { using Gemv = Level2; using Trsv = Level2; using ApplyQ = Level2; + using Tbsv = Level2; }; namespace Impl { diff --git a/blas/src/KokkosBlas2_gemv.hpp b/blas/src/KokkosBlas2_gemv.hpp index 614b48d47a..88ffc63810 100644 --- a/blas/src/KokkosBlas2_gemv.hpp +++ b/blas/src/KokkosBlas2_gemv.hpp @@ -163,6 +163,7 @@ void gemv(const ExecutionSpace& space, const char trans[], #ifdef KOKKOSKERNELS_ENABLE_TPL_MKL #ifdef KOKKOS_ENABLE_SYCL // oneMKL supports both row-major and column-major of A + // but only supports oneapi::mkl::transpose::nontrans op useFallback = useFallback || !std::is_same_v; diff --git a/blas/tpls/KokkosBlas2_gemv_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas2_gemv_tpl_spec_decl.hpp index 304dd349bf..07d9476b66 100644 --- a/blas/tpls/KokkosBlas2_gemv_tpl_spec_decl.hpp +++ b/blas/tpls/KokkosBlas2_gemv_tpl_spec_decl.hpp @@ -824,6 +824,10 @@ struct kokkos_to_std_type_map { const AViewType& A, const XViewType& X, \ typename YViewType::const_value_type& beta, \ const YViewType& Y) { \ + if (beta == Kokkos::ArithTraits::zero()) { \ + Kokkos::deep_copy(Y, Kokkos::ArithTraits::zero()); \ + } \ + \ bool row_major = std::is_same::value; \ const std::int64_t M = A.extent(0); \ const std::int64_t N = A.extent(1); \ diff --git a/cmake/Modules/FindTPLROCBLAS.cmake b/cmake/Modules/FindTPLROCBLAS.cmake index c0a9de3b50..4edcd82944 100644 --- a/cmake/Modules/FindTPLROCBLAS.cmake +++ b/cmake/Modules/FindTPLROCBLAS.cmake @@ -1,13 +1,47 @@ -# MPL: 12/29/2022: CMake regular way to find a package -FIND_PACKAGE(ROCBLAS) -if(TARGET roc::rocblas) -## MPL: 12/29/2022: Variable TPL_ROCBLAS_IMPORTED_NAME follows the requested convention -## of KokkosKernel (method kokkoskernels_import_tpl of kokkoskernels_tpls.cmake) - SET(TPL_ROCBLAS_IMPORTED_NAME roc::rocblas) - SET(TPL_IMPORTED_NAME roc::rocblas) -## MPL: 12/29/2022: A target comming from a TPL must follows the requested convention -## of KokkosKernel (method kokkoskernels_link_tpl of kokkoskernels_tpls.cmake) - ADD_LIBRARY(KokkosKernels::ROCBLAS ALIAS roc::rocblas) -ELSE() - MESSAGE(FATAL_ERROR "Package ROCBLAS requested but not found") +IF(ROCBLAS_LIBRARIES AND ROCBLAS_LIBRARY_DIRS AND ROCBLAS_INCLUDE_DIRS) + kokkoskernels_find_imported(ROCBLAS INTERFACE + LIBRARIES ${ROCBLAS_LIBRARIES} + LIBRARY_PATHS ${ROCBLAS_LIBRARY_DIRS} + HEADER_PATHS ${ROCBLAS_INCLUDE_DIRS} + ) +ELSEIF(ROCBLAS_LIBRARIES AND ROCBLAS_LIBRARY_DIRS) + kokkoskernels_find_imported(ROCBLAS INTERFACE + LIBRARIES ${ROCBLAS_LIBRARIES} + LIBRARY_PATHS ${ROCBLAS_LIBRARY_DIRS} + HEADER rocblas.h + ) +ELSEIF(ROCBLAS_LIBRARIES) + kokkoskernels_find_imported(ROCBLAS INTERFACE + LIBRARIES ${ROCBLAS_LIBRARIES} + HEADER rocblas.h + ) +ELSEIF(ROCBLAS_LIBRARY_DIRS) + kokkoskernels_find_imported(ROCBLAS INTERFACE + LIBRARIES rocblas + LIBRARY_PATHS ${ROCBLAS_LIBRARY_DIRS} + HEADER rocblas.h + ) +ELSEIF(ROCBLAS_ROOT OR KokkosKernels_ROCBLAS_ROOT) # nothing specific provided, just ROOT + kokkoskernels_find_imported(ROCBLAS INTERFACE + LIBRARIES rocblas + HEADER rocblas.h + ) +ELSE() # backwards-compatible way + FIND_PACKAGE(ROCBLAS) + INCLUDE(FindPackageHandleStandardArgs) + IF (NOT ROCBLAS_FOUND) + #Important note here: this find Module is named TPLROCBLAS + #The eventual target is named ROCBLAS. To avoid naming conflicts + #the find module is called TPLROCBLAS. This call will cause + #the find_package call to fail in a "standard" CMake way + FIND_PACKAGE_HANDLE_STANDARD_ARGS(TPLROCBLAS REQUIRED_VARS ROCBLAS_FOUND) + ELSE() + #The libraries might be empty - OR they might explicitly be not found + IF("${ROCBLAS_LIBRARIES}" MATCHES "NOTFOUND") + FIND_PACKAGE_HANDLE_STANDARD_ARGS(TPLROCBLAS REQUIRED_VARS ROCBLAS_LIBRARIES) + ELSE() + KOKKOSKERNELS_CREATE_IMPORTED_TPL(ROCBLAS INTERFACE + LINK_LIBRARIES "${ROCBLAS_LIBRARIES}") + ENDIF() + ENDIF() ENDIF() diff --git a/cmake/Modules/FindTPLROCSOLVER.cmake b/cmake/Modules/FindTPLROCSOLVER.cmake index 8f2a92cfda..58eae9f8f5 100644 --- a/cmake/Modules/FindTPLROCSOLVER.cmake +++ b/cmake/Modules/FindTPLROCSOLVER.cmake @@ -1,9 +1,48 @@ -# LBV: 11/08/2023: This file follows the partern of FindTPLROCBLAS.cmake/FindTPLROCSPARSE.cmake -FIND_PACKAGE(ROCSOLVER) -if(TARGET roc::rocsolver) - SET(TPL_ROCSOLVER_IMPORTED_NAME roc::rocsolver) - SET(TPL_IMPORTED_NAME roc::rocsolver) - ADD_LIBRARY(KokkosKernels::ROCSOLVER ALIAS roc::rocsolver) -ELSE() - MESSAGE(FATAL_ERROR "Package ROCSOLVER requested but not found") +IF(ROCSOLVER_LIBRARIES AND ROCSOLVER_LIBRARY_DIRS AND ROCSOLVER_INCLUDE_DIRS) + kokkoskernels_find_imported(ROCSOLVER INTERFACE + LIBRARIES ${ROCSOLVER_LIBRARIES} + LIBRARY_PATHS ${ROCSOLVER_LIBRARY_DIRS} + HEADER_PATHS ${ROCSOLVER_INCLUDE_DIRS} + ) +ELSEIF(ROCSOLVER_LIBRARIES AND ROCSOLVER_LIBRARY_DIRS) + kokkoskernels_find_imported(ROCSOLVER INTERFACE + LIBRARIES ${ROCSOLVER_LIBRARIES} + LIBRARY_PATHS ${ROCSOLVER_LIBRARY_DIRS} + HEADER rocsolver.h + ) +ELSEIF(ROCSOLVER_LIBRARIES) + kokkoskernels_find_imported(ROCSOLVER INTERFACE + LIBRARIES ${ROCSOLVER_LIBRARIES} + HEADER rocsolver.h + ) +ELSEIF(ROCSOLVER_LIBRARY_DIRS) + kokkoskernels_find_imported(ROCSOLVER INTERFACE + LIBRARIES rocsolver + LIBRARY_PATHS ${ROCSOLVER_LIBRARY_DIRS} + HEADER rocsolver.h + ) +ELSEIF(ROCSOLVER_ROOT OR KokkosKernels_ROCSOLVER_ROOT) # nothing specific provided, just ROOT + kokkoskernels_find_imported(ROCSOLVER INTERFACE + LIBRARIES rocsolver + HEADER rocsolver.h + ) +ELSE() # backwards-compatible way + FIND_PACKAGE(ROCSOLVER) + INCLUDE(FindPackageHandleStandardArgs) + IF (NOT ROCSOLVER_FOUND) + #Important note here: this find Module is named TPLROCSOLVER + #The eventual target is named ROCSOLVER. To avoid naming conflicts + #the find module is called TPLROCSOLVER. This call will cause + #the find_package call to fail in a "standard" CMake way + FIND_PACKAGE_HANDLE_STANDARD_ARGS(TPLROCSOLVER REQUIRED_VARS ROCSOLVER_FOUND) + ELSE() + #The libraries might be empty - OR they might explicitly be not found + IF("${ROCSOLVER_LIBRARIES}" MATCHES "NOTFOUND") + FIND_PACKAGE_HANDLE_STANDARD_ARGS(TPLROCSOLVER REQUIRED_VARS ROCSOLVER_LIBRARIES) + ELSE() + KOKKOSKERNELS_CREATE_IMPORTED_TPL(ROCSOLVER INTERFACE + LINK_LIBRARIES "${ROCSOLVER_LIBRARIES}") + ENDIF() + ENDIF() ENDIF() + diff --git a/cmake/Modules/FindTPLROCSPARSE.cmake b/cmake/Modules/FindTPLROCSPARSE.cmake index 5f985ff3a8..3b45ba5e82 100644 --- a/cmake/Modules/FindTPLROCSPARSE.cmake +++ b/cmake/Modules/FindTPLROCSPARSE.cmake @@ -1,9 +1,47 @@ -# MPL: 05/01/2023: This file follows the partern of FindTPLROCBLAS.cmake -FIND_PACKAGE(ROCSPARSE) -if(TARGET roc::rocsparse) - SET(TPL_ROCSPARSE_IMPORTED_NAME roc::rocsparse) - SET(TPL_IMPORTED_NAME roc::rocsparse) - ADD_LIBRARY(KokkosKernels::ROCSPARSE ALIAS roc::rocsparse) -ELSE() - MESSAGE(FATAL_ERROR "Package ROCSPARSE requested but not found") +IF(ROCSPARSE_LIBRARIES AND ROCSPARSE_LIBRARY_DIRS AND ROCSPARSE_INCLUDE_DIRS) + kokkoskernels_find_imported(ROCSPARSE INTERFACE + LIBRARIES ${ROCSPARSE_LIBRARIES} + LIBRARY_PATHS ${ROCSPARSE_LIBRARY_DIRS} + HEADER_PATHS ${ROCSPARSE_INCLUDE_DIRS} + ) +ELSEIF(ROCSPARSE_LIBRARIES AND ROCSPARSE_LIBRARY_DIRS) + kokkoskernels_find_imported(ROCSPARSE INTERFACE + LIBRARIES ${ROCSPARSE_LIBRARIES} + LIBRARY_PATHS ${ROCSPARSE_LIBRARY_DIRS} + HEADER rocsparse.h + ) +ELSEIF(ROCSPARSE_LIBRARIES) + kokkoskernels_find_imported(ROCSPARSE INTERFACE + LIBRARIES ${ROCSPARSE_LIBRARIES} + HEADER rocsparse.h + ) +ELSEIF(ROCSPARSE_LIBRARY_DIRS) + kokkoskernels_find_imported(ROCSPARSE INTERFACE + LIBRARIES rocsparse + LIBRARY_PATHS ${ROCSPARSE_LIBRARY_DIRS} + HEADER rocsparse.h + ) +ELSEIF(ROCSPARSE_ROOT OR KokkosKernels_ROCSPARSE_ROOT) # nothing specific provided, just ROOT + kokkoskernels_find_imported(ROCSPARSE INTERFACE + LIBRARIES rocsparse + HEADER rocsparse.h + ) +ELSE() # backwards-compatible way + FIND_PACKAGE(ROCSPARSE) + INCLUDE(FindPackageHandleStandardArgs) + IF (NOT ROCSPARSE_FOUND) + #Important note here: this find Module is named TPLROCSPARSE + #The eventual target is named ROCSPARSE. To avoid naming conflicts + #the find module is called TPLROCSPARSE. This call will cause + #the find_package call to fail in a "standard" CMake way + FIND_PACKAGE_HANDLE_STANDARD_ARGS(TPLROCSPARSE REQUIRED_VARS ROCSPARSE_FOUND) + ELSE() + #The libraries might be empty - OR they might explicitly be not found + IF("${ROCSPARSE_LIBRARIES}" MATCHES "NOTFOUND") + FIND_PACKAGE_HANDLE_STANDARD_ARGS(TPLROCSPARSE REQUIRED_VARS ROCSPARSE_LIBRARIES) + ELSE() + KOKKOSKERNELS_CREATE_IMPORTED_TPL(ROCSPARSE INTERFACE + LINK_LIBRARIES "${ROCSPARSE_LIBRARIES}") + ENDIF() + ENDIF() ENDIF() diff --git a/cmake/kokkoskernels_tpls.cmake b/cmake/kokkoskernels_tpls.cmake index 6af952ce94..49d1adcdcb 100644 --- a/cmake/kokkoskernels_tpls.cmake +++ b/cmake/kokkoskernels_tpls.cmake @@ -330,6 +330,9 @@ MACRO(kokkoskernels_export_imported_tpl NAME) GET_TARGET_PROPERTY(TPL_INCLUDES ${TPL_IMPORTED_NAME} INTERFACE_INCLUDE_DIRECTORIES) IF(TPL_INCLUDES) + # remove duplicates to prevent incorrect number of arguments to INTERFACE_INCLUDE_DIRECTORIES + # see issue #2238 + LIST(REMOVE_DUPLICATES TPL_INCLUDES) KOKKOSKERNELS_APPEND_CONFIG_LINE("INTERFACE_INCLUDE_DIRECTORIES ${TPL_INCLUDES}") ENDIF() diff --git a/common/src/KokkosKernels_BlockUtils.hpp b/common/src/KokkosKernels_BlockUtils.hpp index 006a38a6e4..6fd9d9b656 100644 --- a/common/src/KokkosKernels_BlockUtils.hpp +++ b/common/src/KokkosKernels_BlockUtils.hpp @@ -39,7 +39,7 @@ template KOKKOS_INLINE_FUNCTION void kk_block_set(const size_type block_dim, value_type *dst, const value_type *val) { - memcpy(dst, val, block_dim * block_dim * sizeof(value_type)); + memcpy((void *)dst, val, block_dim * block_dim * sizeof(value_type)); } // Performs A += B on blocks diff --git a/common/src/KokkosKernels_SimpleUtils.hpp b/common/src/KokkosKernels_SimpleUtils.hpp index b447f13397..055c1d6d32 100644 --- a/common/src/KokkosKernels_SimpleUtils.hpp +++ b/common/src/KokkosKernels_SimpleUtils.hpp @@ -342,9 +342,9 @@ struct IsRelativelyIdenticalFunctor { if (val_diff > mag_type(eps)) { Kokkos::printf( "Values at index %d, %.6f + %.6fi and %.6f + %.6fi, differ too much " - "(eps = %e)\n", + "(eps = %e, rel err = %e)\n", (int)i, KAT::real(view1(i)), KAT::imag(view1(i)), KAT::real(view2(i)), - KAT::imag(view2(i)), eps); + KAT::imag(view2(i)), eps, val_diff); num_diffs++; } } diff --git a/common/src/KokkosKernels_Utils.hpp b/common/src/KokkosKernels_Utils.hpp index ba8049cecf..92419424b6 100644 --- a/common/src/KokkosKernels_Utils.hpp +++ b/common/src/KokkosKernels_Utils.hpp @@ -1527,13 +1527,17 @@ struct array_sum_reduce { } }; -template -KOKKOS_INLINE_FUNCTION T *alignPtr(InPtr p) { +template +KOKKOS_INLINE_FUNCTION T *alignPtrTo(InPtr *p) { // ugly but computationally free and the "right" way to do this in C++ - std::uintptr_t ptrVal = reinterpret_cast(p); + const std::uintptr_t ptrVal = reinterpret_cast(p); // ptrVal + (align - 1) lands inside the next valid aligned scalar_t, // and the mask produces the start of that scalar_t. - return reinterpret_cast((ptrVal + alignof(T) - 1) & (~(alignof(T) - 1))); + const std::uintptr_t ptrValNew = + (ptrVal + alignof(T) - 1) & (~(alignof(T) - 1)); + return reinterpret_cast( + reinterpret_cast(const_cast *>(p)) + + (ptrValNew - ptrVal)); } } // namespace Impl diff --git a/common/unit_test/Test_Common.hpp b/common/unit_test/Test_Common.hpp index 2ccf9c2103..fb93a494d6 100644 --- a/common/unit_test/Test_Common.hpp +++ b/common/unit_test/Test_Common.hpp @@ -16,6 +16,7 @@ #ifndef TEST_COMMON_HPP #define TEST_COMMON_HPP +#include #include // #include #include diff --git a/common/unit_test/Test_Common_AlignPtrTo.hpp b/common/unit_test/Test_Common_AlignPtrTo.hpp new file mode 100644 index 0000000000..760cddd5a2 --- /dev/null +++ b/common/unit_test/Test_Common_AlignPtrTo.hpp @@ -0,0 +1,166 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +/*! \file + +This test file was motivated by an observation in the SpGEMM on SYCL that +strange values were coming out of the pointer alignment functions, causing +Kokkos::atomic_add to be a no-op or write 0. The Kokkos Kernels alignPtrTo +function was updated with the one of four implementations that was observed to +work on SYCL (even though all four in here should be okay.) + +TEST_FN 0-3 are various implemetations, and TEST_FN 4 is testing Kokkos Kernels +implementation. The tests are written to PASS for the observed SYCL behavor - +i.e., that TEST_FN 1,4 produce aligned pointers, and the others do not (even +though they should). If the other functions start working on SYCL, then this +test will "fail", and the Kokkos Kernels implementation should be updated with +one of the now-working (and faster?) implementations. +*/ + +#ifndef TEST_COMMON_ALIGNPTRTO_HPP +#define TEST_COMMON_ALIGNPTRTO_HPP + +#include +#include +#include + +namespace { + +// the original Kokkos Kernels implementation +template +KOKKOS_INLINE_FUNCTION T *f0(InPtr p) { + std::uintptr_t ptrVal = reinterpret_cast(p); + return reinterpret_cast((ptrVal + alignof(T) - 1) & (~(alignof(T) - 1))); +} + +// an implementation that works for SYCL +template +KOKKOS_INLINE_FUNCTION T *f1(InPtr p) { + std::uintptr_t ptrVal = reinterpret_cast(p); + while (ptrVal % alignof(T)) { + ++ptrVal; + } + return reinterpret_cast(ptrVal); +} + +// another valid implementation +template +KOKKOS_INLINE_FUNCTION T *f2(InPtr p) { + std::uintptr_t ptrVal = reinterpret_cast(p); + return reinterpret_cast((ptrVal + alignof(T) - 1) / alignof(T) * + alignof(T)); +} + +// the way GCC does it (roughly) +template +KOKKOS_INLINE_FUNCTION T *f3(InPtr p) { + std::uintptr_t ptrVal = reinterpret_cast(p); + return reinterpret_cast((ptrVal - uint64_t(1) + alignof(T)) & + -alignof(T)); +} + +// Function to be executed by each team +template +struct TeamFunction { + TeamFunction() = default; + TeamFunction(const Results &results) : results_(results) {} + + template + KOKKOS_INLINE_FUNCTION void operator()(const Team &team) const { + // get an "aligned" pointer to scratch memory + char *shmem = (char *)(team.team_shmem().get_shmem(team.team_size() * + sizeof(double))); + double *vals; + if constexpr (0 == TEST_FN) { + vals = f0(shmem); + } else if constexpr (1 == TEST_FN) { + vals = f1(shmem); + } else if constexpr (2 == TEST_FN) { + vals = f2(shmem); + } else if constexpr (3 == TEST_FN) { + vals = f3(shmem); + } else if constexpr (4 == TEST_FN) { + vals = KokkosKernels::Impl::alignPtrTo(shmem); + } else { + static_assert(std::is_void_v, "Unexpected test function"); + } + + const size_t i = team.team_rank(); + double val = team.team_rank(); + vals[i] = 0; // zero shared memory + Kokkos::atomic_add(&vals[i], val); +#if 0 // debugging + Kokkos::printf("%s:%i result(%lu) += %f yielded %f\n", __FILE__, __LINE__, i, val, vals[i]); +#endif + + results_(i) = vals[i]; + } + + size_t team_shmem_size(int team_size) const { + return team_size * sizeof(double); + } + + Results results_; +}; + +// use atomic add to set result(i) = i +template +void test_alignPtrTo() { + using MemorySpace = typename Device::memory_space; + using ExecSpace = typename Device::execution_space; + using TestView = Kokkos::View; + using TestPolicy = Kokkos::TeamPolicy; + const int teamSize = TestPolicy(1, Kokkos::AUTO) + .team_size_max(TeamFunction(), + Kokkos::ParallelForTag{}); + + ExecSpace space; + + TestView results("TestView", teamSize); + TestPolicy policy(space, 1, teamSize); + Kokkos::parallel_for("test alignment", policy, + TeamFunction(results)); + + int errs; + Kokkos::parallel_reduce( + Kokkos::RangePolicy(space, 0, teamSize), + KOKKOS_LAMBDA(int i, int &lerr) { lerr += (results(i) != i); }, errs); + +// if SYCL is enabled, only TEST_FN 1 and 4 should work +#if defined(KOKKOS_ENABLE_SYCL) + if constexpr (std::is_same_v) { + if constexpr ((1 == TEST_FN) || (4 == TEST_FN)) { + EXPECT_EQ(0, errs); + } else { + EXPECT_NE(0, errs); + } + } else { + EXPECT_EQ(0, errs); + } +#else + EXPECT_EQ(0, errs); +#endif +} + +TEST_F(TestCategory, common_AlignPtrTo_0) { test_alignPtrTo<0, TestDevice>(); } +TEST_F(TestCategory, common_AlignPtrTo_1) { test_alignPtrTo<1, TestDevice>(); } +TEST_F(TestCategory, common_AlignPtrTo_2) { test_alignPtrTo<2, TestDevice>(); } +TEST_F(TestCategory, common_AlignPtrTo_3) { test_alignPtrTo<3, TestDevice>(); } +TEST_F(TestCategory, common_AlignPtrTo_kk) { test_alignPtrTo<4, TestDevice>(); } + +} // anonymous namespace + +#endif // TEST_COMMON_ALIGNPTRTO diff --git a/example/graph/PartitioningExample b/example/graph/PartitioningExample deleted file mode 100755 index 88619a8d12..0000000000 Binary files a/example/graph/PartitioningExample and /dev/null differ diff --git a/example/wiki/sparse/CMakeLists.txt b/example/wiki/sparse/CMakeLists.txt index 16d6a3a89d..8d061c24f8 100644 --- a/example/wiki/sparse/CMakeLists.txt +++ b/example/wiki/sparse/CMakeLists.txt @@ -10,6 +10,11 @@ if (KOKKOSKERNELS_ENABLE_EXPERIMENTAL) ) endif() +KOKKOSKERNELS_ADD_EXECUTABLE_AND_TEST( + wiki_bsrmatrix_2 + SOURCES KokkosSparse_wiki_bsrmatrix_2.cpp + ) + KOKKOSKERNELS_ADD_EXECUTABLE_AND_TEST( wiki_crsmatrix SOURCES KokkosSparse_wiki_crsmatrix.cpp diff --git a/example/wiki/sparse/KokkosSparse_wiki_bsrmatrix_2.cpp b/example/wiki/sparse/KokkosSparse_wiki_bsrmatrix_2.cpp new file mode 100644 index 0000000000..7ff56ff14a --- /dev/null +++ b/example/wiki/sparse/KokkosSparse_wiki_bsrmatrix_2.cpp @@ -0,0 +1,247 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include +#include +#include + +#include "Kokkos_Core.hpp" + +#include "KokkosKernels_default_types.hpp" +#include "KokkosSparse_BsrMatrix.hpp" + +using Scalar = default_scalar; +using Ordinal = default_lno_t; +using Offset = default_size_type; +using Layout = default_layout; + +template +struct bsr_fill { + bsrmatrix_type bsr_mat; + + bsr_fill(bsrmatrix_type bsr_mat_) : bsr_mat(bsr_mat_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const int& rowIdx) const { + if (rowIdx == 0) { // Left boundary condition + auto block_tmp = bsr_mat.unmanaged_block(0); + block_tmp(0, 0) = 1.0; + block_tmp(0, 1) = 0.0; + block_tmp(1, 0) = 0.0; + block_tmp(1, 1) = 1.0; + } else if (rowIdx == bsr_mat.numRows() - 1) { // Right boundary condition + auto block_tmp = + bsr_mat.unmanaged_block(bsr_mat.graph.row_map(rowIdx) + 1); + block_tmp(0, 0) = 1.0; + block_tmp(1, 1) = 1.0; + } else { + auto block_tmp = bsr_mat.unmanaged_block(bsr_mat.graph.row_map(rowIdx)); + block_tmp(0, 0) = -1.0; + block_tmp(0, 1) = -1.0 / 2.0; + block_tmp(1, 0) = 0.0; + block_tmp(1, 1) = -1.0; + + block_tmp = bsr_mat.unmanaged_block(bsr_mat.graph.row_map(rowIdx) + 1); + block_tmp(0, 0) = 2.0; + block_tmp(0, 1) = 0.0; + block_tmp(1, 0) = 0.0; + block_tmp(1, 1) = 2.0; + + block_tmp = bsr_mat.unmanaged_block(bsr_mat.graph.row_map(rowIdx) + 2); + block_tmp(0, 0) = -1.0; + block_tmp(0, 1) = 1.0 / 2.0; + block_tmp(1, 0) = 0.0; + block_tmp(1, 1) = -1.0; + } + } +}; + +template +struct diagonal_extractor { + using graph_type = typename bsrmatrix_type::staticcrsgraph_type; + using row_map_type = typename graph_type::row_map_type; + using entries_type = typename graph_type::entries_type; + using bsr_block_type = typename bsrmatrix_type::block_type; + + bsrmatrix_type bsr_mat; + row_map_type row_map; + entries_type entries; + diag_blocks_type diag_blocks; + + diagonal_extractor(bsrmatrix_type bsr_mat_, diag_blocks_type diag_blocks_) + : bsr_mat(bsr_mat_), + row_map(bsr_mat_.graph.row_map), + entries(bsr_mat_.graph.entries), + diag_blocks(diag_blocks_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const int& rowIdx) const { + for (Offset entryIdx = row_map(rowIdx); entryIdx < row_map(rowIdx + 1); + ++entryIdx) { + if (entries(entryIdx) == rowIdx) { + bsr_block_type bsr_diag_block = bsr_mat.unmanaged_block(entryIdx); + for (int i = 0; i < bsr_mat.blockDim(); ++i) { + for (int j = 0; j < bsr_mat.blockDim(); ++j) { + diag_blocks(rowIdx, i, j) = bsr_diag_block(i, j); + } + } + } + } + } +}; + +int main(int argc, char* argv[]) { + using device_type = typename Kokkos::Device< + Kokkos::DefaultExecutionSpace, + typename Kokkos::DefaultExecutionSpace::memory_space>; + using bsrmatrix_type = + typename KokkosSparse::Experimental::BsrMatrix; + using graph_type = typename bsrmatrix_type::staticcrsgraph_type; + using row_map_type = typename graph_type::row_map_type; + using entries_type = typename graph_type::entries_type; + + Kokkos::initialize(argc, argv); + { + // + // We will create a 1D discretization for the coupled thermo-elastic + // diffusion + // + // -\div(EA \grad_s(u) - \alpha(T-T0)I) = f_u + // -\kappa\Delta(T) = f_T + // + // The problem is discretized using finite differences as follows: + // \frac{d^2 u}{dx^2}\approx \frac{u_{i+1}-2u_i+u_{i-1}}{h_x^2} + // \frac{dT}{dx}\approx\frac{T_{i+1}-T_{i-1}}{2h_x} + // \frac{d^2T}{dx^2}\approx\frac{T_{i+1}-2T_i+T_{i-1}}{h_x^2} + // + // This leads to the combined stencil (assuming all unit coefficients): + // + // [-1 1/2] [2 0] [-1 -1/2] + // [ 0 -1] [0 2] [ 0 -1] + // + // First the graph for the mesh will be constructed. + // Second a BsrMatrix will be constructed from the graph + // Third the values of the BsrMatrix will be filled. + + constexpr Ordinal blockSize = 2; + constexpr Ordinal numRows = 10; + constexpr Offset numNNZ = 3 * numRows - 2; + bsrmatrix_type bsr_mat; + + { + typename row_map_type::non_const_type row_map( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "row pointers"), + numRows + 1); + typename entries_type::non_const_type entries( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "column indices"), + numNNZ); + typename row_map_type::HostMirror row_map_h = + Kokkos::create_mirror_view(row_map); + typename entries_type::HostMirror entries_h = + Kokkos::create_mirror_view(entries); + + // First Step: build the CrsGraph + { + // Build the row pointers and store numNNZ + + row_map_h(0) = 0; + for (Ordinal rowIdx = 0; rowIdx < numRows; ++rowIdx) { + if (rowIdx == 0) { + row_map_h(rowIdx + 1) = row_map_h(rowIdx) + 2; + + entries_h(row_map_h(rowIdx)) = rowIdx; + entries_h(row_map_h(rowIdx) + 1) = rowIdx + 1; + } else if (rowIdx == numRows - 1) { + row_map_h(rowIdx + 1) = row_map_h(rowIdx) + 2; + + entries_h(row_map_h(rowIdx)) = rowIdx - 1; + entries_h(row_map_h(rowIdx) + 1) = rowIdx; + } else { + row_map_h(rowIdx + 1) = row_map_h(rowIdx) + 3; + + entries_h(row_map_h(rowIdx)) = rowIdx - 1; + entries_h(row_map_h(rowIdx) + 1) = rowIdx; + entries_h(row_map_h(rowIdx) + 2) = rowIdx + 1; + } + } + + if (row_map_h(numRows) != numNNZ) { + std::ostringstream error_msg; + error_msg << "error: row_map(numRows) != numNNZ, row_map_h(numRows)=" + << row_map_h(numRows) << ", numNNZ=" << numNNZ; + throw std::runtime_error(error_msg.str()); + } + Kokkos::deep_copy(row_map, row_map_h); + Kokkos::deep_copy(entries, entries_h); + } + + graph_type myGraph(entries, row_map); + + // Second Step: build the BsrMatrix from graph and block size + bsr_mat = bsrmatrix_type("block matrix", myGraph, blockSize); + + bsr_fill fillFunctor(bsr_mat); + Kokkos::parallel_for(Kokkos::RangePolicy(0, numRows), fillFunctor); + + std::cout << "BsrMatrix graph: " << std::endl; + for (int rowIdx = 0; rowIdx < numRows; ++rowIdx) { + std::cout << " ["; + for (int colIdx = 0; colIdx < entries_h(row_map_h(rowIdx)); ++colIdx) { + std::cout << " "; + } + std::cout << "*"; + for (Offset entryIdx = row_map_h(rowIdx); + entryIdx < row_map_h(rowIdx + 1) - 1; ++entryIdx) { + for (int colIdx = entries_h(entryIdx) + 1; + colIdx < entries_h(entryIdx + 1); ++colIdx) { + std::cout << " "; + } + std::cout << "*"; + } + for (int colIdx = entries_h(row_map_h(rowIdx + 1) - 1) + 1; + colIdx < numRows; ++colIdx) { + std::cout << " "; + } + std::cout << "]" << std::endl; + } + } + + // Extract diagonal block and store them in a rank-3 view + using diag_blocks_type = + Kokkos::View; + diag_blocks_type diag_blocks("diagonal blocks", numRows, blockSize, + blockSize); + diagonal_extractor myFunc(bsr_mat, diag_blocks); + Kokkos::parallel_for(Kokkos::RangePolicy(0, numRows), myFunc); + + auto diag_blocks_h = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace{}, diag_blocks); + + std::cout << "\nBsrMatrix diagonal blocks: " << std::endl; + for (int blockId = 0; blockId < diag_blocks_h.extent_int(0); ++blockId) { + std::cout << " [" << diag_blocks_h(blockId, 0, 0) << ", " + << diag_blocks_h(blockId, 0, 1) << "]" << std::endl; + std::cout << " [" << diag_blocks_h(blockId, 1, 0) << ", " + << diag_blocks_h(blockId, 1, 1) << "]\n" + << std::endl; + } + } + Kokkos::finalize(); + + return 0; +} diff --git a/graph/impl/KokkosGraph_BFS_impl.hpp b/graph/impl/KokkosGraph_BFS_impl.hpp index e73c1cb489..9ea5d63e07 100644 --- a/graph/impl/KokkosGraph_BFS_impl.hpp +++ b/graph/impl/KokkosGraph_BFS_impl.hpp @@ -38,7 +38,7 @@ struct SerialRCM { host_lno_view_t entries; SerialRCM(const rowmap_t& rowmap_, const entries_t& entries_) - : numVerts(rowmap_.extent(0) - 1), + : numVerts(std::max(rowmap_.extent_int(0), 1) - 1), rowmap(Kokkos::view_alloc(Kokkos::WithoutInitializing, "HostRowmap"), rowmap_.extent(0)), entries(Kokkos::view_alloc(Kokkos::WithoutInitializing, "HostEntries"), @@ -47,35 +47,39 @@ struct SerialRCM { Kokkos::deep_copy(entries, entries_); } - lno_t findPseudoPeripheral() { - // Choose vertex with smallest degree - lno_t periph = -1; - lno_t periphDeg = numVerts; - for (lno_t i = 0; i < numVerts; i++) { - lno_t deg = rowmap(i + 1) - rowmap(i); - if (deg < periphDeg) { - periph = i; - periphDeg = deg; - if (deg == 0) break; - } - } - return periph; - } - lno_view_t rcm() { - lno_t start = findPseudoPeripheral(); + // Given a label L, labelReverse - L gives the reversed label (as in reverse + // Cuthill McKee) + lno_t labelReverse = numVerts - 1; host_lno_view_t q(Kokkos::view_alloc(Kokkos::WithoutInitializing, "Queue"), numVerts); host_lno_view_t label( Kokkos::view_alloc(Kokkos::WithoutInitializing, "Permutation"), numVerts); for (lno_t i = 0; i < numVerts; i++) label(i) = -1; - lno_t qhead = 0; - lno_t qtail = 0; - label(start) = qtail; + lno_t qhead = 0; + lno_t qtail = 0; + // List of all vertices, in order from lowest to highest degree + // (heuristic for best to worst starting vertex for RCM). + // If the graph has multiple connected components, restart at the first + // unlabeled vertex in this list. + host_lno_view_t allVertices( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "allVertices"), + numVerts); + for (lno_t i = 0; i < numVerts; i++) allVertices(i) = i; + std::sort(allVertices.data(), allVertices.data() + numVerts, + [&](lno_t n1, lno_t n2) -> bool { + // return true if n1 has a lower degree than n2 + return (rowmap(n1 + 1) - rowmap(n1)) < + (rowmap(n2 + 1) - rowmap(n2)); + }); + lno_t allVerticesIter = 0; + // Start RCM with the first vertex in allVertices + lno_t start = allVertices(allVerticesIter++); + label(start) = labelReverse - qtail; q(qtail++) = start; + // Reuse this neighbor list for all levels without deallocating std::vector neighbors; - lno_t outerQueue = 0; while (true) { lno_t v = q(qhead++); neighbors.clear(); @@ -94,7 +98,7 @@ struct SerialRCM { }); // label and enqueue all unlabeled neighbors for (lno_t nei : neighbors) { - label(nei) = qtail; + label(nei) = labelReverse - qtail; q(qtail++) = nei; } if (qtail == numVerts) { @@ -102,16 +106,15 @@ struct SerialRCM { break; } else if (qhead == qtail) { // have exhausted this connected component, but others remain unlabeled - while (label(outerQueue) != -1) outerQueue++; - label(outerQueue) = qtail; - q(qtail++) = outerQueue; + while (label(allVertices(allVerticesIter)) != -1) allVerticesIter++; + lno_t restart = allVertices(allVerticesIter); + label(restart) = labelReverse - qtail; + q(qtail++) = restart; } } lno_view_t labelOut( Kokkos::view_alloc(Kokkos::WithoutInitializing, "RCM Permutation"), numVerts); - // reverse the labels - for (lno_t i = 0; i < numVerts; i++) label(i) = numVerts - label(i) - 1; Kokkos::deep_copy(labelOut, label); return labelOut; } diff --git a/graph/unit_test/Test_Graph_rcm.hpp b/graph/unit_test/Test_Graph_rcm.hpp index 2e05554d2d..a6d165d8c3 100644 --- a/graph/unit_test/Test_Graph_rcm.hpp +++ b/graph/unit_test/Test_Graph_rcm.hpp @@ -19,7 +19,7 @@ #include "KokkosGraph_RCM.hpp" #include "KokkosKernels_IOUtils.hpp" -#include "KokkosSparse_CrsMatrix.hpp" +#include "Kokkos_StaticCrsGraph.hpp" #include @@ -81,7 +81,7 @@ int maxBandwidth(const rowmap_t& rowmap, const entries_t& entries, const labels_t& invPerm, const labels_t& perm) { using size_type = typename rowmap_t::non_const_value_type; using lno_t = typename entries_t::non_const_value_type; - lno_t numVerts = rowmap.extent(0) - 1; + lno_t numVerts = std::max(1, rowmap.extent_int(0)) - 1; int bw = 0; for (lno_t i = 0; i < numVerts; i++) { lno_t origRow = perm(i); @@ -97,18 +97,10 @@ int maxBandwidth(const rowmap_t& rowmap, const entries_t& entries, return bw; } -template -void test_rcm(lno_t gridX, lno_t gridY, lno_t gridZ) { - typedef - typename KokkosSparse::CrsMatrix - crsMat_t; - typedef typename crsMat_t::StaticCrsGraphType graph_t; - typedef typename graph_t::row_map_type rowmap_t; - typedef typename graph_t::entries_type entries_t; - lno_t numVerts = gridX * gridY * gridZ; - typename rowmap_t::non_const_type rowmap; - typename entries_t::non_const_type entries; - generate7pt(rowmap, entries, gridX, gridY, gridZ); +template +void test_rcm(const rowmap_t& rowmap, const entries_t& entries, + bool expectBandwidthReduced) { + using lno_t = typename entries_t::non_const_value_type; auto rcm = KokkosGraph::Experimental::graph_rcm( rowmap, entries); auto rowmapHost = @@ -116,6 +108,7 @@ void test_rcm(lno_t gridX, lno_t gridY, lno_t gridZ) { auto entriesHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), entries); auto rcmHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), rcm); + lno_t numVerts = std::max(rowmap.extent_int(0), 1) - 1; decltype(rcmHost) rcmPermHost( Kokkos::view_alloc(Kokkos::WithoutInitializing, "RCMPerm"), numVerts); for (lno_t i = 0; i < numVerts; i++) rcmPermHost(rcmHost(i)) = i; @@ -130,21 +123,119 @@ void test_rcm(lno_t gridX, lno_t gridY, lno_t gridZ) { } for (lno_t i = 0; i < numVerts; i++) ASSERT_EQ(counts[i], 1); } - Kokkos::View identityOrder( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "Identity"), numVerts); - for (lno_t i = 0; i < numVerts; i++) identityOrder(i) = i; - size_t origBW = - maxBandwidth(rowmapHost, entriesHost, identityOrder, identityOrder); - size_t rcmBW = maxBandwidth(rowmapHost, entriesHost, rcmHost, rcmPermHost); - EXPECT_LE(rcmBW, origBW); + if (expectBandwidthReduced) { + Kokkos::View identityOrder( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "Identity"), numVerts); + for (lno_t i = 0; i < numVerts; i++) identityOrder(i) = i; + size_t origBW = + maxBandwidth(rowmapHost, entriesHost, identityOrder, identityOrder); + size_t rcmBW = maxBandwidth(rowmapHost, entriesHost, rcmHost, rcmPermHost); + EXPECT_LE(rcmBW, origBW); + } +} + +template +void test_rcm_zerorows() { + using graph_t = + Kokkos::StaticCrsGraph; + using rowmap_t = typename graph_t::row_map_type::non_const_type; + using entries_t = typename graph_t::entries_type::non_const_type; + rowmap_t rowmap; + entries_t entries; + test_rcm(rowmap, entries, false); +} + +template +void test_rcm_7pt(lno_t gridX, lno_t gridY, lno_t gridZ, + bool expectBandwidthReduced) { + using graph_t = + Kokkos::StaticCrsGraph; + using rowmap_t = typename graph_t::row_map_type::non_const_type; + using entries_t = typename graph_t::entries_type::non_const_type; + rowmap_t rowmap; + entries_t entries; + generate7pt(rowmap, entries, gridX, gridY, gridZ); + test_rcm(rowmap, entries, expectBandwidthReduced); +} + +template +void test_rcm_4clique() { + using graph_t = + Kokkos::StaticCrsGraph; + using rowmap_t = typename graph_t::row_map_type::non_const_type; + using entries_t = typename graph_t::entries_type::non_const_type; + rowmap_t rowmap("rowmap", 5); + entries_t entries("entries", 16); + auto rowmap_host = Kokkos::create_mirror_view(rowmap); + auto entries_host = Kokkos::create_mirror_view(entries); + for (lno_t i = 0; i < 5; i++) rowmap_host(i) = i * 4; + for (lno_t i = 0; i < 16; i++) entries_host(i) = i % 4; + Kokkos::deep_copy(rowmap, rowmap_host); + Kokkos::deep_copy(entries, entries_host); + test_rcm(rowmap, entries, false); +} + +template +void test_rcm_multiple_components() { + using graph_t = + Kokkos::StaticCrsGraph; + using rowmap_t = typename graph_t::row_map_type::non_const_type; + using entries_t = typename graph_t::entries_type::non_const_type; + // Generate a single 3D grid first + rowmap_t rowmap_cube; + entries_t entries_cube; + generate7pt(rowmap_cube, entries_cube, 7, 7, 7); + auto rowmap_cube_host = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), rowmap_cube); + auto entries_cube_host = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), entries_cube); + lno_t nv_cube = 7 * 7 * 7; + lno_t ne_cube = entries_cube.extent(0); + // Now replicate the graph twice, so there are 2 disconnected copies of the + // cube + rowmap_t rowmap("rowmap", nv_cube * 2 + 1); + entries_t entries("entries", ne_cube * 2); + auto rowmap_host = Kokkos::create_mirror_view(rowmap); + auto entries_host = Kokkos::create_mirror_view(entries); + for (lno_t i = 0; i <= nv_cube * 2; i++) { + if (i < nv_cube) + rowmap_host(i) = rowmap_cube_host(i); + else + rowmap_host(i) = ne_cube + rowmap_cube_host(i - nv_cube); + } + for (lno_t i = 0; i < ne_cube * 2; i++) { + if (i < ne_cube) + entries_host(i) = entries_cube_host(i); + else + entries_host(i) = nv_cube + entries_cube_host(i - ne_cube); + } + Kokkos::deep_copy(rowmap, rowmap_host); + Kokkos::deep_copy(entries, entries_host); + test_rcm(rowmap, entries, true); } -#define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ - TEST_F(TestCategory, \ - graph##_##rcm##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ - test_rcm(6, 3, 3); \ - test_rcm(20, 20, 20); \ - test_rcm(100, 100, 1); \ +#define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ + TEST_F( \ + TestCategory, \ + graph##_##rcm_zerorows##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ + test_rcm_zerorows(); \ + } \ + TEST_F(TestCategory, \ + graph##_##rcm_7pt##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ + test_rcm_7pt(1, 1, 1, false); \ + test_rcm_7pt(2, 1, 1, false); \ + test_rcm_7pt(6, 3, 3, true); \ + test_rcm_7pt(20, 20, 20, true); \ + test_rcm_7pt(100, 100, 1, true); \ + } \ + TEST_F(TestCategory, \ + graph##_##rcm_4clique##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ + test_rcm_4clique(); \ + } \ + TEST_F( \ + TestCategory, \ + graph##_##rcm_multiple_components##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ + test_rcm_multiple_components(); \ } #if (defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ diff --git a/perf_test/sparse/KokkosSparse_spiluk.cpp b/perf_test/sparse/KokkosSparse_spiluk.cpp index c85b126019..95dcc78ab1 100644 --- a/perf_test/sparse/KokkosSparse_spiluk.cpp +++ b/perf_test/sparse/KokkosSparse_spiluk.cpp @@ -24,7 +24,13 @@ #include #include // std::setprecision -#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE +// cuSPARSE ILU and IC factorizations were removed +// completely in cuSPARSE 12.5 +#if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) && (CUSPARSE_VERSION < 12500) +#define USE_CUSPARSE_ILU +#endif + +#ifdef USE_CUSPARSE_ILU #include #endif @@ -39,8 +45,6 @@ #include #include -#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) && \ - (!defined(KOKKOS_ENABLE_CUDA) || (8000 <= CUDA_VERSION)) using namespace KokkosSparse; using namespace KokkosSparse::Experimental; using namespace KokkosKernels; @@ -52,8 +56,8 @@ int test_spiluk_perf(std::vector tests, std::string afilename, int kin, int team_size, int /*vector_length*/, /*int idx_offset,*/ int loop) { typedef default_scalar scalar_t; - typedef default_lno_t lno_t; - typedef default_size_type size_type; + typedef int lno_t; + typedef int size_type; typedef Kokkos::DefaultExecutionSpace execution_space; typedef typename execution_space::memory_space memory_space; @@ -82,6 +86,11 @@ int test_spiluk_perf(std::vector tests, std::string afilename, int kin, std::cout << "\n\n" << std::endl; if (!afilename.empty()) { +#if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) && !defined(USE_CUSPARSE_ILU) + std::cout << "** Note: cuSPARSE is enabled, but the cusparseXcsrilu*\n"; + std::cout << " functions were removed in cuSPARSE 12.5.\n"; + std::cout << " Only KokkosKernels spiluk will be run.\n\n"; +#endif std::cout << "ILU(K) Begin: Read matrix filename " << afilename << std::endl; crsmat_t A = KokkosSparse::Impl::read_kokkos_crst_matrix( @@ -91,11 +100,7 @@ int test_spiluk_perf(std::vector tests, std::string afilename, int kin, const int nnz = A.nnz(); const typename KernelHandle::const_nnz_lno_t fill_lev = lno_t(kin); -#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE - // cuSPARSE requires lno_t = size_type = int. For both, int is always used - // (if enabled) -#if defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT) +#ifdef USE_CUSPARSE_ILU // std::cout << " cusparse: create handle" << std::endl; cusparseStatus_t status; cusparseHandle_t handle = 0; @@ -131,10 +136,6 @@ int test_spiluk_perf(std::vector tests, std::string afilename, int kin, info, &pBufferSize); // pBuffer returned by cudaMalloc is automatically aligned to 128 bytes. cudaMalloc((void **)&pBuffer, pBufferSize); -#else - std::cout << "Note: the cuSPARSE TPL is enabled, but either offset=int or " - "ordinal=int is disabled, so it can't be used.\n"; -#endif #endif for (auto test : tests) { @@ -223,11 +224,7 @@ int test_spiluk_perf(std::vector tests, std::string afilename, int kin, std::cout << "nrm2(A*e-L*U*e) = " << std::setprecision(15) << bb_nrm << std::endl; -#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE - // cuSPARSE requires lno_t = size_type = int. For both, int is always used - // (if enabled) -#if defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT) +#ifdef USE_CUSPARSE_ILU if (fill_lev == 0) { std::cout << "CUSPARSE: No KK interface added yet" << std::endl; @@ -383,7 +380,6 @@ int test_spiluk_perf(std::vector tests, std::string afilename, int kin, } // end row std::cout << "ILU(0) SUCCESS!" << std::endl; } // fill_lev=0 -#endif #endif // Benchmark @@ -407,11 +403,7 @@ int test_spiluk_perf(std::vector tests, std::string afilename, int kin, std::cout << "LOOP_MAX_TIME: " << max_time << std::endl; std::cout << "LOOP_MIN_TIME: " << min_time << std::endl; -#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE - // cuSPARSE requires lno_t = size_type = int. For both, int is always used - // (if enabled) -#if defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT) +#ifdef USE_CUSPARSE_ILU if (fill_lev == 0) { lno_view_t A_row_map("A_row_map", nrows + 1); lno_nnz_view_t A_entries("A_entries", nnz); @@ -441,21 +433,15 @@ int test_spiluk_perf(std::vector tests, std::string afilename, int kin, std::cout << "LOOP_MAX_TIME (cuSPARSE): " << max_time << std::endl; std::cout << "LOOP_MIN_TIME (cuSPARSE): " << min_time << std::endl; } // fill_lev=0 -#endif #endif } // end tests -#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE - // cuSPARSE requires lno_t = size_type = int. For both, int is always used - // (if enabled) -#if defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT) +#ifdef USE_CUSPARSE_ILU // step 6: free resources cudaFree(pBuffer); cusparseDestroyCsrilu02Info(info); cusparseDestroyMatDescr(descr); cusparseDestroy(handle); -#endif #endif } // end if (!afilename.empty()) @@ -583,9 +569,3 @@ int main(int argc, char **argv) { Kokkos::finalize(); return 0; } -#else -int main() { - std::cout << "The SPILUK perf_test requires CUDA >= 8.0\n"; - return 0; -} -#endif diff --git a/perf_test/sparse/KokkosSparse_sptrsv_aux.hpp b/perf_test/sparse/KokkosSparse_sptrsv_aux.hpp index 65120a8827..6b9c244da3 100644 --- a/perf_test/sparse/KokkosSparse_sptrsv_aux.hpp +++ b/perf_test/sparse/KokkosSparse_sptrsv_aux.hpp @@ -228,25 +228,37 @@ std::string getCuSparseErrorString(cusparseStatus_t status) { /* ========================================================================================= */ #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) +#if CUSPARSE_VERSION >= 12500 +template +bool check_cusparse(host_crsmat_t &, bool, crsmat_t &, bool, crsmat_t &, int *, + int *, double, int) { + // TODO: call KokkosSparse::sptrsv (if hardcoded problem settings below are + // compatible), or add wrappers for modern interface (cusparseSpSV*) + throw std::logic_error("Legacy cuSPARSE csrsv interface not available."); + return false; +} + +#else + template bool check_cusparse(host_crsmat_t &Mtx, bool col_majorL, crsmat_t &L, bool col_majorU, crsmat_t &U, int *perm_r, int *perm_c, double tol, int loop) { using values_view_t = typename crsmat_t::values_type::non_const_type; - using scalar_t = typename values_view_t::value_type; - using size_type = typename crsmat_t::size_type; + using scalar_t = typename values_view_t::value_type; + using size_type = typename crsmat_t::size_type; using host_values_view_t = typename host_crsmat_t::values_type::non_const_type; using execution_space = typename values_view_t::execution_space; - using memory_space = typename execution_space::memory_space; + using memory_space = typename execution_space::memory_space; using host_execution_space = typename host_values_view_t::execution_space; - using host_memory_space = typename host_execution_space::memory_space; + using host_memory_space = typename host_execution_space::memory_space; using host_scalar_view_t = Kokkos::View; - using scalar_view_t = Kokkos::View; + using scalar_view_t = Kokkos::View; const scalar_t ZERO(0.0); const scalar_t ONE(1.0); @@ -258,7 +270,7 @@ bool check_cusparse(host_crsmat_t &Mtx, bool col_majorL, crsmat_t &L, // > create a handle cusparseStatus_t status; cusparseHandle_t handle = 0; - status = cusparseCreate(&handle); + status = cusparseCreate(&handle); if (CUSPARSE_STATUS_SUCCESS != status) { std::cout << " ** cusparseCreate failed with " << getCuSparseErrorString(status) << " ** " << std::endl; @@ -269,7 +281,7 @@ bool check_cusparse(host_crsmat_t &Mtx, bool col_majorL, crsmat_t &L, // > create a empty info structure for L-solve (e.g., analysis results) csrsv2Info_t infoL = 0; - status = cusparseCreateCsrsv2Info(&infoL); + status = cusparseCreateCsrsv2Info(&infoL); if (CUSPARSE_STATUS_SUCCESS != status) { std::cout << " ** cusparseCreateCsrsv2Info failed with " << getCuSparseErrorString(status) << " ** " << std::endl; @@ -279,14 +291,14 @@ bool check_cusparse(host_crsmat_t &Mtx, bool col_majorL, crsmat_t &L, // Preparing for L-solve // step 1: create a descriptor size_type nnzL = L.nnz(); - auto graphL = L.graph; // in_graph - auto row_mapL = graphL.row_map; - auto entriesL = graphL.entries; - auto valuesL = L.values; + auto graphL = L.graph; // in_graph + auto row_mapL = graphL.row_map; + auto entriesL = graphL.entries; + auto valuesL = L.values; // NOTE: it is stored in CSC = UPPER + TRANSPOSE cusparseMatDescr_t descrL = 0; - status = cusparseCreateMatDescr(&descrL); + status = cusparseCreateMatDescr(&descrL); if (CUSPARSE_STATUS_SUCCESS != status) { std::cout << " ** cusparseCreateMatDescr failed with " << getCuSparseErrorString(status) << " ** " << std::endl; @@ -300,7 +312,7 @@ bool check_cusparse(host_crsmat_t &Mtx, bool col_majorL, crsmat_t &L, // step 2: query how much memory used in csrsv2, and allocate the buffer // pBuffer returned by cudaMalloc is automatically aligned to 128 bytes. int pBufferSize; - void *pBufferL = 0; + void *pBufferL = 0; cusparseOperation_t transL = (col_majorL ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE); if (std::is_same::value) { @@ -374,14 +386,14 @@ bool check_cusparse(host_crsmat_t &Mtx, bool col_majorL, crsmat_t &L, timer.reset(); if (std::is_same::value) { const double alpha = 1.0; - status = cusparseDcsrsv2_solve( + status = cusparseDcsrsv2_solve( handle, transL, nrows, nnzL, &alpha, descrL, reinterpret_cast(valuesL.data()), row_mapL.data(), entriesL.data(), infoL, reinterpret_cast(rhs.data()), reinterpret_cast(sol.data()), policy, pBufferL); } else { const cuDoubleComplex alpha = make_cuDoubleComplex(1.0, 0.0); - status = cusparseZcsrsv2_solve( + status = cusparseZcsrsv2_solve( handle, transL, nrows, nnzL, &alpha, descrL, reinterpret_cast(valuesL.data()), row_mapL.data(), entriesL.data(), infoL, reinterpret_cast(rhs.data()), @@ -404,14 +416,14 @@ bool check_cusparse(host_crsmat_t &Mtx, bool col_majorL, crsmat_t &L, // ============================================== // Preparing for U-solve size_type nnzU = U.nnz(); - auto graphU = U.graph; // in_graph - auto row_mapU = graphU.row_map; - auto entriesU = graphU.entries; - auto valuesU = U.values; + auto graphU = U.graph; // in_graph + auto row_mapU = graphU.row_map; + auto entriesU = graphU.entries; + auto valuesU = U.values; // > create a empty info structure for U-solve (e.g., analysis results) csrsv2Info_t infoU = 0; - status = cusparseCreateCsrsv2Info(&infoU); + status = cusparseCreateCsrsv2Info(&infoU); if (CUSPARSE_STATUS_SUCCESS != status) { std::cout << " ** cusparseCreateCsrsv2Info failed with " << getCuSparseErrorString(status) << " ** " << std::endl; @@ -420,7 +432,7 @@ bool check_cusparse(host_crsmat_t &Mtx, bool col_majorL, crsmat_t &L, // ============================================== // step 1: create a descriptor cusparseMatDescr_t descrU = 0; - status = cusparseCreateMatDescr(&descrU); + status = cusparseCreateMatDescr(&descrU); if (CUSPARSE_STATUS_SUCCESS != status) { std::cout << " ** cusparseCreateMatDescr create status error name " << getCuSparseErrorString(status) << " ** " << std::endl; @@ -438,7 +450,7 @@ bool check_cusparse(host_crsmat_t &Mtx, bool col_majorL, crsmat_t &L, // ============================================== // step 2: query how much memory used in csrsv2, and allocate the buffer // pBuffer returned by cudaMalloc is automatically aligned to 128 bytes. - void *pBufferU = 0; + void *pBufferU = 0; cusparseOperation_t transU = (col_majorU ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE); if (std::is_same::value) { @@ -485,14 +497,14 @@ bool check_cusparse(host_crsmat_t &Mtx, bool col_majorL, crsmat_t &L, timer.reset(); if (std::is_same::value) { const double alpha = 1.0; - status = cusparseDcsrsv2_solve( + status = cusparseDcsrsv2_solve( handle, transU, nrows, nnzU, &alpha, descrU, reinterpret_cast(valuesU.data()), row_mapU.data(), entriesU.data(), infoU, reinterpret_cast(sol.data()), reinterpret_cast(rhs.data()), policy, pBufferU); } else { const cuDoubleComplex alpha = make_cuDoubleComplex(1.0, 0.0); - status = cusparseZcsrsv2_solve( + status = cusparseZcsrsv2_solve( handle, transU, nrows, nnzU, &alpha, descrU, reinterpret_cast(valuesU.data()), row_mapU.data(), entriesU.data(), infoU, reinterpret_cast(sol.data()), @@ -652,6 +664,8 @@ bool check_cusparse(host_crsmat_t &Mtx, bool col_majorL, crsmat_t &L, } return success; } +#endif + #else template bool check_cusparse(host_crsmat_t & /*Mtx*/, bool /*col_majorL*/, diff --git a/sparse/impl/KokkosSparse_bspgemm_impl_kkmem.hpp b/sparse/impl/KokkosSparse_bspgemm_impl_kkmem.hpp index 6eb9044733..a36200b295 100644 --- a/sparse/impl/KokkosSparse_bspgemm_impl_kkmem.hpp +++ b/sparse/impl/KokkosSparse_bspgemm_impl_kkmem.hpp @@ -270,8 +270,7 @@ struct KokkosBSPGEMM(tmp); + scalar_t *hash_values = KokkosKernels::Impl::alignPtrTo(tmp); BlockAccumulator hm(block_dim, pow2_hash_size, pow2_hash_func, nullptr, nullptr, hash_ids, hash_values); @@ -414,7 +413,7 @@ struct KokkosBSPGEMM(all_shared_memory); + KokkosKernels::Impl::alignPtrTo(all_shared_memory); BlockAccumulator hm(block_dim, thread_shmem_key_size, thread_shared_memory_hash_func, begins, nexts, keys, @@ -554,7 +553,7 @@ struct KokkosBSPGEMM(all_shared_memory); + KokkosKernels::Impl::alignPtrTo(all_shared_memory); int thread_rank = teamMember.team_rank(); @@ -601,8 +600,7 @@ struct KokkosBSPGEMM( - tmp + pow2_hash_size); + KokkosKernels::Impl::alignPtrTo(tmp + pow2_hash_size); } // initialize begins. { @@ -885,7 +883,7 @@ struct KokkosBSPGEMM(all_shared_memory); + KokkosKernels::Impl::alignPtrTo(all_shared_memory); int thread_rank = teamMember.team_rank(); diff --git a/sparse/impl/KokkosSparse_bspgemm_impl_speed.hpp b/sparse/impl/KokkosSparse_bspgemm_impl_speed.hpp index bc1b378558..22111d3752 100644 --- a/sparse/impl/KokkosSparse_bspgemm_impl_speed.hpp +++ b/sparse/impl/KokkosSparse_bspgemm_impl_speed.hpp @@ -325,7 +325,7 @@ struct KokkosBSPGEMM(all_shared_memory); + KokkosKernels::Impl::alignPtrTo(all_shared_memory); KokkosKernels::Experimental::BlockHashmapAccumulator< nnz_lno_t, nnz_lno_t, scalar_t, diff --git a/sparse/impl/KokkosSparse_crs_to_bsr_impl.hpp b/sparse/impl/KokkosSparse_crs_to_bsr_impl.hpp index 7f1ff2171e..f773bdc0d8 100644 --- a/sparse/impl/KokkosSparse_crs_to_bsr_impl.hpp +++ b/sparse/impl/KokkosSparse_crs_to_bsr_impl.hpp @@ -99,6 +99,7 @@ template Bsr blocked_crs_to_bsr(const Crs &crs, size_t blockSize) { using bsr_value_type = typename Bsr::value_type; using bsr_ordinal_type = typename Bsr::ordinal_type; + using crs_size_type = typename Crs::non_const_size_type; // copy matrix data to host auto hRowMap = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), @@ -119,7 +120,7 @@ Bsr blocked_crs_to_bsr(const Crs &crs, size_t blockSize) { for (bsr_ordinal_type row = 0; row < bsr_ordinal_type(hRowMap.size()) - 1; ++row) { - for (size_t ci = hRowMap(row); ci < hRowMap(row + 1); ++ci) { + for (crs_size_type ci = hRowMap(row); ci < hRowMap(row + 1); ++ci) { bsr_ordinal_type col = hColInds(ci); bsr_value_type val = hVals(ci); diff --git a/sparse/impl/KokkosSparse_spadd_numeric_impl.hpp b/sparse/impl/KokkosSparse_spadd_numeric_impl.hpp index fa356dc963..16c228d8ec 100644 --- a/sparse/impl/KokkosSparse_spadd_numeric_impl.hpp +++ b/sparse/impl/KokkosSparse_spadd_numeric_impl.hpp @@ -169,10 +169,11 @@ struct UnsortedNumericSumFunctor { const CcolindsT Bpos; }; -// Helper macro to check that two types are the same (ignoring const) -#define SAME_TYPE(A, B) \ - std::is_same::type, \ - typename std::remove_const::type>::value +// Two types are the same (ignoring const) +template +constexpr bool spadd_numeric_same_type = + std::is_same_v, + typename std::remove_const_t>; template < typename execution_space, typename KernelHandle, typename alno_row_view_t, @@ -193,46 +194,56 @@ void spadd_numeric_impl( typedef typename KernelHandle::nnz_scalar_t scalar_type; // Check that A/B/C data types match KernelHandle types, and that C data types // are nonconst (doesn't matter if A/B types are const) - static_assert(SAME_TYPE(ascalar_t, scalar_type), + static_assert(spadd_numeric_same_type, "A scalar type must match handle scalar type"); - static_assert(SAME_TYPE(bscalar_t, scalar_type), + static_assert(spadd_numeric_same_type, "B scalar type must match handle scalar type"); - static_assert(SAME_TYPE(typename alno_row_view_t::value_type, size_type), - "add_symbolic: A size_type must match KernelHandle size_type " - "(const doesn't matter)"); - static_assert(SAME_TYPE(typename blno_row_view_t::value_type, size_type), - "add_symbolic: B size_type must match KernelHandle size_type " - "(const doesn't matter)"); static_assert( - SAME_TYPE(typename clno_row_view_t::non_const_value_type, size_type), + spadd_numeric_same_type, + "add_symbolic: A size_type must match KernelHandle size_type " + "(const doesn't matter)"); + static_assert( + spadd_numeric_same_type, + "add_symbolic: B size_type must match KernelHandle size_type " + "(const doesn't matter)"); + static_assert( + spadd_numeric_same_type, "add_symbolic: C size_type must match KernelHandle size_type)"); - static_assert(SAME_TYPE(typename alno_nnz_view_t::value_type, ordinal_type), + static_assert(spadd_numeric_same_type, "add_symbolic: A entry type must match KernelHandle entry type " "(aka nnz_lno_t, and const doesn't matter)"); - static_assert(SAME_TYPE(typename blno_nnz_view_t::value_type, ordinal_type), + static_assert(spadd_numeric_same_type, "add_symbolic: B entry type must match KernelHandle entry type " "(aka nnz_lno_t, and const doesn't matter)"); - static_assert(SAME_TYPE(typename clno_nnz_view_t::value_type, ordinal_type), + static_assert(spadd_numeric_same_type, "add_symbolic: C entry type must match KernelHandle entry type " "(aka nnz_lno_t)"); - static_assert(std::is_same::value, + static_assert(std::is_same_v, "add_symbolic: C entry type must not be const"); static_assert( - SAME_TYPE(typename ascalar_nnz_view_t::value_type, scalar_type), + spadd_numeric_same_type, "add_symbolic: A scalar type must match KernelHandle entry type (aka " "nnz_lno_t, and const doesn't matter)"); static_assert( - SAME_TYPE(typename bscalar_nnz_view_t::value_type, scalar_type), + spadd_numeric_same_type, "add_symbolic: B scalar type must match KernelHandle entry type (aka " "nnz_lno_t, and const doesn't matter)"); static_assert( - SAME_TYPE(typename cscalar_nnz_view_t::value_type, scalar_type), + spadd_numeric_same_type, "add_symbolic: C scalar type must match KernelHandle entry type (aka " "nnz_lno_t)"); - static_assert(std::is_same::value, - "add_symbolic: C scalar type must not be const"); + static_assert( + std::is_same_v, + "add_symbolic: C scalar type must not be const"); typedef Kokkos::RangePolicy range_type; auto addHandle = kernel_handle->get_spadd_handle(); // rowmap length can be 0 or 1 if #rows is 0. @@ -269,8 +280,6 @@ void spadd_numeric_impl( addHandle->set_call_numeric(); } -#undef SAME_TYPE - } // namespace Impl } // namespace KokkosSparse diff --git a/sparse/impl/KokkosSparse_spadd_symbolic_impl.hpp b/sparse/impl/KokkosSparse_spadd_symbolic_impl.hpp index 80506e3056..9744e75ec3 100644 --- a/sparse/impl/KokkosSparse_spadd_symbolic_impl.hpp +++ b/sparse/impl/KokkosSparse_spadd_symbolic_impl.hpp @@ -24,10 +24,11 @@ namespace KokkosSparse { namespace Impl { -// Helper macro to check that two types are the same (ignoring const) -#define SAME_TYPE(A, B) \ - std::is_same::type, \ - typename std::remove_const::type>::value +// Two types are the same (ignoring const) +template +constexpr bool spadd_symbolic_same_type = + std::is_same_v, + typename std::remove_const_t>; // get C rowmap for sorted input template , "add_symbolic: A size_type must match KernelHandle size_type (const " "doesn't matter)"); static_assert( - SAME_TYPE(typename blno_row_view_t_::non_const_value_type, size_type), + spadd_symbolic_same_type, "add_symbolic: B size_type must match KernelHandle size_type (const " "doesn't matter)"); static_assert( - SAME_TYPE(typename clno_row_view_t_::non_const_value_type, size_type), + spadd_symbolic_same_type, "add_symbolic: C size_type must match KernelHandle size_type)"); - static_assert(std::is_same::value, + static_assert(std::is_same_v, "add_symbolic: C size_type must not be const"); static_assert( - SAME_TYPE(typename alno_nnz_view_t_::non_const_value_type, ordinal_type), + spadd_symbolic_same_type, "add_symbolic: A entry type must match KernelHandle entry type (aka " "nnz_lno_t, and const doesn't matter)"); static_assert( - SAME_TYPE(typename blno_nnz_view_t_::non_const_value_type, ordinal_type), + spadd_symbolic_same_type, "add_symbolic: B entry type must match KernelHandle entry type (aka " "nnz_lno_t, and const doesn't matter)"); - static_assert(std::is_same::value, + static_assert(std::is_same_v, "add_symbolic: C entry type must not be const"); // symbolic just needs to compute c_rowmap // easy for sorted, but for unsorted is easiest to just compute the whole sum @@ -540,9 +546,7 @@ void spadd_symbolic_impl( "KokkosSparse::SpAdd:Symbolic::InputNotSorted::CountEntries", range_type(exec, 0, nrows), countEntries); KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum( - exec, nrows + 1, c_rowmap_upperbound); - Kokkos::deep_copy(exec, c_nnz_upperbound, - Kokkos::subview(c_rowmap_upperbound, nrows)); + exec, nrows + 1, c_rowmap_upperbound, c_nnz_upperbound); } ordinal_view_t c_entries_uncompressed( Kokkos::view_alloc(exec, Kokkos::WithoutInitializing, @@ -589,13 +593,12 @@ void spadd_symbolic_impl( // provide the number of NNZ in C to user through handle size_type cmax; Kokkos::deep_copy(exec, cmax, Kokkos::subview(c_rowmap, nrows)); + exec.fence("fence before cmax used on host"); addHandle->set_c_nnz(cmax); addHandle->set_call_symbolic(); addHandle->set_call_numeric(false); } -#undef SAME_TYPE - } // namespace Impl } // namespace KokkosSparse diff --git a/sparse/impl/KokkosSparse_spgemm_impl_def.hpp b/sparse/impl/KokkosSparse_spgemm_impl_def.hpp index a420a81c90..54e4e228c8 100644 --- a/sparse/impl/KokkosSparse_spgemm_impl_def.hpp +++ b/sparse/impl/KokkosSparse_spgemm_impl_def.hpp @@ -59,6 +59,7 @@ void KokkosSPGEMM::KokkosSPGEMM_symbolic(c_row_view_t rowmapC_) { + Kokkos::Profiling::pushRegion("KokkosSparse::spgemm_symbolic[NATIVE]"); { if (KOKKOSKERNELS_VERBOSE) { std::cout << "SYMBOLIC PHASE" << std::endl; @@ -162,6 +163,7 @@ void KokkosSPGEMM(tmp); + scalar_t *hash_values = KokkosKernels::Impl::alignPtrTo(tmp); Kokkos::parallel_for( Kokkos::TeamThreadRange(teamMember, team_row_begin, team_row_end), @@ -409,8 +408,7 @@ struct KokkosSPGEMM(tmp); + hm2.values = KokkosKernels::Impl::alignPtrTo(tmp); Kokkos::parallel_for( Kokkos::TeamThreadRange(teamMember, team_row_begin, team_row_end), @@ -498,7 +496,7 @@ struct KokkosSPGEMM(all_shared_memory); + KokkosKernels::Impl::alignPtrTo(all_shared_memory); KokkosKernels::Experimental::HashmapAccumulator< nnz_lno_t, nnz_lno_t, scalar_t, @@ -639,7 +637,7 @@ struct KokkosSPGEMM(all_shared_memory); + KokkosKernels::Impl::alignPtrTo(all_shared_memory); int thread_rank = teamMember.team_rank(); @@ -686,8 +684,7 @@ struct KokkosSPGEMM( - tmp + pow2_hash_size); + KokkosKernels::Impl::alignPtrTo(tmp + pow2_hash_size); } // initialize begins. { @@ -970,7 +967,7 @@ struct KokkosSPGEMM(all_shared_memory); + KokkosKernels::Impl::alignPtrTo(all_shared_memory); int thread_rank = teamMember.team_rank(); @@ -1241,6 +1238,7 @@ void KokkosSPGEMMget_spgemm_handle()->set_c_nnz(result_index); Kokkos::deep_copy(row_mapC, h_rmc); Kokkos::fence(); + Kokkos::Profiling::popRegion(); } template (all_shared_memory); + KokkosKernels::Impl::alignPtrTo(all_shared_memory); KokkosKernels::Experimental::HashmapAccumulator< nnz_lno_t, nnz_lno_t, scalar_t, @@ -468,6 +468,7 @@ void KokkosSPGEMM(tmp); + scalar_t *hash_values = KokkosKernels::Impl::alignPtrTo(tmp); Kokkos::parallel_for( Kokkos::TeamThreadRange(teamMember, team_row_begin, team_row_end), @@ -452,7 +451,7 @@ struct KokkosSPGEMM(all_shared_memory); + KokkosKernels::Impl::alignPtrTo(all_shared_memory); // Create the hashmaps KokkosKernels::Experimental::HashmapAccumulator< @@ -610,7 +609,7 @@ struct KokkosSPGEMM(all_shared_memory); + KokkosKernels::Impl::alignPtrTo(all_shared_memory); int thread_rank = teamMember.team_rank(); int vector_rank = 0; @@ -826,7 +825,7 @@ struct KokkosSPGEMM(all_shared_memory); + KokkosKernels::Impl::alignPtrTo(all_shared_memory); int thread_rank = teamMember.team_rank(); int vector_rank = 0; @@ -871,8 +870,7 @@ struct KokkosSPGEMM( - tmp + pow2_hash_size); + KokkosKernels::Impl::alignPtrTo(tmp + pow2_hash_size); nnz_lno_t num_threads = pow2_hash_size / vector_size; Kokkos::parallel_for( diff --git a/sparse/impl/KokkosSparse_spmv_impl.hpp b/sparse/impl/KokkosSparse_spmv_impl.hpp index a2bb19a44c..f1f4c5700e 100644 --- a/sparse/impl/KokkosSparse_spmv_impl.hpp +++ b/sparse/impl/KokkosSparse_spmv_impl.hpp @@ -32,8 +32,6 @@ namespace KokkosSparse { namespace Impl { -constexpr const char* KOKKOSSPARSE_ALG_NATIVE_MERGE = "native-merge"; - // This TransposeFunctor is functional, but not necessarily performant. template @@ -609,7 +607,8 @@ static void spmv_beta(const execution_space& exec, Handle* handle, typename YVector::const_value_type& beta, const YVector& y) { if (mode[0] == NoTranspose[0]) { - if (handle->algo == SPMV_MERGE_PATH) { + if (handle->algo == SPMV_MERGE_PATH || + handle->algo == SPMV_NATIVE_MERGE_PATH) { SpmvMergeHierarchical::spmv( exec, mode, alpha, A, x, beta, y); } else { @@ -617,7 +616,8 @@ static void spmv_beta(const execution_space& exec, Handle* handle, dobeta, false>(exec, handle, alpha, A, x, beta, y); } } else if (mode[0] == Conjugate[0]) { - if (handle->algo == SPMV_MERGE_PATH) { + if (handle->algo == SPMV_MERGE_PATH || + handle->algo == SPMV_NATIVE_MERGE_PATH) { SpmvMergeHierarchical::spmv( exec, mode, alpha, A, x, beta, y); } else { diff --git a/sparse/impl/KokkosSparse_spmv_team_spec.hpp b/sparse/impl/KokkosSparse_spmv_team_spec.hpp index 156123b113..f065a34fb6 100644 --- a/sparse/impl/KokkosSparse_spmv_team_spec.hpp +++ b/sparse/impl/KokkosSparse_spmv_team_spec.hpp @@ -37,7 +37,7 @@ struct TeamSpmv { return Impl::TeamSpmvInternal::invoke< MemberType, ScalarType, typename ValuesViewType::non_const_value_type, typename IntView::non_const_value_type, dobeta>( - member, x.extent(0), alpha, values.data(), values.stride_0(), + member, y.extent(0), alpha, values.data(), values.stride_0(), row_ptr.data(), row_ptr.stride_0(), colIndices.data(), colIndices.stride_0(), x.data(), x.stride_0(), beta, y.data(), y.stride_0()); @@ -56,7 +56,7 @@ struct TeamVectorSpmv { return Impl::TeamVectorSpmvInternal::invoke< MemberType, ScalarType, typename ValuesViewType::non_const_value_type, typename IntView::non_const_value_type, dobeta>( - member, x.extent(0), alpha, values.data(), values.stride_0(), + member, y.extent(0), alpha, values.data(), values.stride_0(), row_ptr.data(), row_ptr.stride_0(), colIndices.data(), colIndices.stride_0(), x.data(), x.stride_0(), beta, y.data(), y.stride_0()); diff --git a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp index 53a1c853e7..bc31f14791 100644 --- a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp +++ b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp @@ -28,20 +28,12 @@ #ifdef KOKKOSKERNELS_ENABLE_SUPERNODAL_SPTRSV // Enable supernodal sptrsv +#include "KokkosBlas3_trsm.hpp" #include "KokkosSparse_spmv.hpp" #include "KokkosBatched_Util.hpp" #include "KokkosBlas2_team_gemv_spec.hpp" -#endif -#include "KokkosBlas3_trsm.hpp" -#include "KokkosBatched_Trsv_Decl.hpp" #include "KokkosBatched_Trsm_Team_Impl.hpp" -#include "KokkosBlas1_team_axpby.hpp" -#include "KokkosBlas1_axpby.hpp" -#include "KokkosBlas1_set.hpp" -#include "KokkosBatched_LU_Decl.hpp" -#include "KokkosBlas2_serial_gemv_impl.hpp" - -//#define SERIAL_FOR_LOOP +#endif #define KOKKOSKERNELS_SPTRSV_TRILVLSCHED @@ -107,9 +99,7 @@ struct SptrsvWrap { }; /** - * Common base class for sptrsv functors that need to work for both - * point and block matrices. Default version does not support - * blocks + * Common base class for sptrsv functors */ template @@ -121,420 +111,145 @@ struct SptrsvWrap { RHSType rhs; entries_t nodes_grouped_by_level; - using reftype = scalar_t &; - using ArrayType = reftype; - using SumArray = reftype; - - struct SBlock { - template - KOKKOS_INLINE_FUNCTION SBlock(T, size_type, size_type) {} - - KOKKOS_INLINE_FUNCTION - scalar_t *data() { return nullptr; } - - static int shmem_size(size_type, size_type) { return 0; } - }; - - static constexpr size_type BUFF_SIZE = 1; - - Common(const RowMapType &row_map_, - const EntriesType &entries_, - const ValuesType &values_, - LHSType &lhs_, - const RHSType &rhs_, + Common(const RowMapType &row_map_, const EntriesType &entries_, + const ValuesType &values_, LHSType &lhs_, const RHSType &rhs_, const entries_t &nodes_grouped_by_level_, - const size_type block_size_) + const size_type block_size_ = 0) : row_map(row_map_), entries(entries_), values(values_), lhs(lhs_), rhs(rhs_), - nodes_grouped_by_level(nodes_grouped_by_level_) - { - KK_REQUIRE_MSG(block_size_ == 0, - "Tried to use blocks with the unblocked Common?"); - } - - KOKKOS_INLINE_FUNCTION - size_type get_block_size() const { return 0; } - - // lset - KOKKOS_INLINE_FUNCTION - void lset(const size_type row, const scalar_t value) const { - lhs(row) = value; - } - - // add. y += x - KOKKOS_INLINE_FUNCTION - static void add(const member_type &team, const scalar_t& x, scalar_t& y) { - Kokkos::single(Kokkos::PerTeam(team), [&]() { y += x; }); - team.team_barrier(); - } - - // serial add. y += x - KOKKOS_INLINE_FUNCTION - static void add(const scalar_t& x, scalar_t& y) { - y += x; - } - - // divide. b /= A - KOKKOS_INLINE_FUNCTION - static void divide(const member_type &team, scalar_t &b, const scalar_t &A, - scalar_t*) { - Kokkos::single(Kokkos::PerTeam(team), [&]() { b /= A; }); - team.team_barrier(); - } - - // serial divide. b /= A - KOKKOS_INLINE_FUNCTION - static void divide(scalar_t &b, const scalar_t &A, scalar_t*) { - b /= A; - } - - // multiply_subtract. C -= A * B - KOKKOS_INLINE_FUNCTION - static void multiply_subtract(const scalar_t &A, const scalar_t &B, - scalar_t &C) { - C -= A * B; + nodes_grouped_by_level(nodes_grouped_by_level_) { + KK_REQUIRE_MSG(!BlockEnabled, "Blocks are not yet supported."); + KK_REQUIRE_MSG(block_size_ == 0, "Blocks are not yet supported."); } - KOKKOS_INLINE_FUNCTION - static void copy(const member_type&, scalar_t&, const scalar_t&) {} - - // lget - KOKKOS_INLINE_FUNCTION - scalar_t& lget(const size_type row) const { return lhs(row); } - - // rget - KOKKOS_INLINE_FUNCTION - scalar_t rget(const size_type row) const { return rhs(row); } - - // vget - KOKKOS_INLINE_FUNCTION - scalar_t vget(const size_type nnz) const { return values(nnz); } - - // print - KOKKOS_INLINE_FUNCTION - static void print(const scalar_t &item) { std::cout << item << std::endl; } - - KOKKOS_INLINE_FUNCTION - static void print(ArrayType rhs, const int) { std::cout << rhs << std::endl; } - - }; - - // Partial specialization for block support - template - struct Common { - // BSR data is in LayoutRight! - using Layout = Kokkos::LayoutRight; - - using Block = Kokkos::View< - scalar_t **, Layout, typename ValuesType::device_type, - Kokkos::MemoryTraits >; - - // const block - using CBlock = Kokkos::View< - const scalar_t **, Layout, typename ValuesType::device_type, - Kokkos::MemoryTraits >; - - // scratch block - using SBlock = Kokkos::View< - scalar_t **, Layout, typename execution_space::scratch_memory_space, - Kokkos::MemoryTraits >; - - using Vector = Kokkos::View< - scalar_t *, Layout, typename ValuesType::device_type, - Kokkos::MemoryTraits >; - - using CVector = Kokkos::View< - const scalar_t *, Layout, typename ValuesType::device_type, - Kokkos::MemoryTraits >; - - static constexpr size_type BUFF_SIZE = 128; - - using reftype = Vector; - - struct ArrayType - { - scalar_t m_data[BUFF_SIZE]; - - KOKKOS_INLINE_FUNCTION - ArrayType() { init(); } - - KOKKOS_INLINE_FUNCTION - ArrayType(const ArrayType& rhs_) { - for (size_type i = 0; i < BUFF_SIZE; ++i) m_data[i] = rhs_.m_data[i]; - } - - KOKKOS_INLINE_FUNCTION - ArrayType(const Vector&) { init(); } + struct ReduceSumFunctor { + const Common *m_obj; + const lno_t rowid; + lno_t diag; KOKKOS_INLINE_FUNCTION - void init() { - for (size_type i = 0; i < BUFF_SIZE; ++i) m_data[i] = 0; - } - - KOKKOS_INLINE_FUNCTION - ArrayType& operator +=(const ArrayType& rhs_) { - for (size_type i = 0; i < BUFF_SIZE; ++i) m_data[i] += rhs_.m_data[i]; - return *this; - } - - KOKKOS_INLINE_FUNCTION - ArrayType& operator +=(const values_t& rhs_) { - for (int i = 0; i < rhs_.size(); ++i) m_data[i] += rhs_(i); - return *this; + void operator()(size_type i, scalar_t &accum) const { + const auto colid = m_obj->entries(i); + auto val = m_obj->values(i); + auto lhs_colid = m_obj->lhs(colid); + accum -= val * lhs_colid; + KK_KERNEL_ASSERT_MSG(colid != rowid, "Should not have hit diag"); } }; - struct SumArray - { - using reducer = SumArray; - using value_type = ArrayType; - using result_view_type = Kokkos::View; - - private: - value_type& m_value; + struct ReduceSumDiagFunctor { + const Common *m_obj; + const lno_t rowid; + lno_t diag; - public: KOKKOS_INLINE_FUNCTION - SumArray(value_type& value) : m_value(value) {} - - KOKKOS_INLINE_FUNCTION - void join(value_type& dest, const value_type& src) const { - dest += src; + void operator()(size_type i, scalar_t &accum) const { + const auto colid = m_obj->entries(i); + if (colid != rowid) { + auto val = m_obj->values(i); + auto lhs_colid = m_obj->lhs(colid); + accum -= val * lhs_colid; + } else { + diag = i; + } } - - KOKKOS_INLINE_FUNCTION - void init(value_type& val) const { val.init(); } - - KOKKOS_INLINE_FUNCTION - value_type& reference() const { return m_value; } - - KOKKOS_INLINE_FUNCTION - result_view_type view() const { return result_view_type(&m_value, 1); } - - KOKKOS_INLINE_FUNCTION - bool reference_scalar() const { return true; } }; - RowMapType row_map; - EntriesType entries; - ValuesType values; - LHSType lhs; - RHSType rhs; - entries_t nodes_grouped_by_level; - size_type block_size; - size_type block_items; - - Common(const RowMapType &row_map_, - const EntriesType &entries_, - const ValuesType &values_, - LHSType &lhs_, - const RHSType &rhs_, - const entries_t &nodes_grouped_by_level_, - const size_type block_size_) - : row_map(row_map_), - entries(entries_), - values(values_), - lhs(lhs_), - rhs(rhs_), - nodes_grouped_by_level(nodes_grouped_by_level_), - block_size(block_size_), - block_items(block_size * block_size) - { - KK_REQUIRE_MSG(block_size > 0, - "Tried to use block_size=0 with the blocked Common?"); - KK_REQUIRE_MSG(block_size <= 11, "Max supported block size is 11"); - } - - KOKKOS_INLINE_FUNCTION - size_type get_block_size() const { return block_size; } - - // lset - KOKKOS_INLINE_FUNCTION - void lset(const size_type row, const scalar_t &value) const { - KokkosBlas::SerialSet::invoke(value, lget(row)); - } - - KOKKOS_INLINE_FUNCTION - void lset(const size_type row, const CVector &rhs_) const { - auto lvec = lget(row); - assign(lvec, rhs_); - } - - // assign - template - KOKKOS_INLINE_FUNCTION static void assign(const View1 &lhs_, - const View2 &rhs_) { - for (size_t i = 0; i < lhs_.size(); ++i) { - lhs_.data()[i] = rhs_.data()[i]; - } - } - - template - KOKKOS_INLINE_FUNCTION static void assign(const member_type &team, - const View1 &lhs_, - const View2 &rhs_) { - Kokkos::parallel_for(Kokkos::TeamThreadRange(team, lhs_.size()), - [&](const size_type i) { - lhs_.data()[i] = rhs_.data()[i]; - }); - } - - // add. y += x - KOKKOS_INLINE_FUNCTION - static void add(const member_type &team, const CVector& x, const Vector& y) { - KokkosBlas::Experimental::axpy(team, 1.0, x, y); - } - - // serial add. y += x - KOKKOS_INLINE_FUNCTION - static void add(const CVector& x, const Vector& y) { - KokkosBlas::serial_axpy(1.0, x, y); - } - - // divide. b /= A (b = b * A^-1) - KOKKOS_INLINE_FUNCTION - static void divide(const member_type &team, const Vector &b, const CBlock &A, - scalar_t* buff) { - // Need a temp block to do LU of A - const auto block_size = b.size(); - Block LU(buff, block_size, block_size); - assign(team, LU, A); - KokkosBatched::TeamLU::invoke(team, LU); - - // A = LU - // A^-1 = U^-1 * L^-1 - // b = (b * U^-1) * L^-1, so do U trsv first - KokkosBatched::TeamTrsv< - member_type, KokkosBatched::Uplo::Upper, - KokkosBatched::Trans::NoTranspose, KokkosBatched::Diag::NonUnit, - KokkosBatched::Algo::Trsv::Blocked>::invoke(team, 1.0, LU, b); - - KokkosBatched::TeamTrsv< - member_type, KokkosBatched::Uplo::Lower, - KokkosBatched::Trans::NoTranspose, KokkosBatched::Diag::Unit, - KokkosBatched::Algo::Trsv::Blocked>::invoke(team, 1.0, LU, b); - } - - // serial divide. b /= A (b = b * A^-1) - KOKKOS_INLINE_FUNCTION - static void divide(const Vector &b, const CBlock &A, scalar_t* buff) { - // Need a temp block to do LU of A - const auto block_size = b.size(); - Block LU(buff, block_size, block_size); - assign(LU, A); - KokkosBatched::SerialLU::invoke(LU); - - // A = LU - // A^-1 = U^-1 * L^-1 - // b = (b * U^-1) * L^-1, so do U trsv first - KokkosBatched::SerialTrsv< - KokkosBatched::Uplo::Upper, - KokkosBatched::Trans::NoTranspose, KokkosBatched::Diag::NonUnit, - KokkosBatched::Algo::Trsv::Blocked>::invoke(1.0, LU, b); - - KokkosBatched::SerialTrsv< - KokkosBatched::Uplo::Lower, - KokkosBatched::Trans::NoTranspose, KokkosBatched::Diag::Unit, - KokkosBatched::Algo::Trsv::Blocked>::invoke(1.0, LU, b); - } - - // multiply_subtract. C -= A * B - KOKKOS_INLINE_FUNCTION - static void multiply_subtract(const CBlock &A, const CVector &B, - ArrayType &Ca) { - Vector C(&Ca.m_data[0], B.size()); - multiply_subtract(A, B, C); - } - - KOKKOS_INLINE_FUNCTION - static void multiply_subtract(const CBlock &A, const CVector &B, - Vector &C) { - // Use gemv. alpha is hardcoded to -1, beta hardcoded to 1 - KokkosBlas::SerialGemv< - KokkosBlas::Trans::NoTranspose, KokkosBlas::Algo::Gemv::Blocked>:: - invoke(-1.0, A, B, 1.0, C); - } - - KOKKOS_INLINE_FUNCTION - static void copy(const member_type &team, const Vector& lhs_, ArrayType& rhsa) { - CVector rhs_(&rhsa.m_data[0], lhs_.size()); - assign(team, lhs_, rhs_); - } - - // lget - KOKKOS_INLINE_FUNCTION - Vector lget(const size_type row) const { - return Vector(lhs.data() + (row * block_size), block_size); - } - - // rget - KOKKOS_INLINE_FUNCTION - CVector rget(const size_type row) const { - return CVector(rhs.data() + (row * block_size), block_size); - } - - // vget KOKKOS_INLINE_FUNCTION - CBlock vget(const size_type block) const { - return CBlock(values.data() + (block * block_items), block_size, block_size); + static void add_and_divide(scalar_t &lhs_val, const scalar_t &rhs_val, + const scalar_t &diag_val) { + lhs_val = (lhs_val + rhs_val) / diag_val; } - // print - KOKKOS_INLINE_FUNCTION - static void print(const CBlock &item) { - std::cout << "Block: "; - for (size_type i = 0; i < item.extent(0); ++i) { - std::cout << " "; - for (size_type j = 0; j < item.extent(1); ++j) { - std::cout << item(i, j) << " "; + template + KOKKOS_INLINE_FUNCTION void solve_impl(const member_type *team, + const int my_rank, + const long node_count) const { + static_assert( + !((!IsSerial && BlockEnabled) && UseThreadVec), + "ThreadVectorRanges are not yet supported for block-enabled"); + static_assert(!(IsSerial && UseThreadVec), + "Requested thread vector range in serial?"); + + const auto rowid = nodes_grouped_by_level(my_rank + node_count); + const auto soffset = row_map(rowid); + const auto eoffset = row_map(rowid + 1); + const auto rhs_val = rhs(rowid); + scalar_t &lhs_val = lhs(rowid); + + // Set up range to auto-skip diag if is sorted + const auto itr_b = soffset + (IsSorted ? (IsLower ? 0 : 1) : 0); + const auto itr_e = eoffset - (IsSorted ? (IsLower ? 1 : 0) : 0); + + // We don't need the reducer to find the diag item if sorted + using reducer_t = + std::conditional_t; + reducer_t rf{this, rowid, -1}; + + if constexpr (IsSerial) { + KK_KERNEL_ASSERT_MSG(my_rank == 0, "Non zero rank in serial"); + KK_KERNEL_ASSERT_MSG(team == nullptr, "Team provided in serial?"); + for (auto ptr = itr_b; ptr < itr_e; ++ptr) { + rf(ptr, lhs_val); } - std::cout << std::endl; - } - } - - // print - KOKKOS_INLINE_FUNCTION - static void print(const CVector &item) { - std::cout << "Vector: "; - for (size_type i = 0; i < item.extent(0); ++i) { - std::cout << item(i) << " "; + } else { + KK_KERNEL_ASSERT_MSG(team != nullptr, + "Cannot do team operations without team"); + if constexpr (!UseThreadVec) { + Kokkos::parallel_reduce(Kokkos::TeamThreadRange(*team, itr_b, itr_e), + rf, lhs_val); + team->team_barrier(); + } else { + Kokkos::parallel_reduce( + Kokkos::ThreadVectorRange(*team, itr_b, itr_e), rf, lhs_val); } - std::cout << std::endl; - } - - KOKKOS_INLINE_FUNCTION - static void print(const ArrayType& rhs_, const int block_size) - { - std::cout << "Array: "; - for (int i = 0; i < block_size; ++i) { - std::cout << rhs_.m_data[i] << " "; } - std::cout << std::endl; - } - KOKKOS_INLINE_FUNCTION - static void print(const SumArray& rhs_, const int block_size) - { - std::cout << "SumArray: "; - for (int i = 0; i < block_size; ++i) { - std::cout << rhs_.reference().m_data[i] << " "; + // If sorted, we already know the diag. Otherwise, get it from the reducer + rf.diag = IsSorted ? (IsLower ? eoffset - 1 : soffset) : rf.diag; + + // At end, handle the diag element. We need to be careful to avoid race + // conditions here. + if constexpr (IsSerial) { + // Serial case is easy, there's only 1 thread so just do the + // add_and_divide + KK_KERNEL_ASSERT_MSG(rf.diag != -1, "Serial should always know diag"); + add_and_divide(lhs_val, rhs_val, values(rf.diag)); + } else { + if constexpr (IsSorted) { + // Parallel sorted case is complex. All threads know what the diag is. + // If we have a team sharing the work, we need to ensure only one + // thread performs the add_and_divide. + KK_KERNEL_ASSERT_MSG(rf.diag != -1, "Sorted should always know diag"); + if constexpr (!UseThreadVec) { + Kokkos::single(Kokkos::PerTeam(*team), [&]() { + add_and_divide(lhs_val, rhs_val, values(rf.diag)); + }); + } else { + add_and_divide(lhs_val, rhs_val, values(rf.diag)); + } + } else { + // Parallel unsorted case. Only one thread should know what the diag + // item is. We have that one do the add_and_divide. + if (rf.diag != -1) { + add_and_divide(lhs_val, rhs_val, values(rf.diag)); + } + } } - std::cout << std::endl; } - }; template - struct TriLvlSchedTP1SolverFunctor : - public Common - { - using Base = Common; + struct TriLvlSchedTP1SolverFunctor + : public Common { + using Base = Common; long node_count; // like "block" offset into ngbl, my_league is the "local" // offset @@ -546,349 +261,130 @@ struct SptrsvWrap { const entries_t &nodes_grouped_by_level_, const long &node_count_, const size_type block_size_ = 0) - : Base(row_map_, entries_, values_, lhs_, rhs_, nodes_grouped_by_level_, block_size_), - node_count(node_count_) {} - - template - struct ReduceFunctor - { - const Base* m_obj; - lno_t rowid; - - using accum_t = std::conditional_t; - - KOKKOS_INLINE_FUNCTION - void operator()(size_type i, accum_t& accum) const - { - const auto colid = m_obj->entries(i); - if (!AvoidDiag || colid != rowid) { - auto val = m_obj->vget(i); - auto lhs_colid = m_obj->lget(colid); - // accum -= val * lhs_colid; - Base::multiply_subtract(val, lhs_colid, accum); - } - } - }; + : Base(row_map_, entries_, values_, lhs_, rhs_, nodes_grouped_by_level_, + block_size_), + node_count(node_count_) {} KOKKOS_INLINE_FUNCTION void operator()(const member_type &team) const { - using reduce_item = typename Base::ArrayType; - using reducer = typename Base::SumArray; - - const auto my_league = team.league_rank(); // map to rowid - const auto rowid = Base::nodes_grouped_by_level(my_league + node_count); - const auto soffset = Base::row_map(rowid); - const auto eoffset = Base::row_map(rowid + 1); - const auto rhs_rowid = Base::rget(rowid); - - typename Base::reftype lhs_rowid = Base::lget(rowid); - reduce_item reduce = lhs_rowid; - ReduceFunctor rf {this, rowid}; - - Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team, soffset + (IsLower ? 0 : 1), eoffset - (IsLower ? 1 : 0)), - rf, reducer(reduce)); - team.team_barrier(); - - Base::copy(team, lhs_rowid, reduce); - - // Team-shared buffer. Use for team work. - const auto bs = Base::get_block_size(); - typename Base::SBlock shared_buff(team.team_shmem(), bs, bs); - - // At end, finalize rowid == colid - Base::add(team, rhs_rowid, lhs_rowid); // lhs_rowid += rhs(rowid) - auto diag = IsLower ? Base::vget(eoffset - 1) : Base::vget(soffset); - // lhs_rowid /= diag - Base::divide(team, lhs_rowid, diag, shared_buff.data()); + Base::template solve_impl(&team, team.league_rank(), + node_count); } KOKKOS_INLINE_FUNCTION void operator()(const UnsortedTag &, const member_type &team) const { - using reduce_item = typename Base::ArrayType; - using reducer = typename Base::SumArray; - - const auto my_league = team.league_rank(); // map to rowid - const auto rowid = Base::nodes_grouped_by_level(my_league + node_count); - const auto soffset = Base::row_map(rowid); - const auto eoffset = Base::row_map(rowid + 1); - const auto rhs_rowid = Base::rget(rowid); - - typename Base::reftype lhs_rowid = Base::lget(rowid); - reduce_item reduce = lhs_rowid; - ReduceFunctor rf {this, rowid}; + Base::template solve_impl( + &team, team.league_rank(), node_count); + } + }; - Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team, soffset, eoffset), - rf, reducer(reduce)); - team.team_barrier(); + // Lower vs Upper Multi-block Functors - Base::copy(team, lhs_rowid, reduce); + template + struct TriLvlSchedRPSolverFunctor + : public Common { + using Base = Common; - // Find diag ptr - size_type diag = 0; - Kokkos::parallel_reduce( - Kokkos::TeamThreadRange(team, soffset, eoffset), - [&](const size_type ptr, size_type& diag_inner) { - const auto colid = Base::entries(ptr); - if (colid == rowid) { - diag_inner = ptr; - } - }, - Kokkos::Max(diag)); - team.team_barrier(); + TriLvlSchedRPSolverFunctor(const RowMapType &row_map_, + const EntriesType &entries_, + const ValuesType &values_, LHSType &lhs_, + const RHSType &rhs_, + const entries_t &nodes_grouped_by_level_, + const size_type block_size_ = 0) + : Base(row_map_, entries_, values_, lhs_, rhs_, nodes_grouped_by_level_, + block_size_) {} - // Team-shared buffer. Use for team work. - const auto bs = Base::get_block_size(); - typename Base::SBlock shared_buff(team.team_shmem(), bs, bs); + KOKKOS_INLINE_FUNCTION + void operator()(const lno_t i) const { + Base::template solve_impl(nullptr, 0, i); + } - // At end, finalize rowid == colid - Base::add(team, rhs_rowid, lhs_rowid); // lhs_rowid += rhs(rowid) - Base::divide(team, lhs_rowid, Base::vget(diag), shared_buff.data()); + KOKKOS_INLINE_FUNCTION + void operator()(const UnsortedTag &, const lno_t i) const { + Base::template solve_impl(nullptr, 0, i); } }; template - struct TriLvlSchedTP1SolverFunctorDiagValues { - RowMapType row_map; - EntriesType entries; - ValuesType values; - LHSType lhs; - RHSType rhs; - entries_t nodes_grouped_by_level; - ValuesType diagonal_values; // inserted according to rowid + class LHSType, class RHSType, bool IsLower> + struct TriLvlSchedTP1SingleBlockFunctor + : public Common { + using Base = + Common; - const bool is_lowertri; + entries_t nodes_per_level; long node_count; // like "block" offset into ngbl, my_league is the "local" // offset - long dense_nrows; + long lvl_start; + long lvl_end; + const int dense_nrows; + const int cutoff; + // team_size: each team can be assigned a row, if there are enough rows... - TriLvlSchedTP1SolverFunctorDiagValues( + TriLvlSchedTP1SingleBlockFunctor( const RowMapType &row_map_, const EntriesType &entries_, const ValuesType &values_, LHSType &lhs_, const RHSType &rhs_, - const entries_t &nodes_grouped_by_level_, - const ValuesType &diagonal_values_, const bool is_lowertri_, - long node_count_, long dense_nrows_ = 0) - : row_map(row_map_), - entries(entries_), - values(values_), - lhs(lhs_), - rhs(rhs_), - nodes_grouped_by_level(nodes_grouped_by_level_), - diagonal_values(diagonal_values_), - is_lowertri(is_lowertri_), + const entries_t &nodes_grouped_by_level_, entries_t &nodes_per_level_, + long node_count_, long lvl_start_, long lvl_end_, + const int dense_nrows_ = 0, const int cutoff_ = 0) + : Base(row_map_, entries_, values_, lhs_, rhs_, + nodes_grouped_by_level_), + nodes_per_level(nodes_per_level_), node_count(node_count_), - dense_nrows(dense_nrows_) {} + lvl_start(lvl_start_), + lvl_end(lvl_end_), + dense_nrows(dense_nrows_), + cutoff(cutoff_) {} - KOKKOS_INLINE_FUNCTION - void operator()(const member_type &team) const { - auto my_league = team.league_rank(); // map to rowid - auto rowid = nodes_grouped_by_level(my_league + node_count); - auto my_rank = team.team_rank(); - - auto soffset = row_map(rowid); - auto eoffset = row_map(rowid + 1); - auto rhs_rowid = rhs(rowid); - scalar_t diff = scalar_t(0.0); - - Kokkos::parallel_reduce( - Kokkos::TeamThreadRange(team, soffset, eoffset), - [&](const long ptr, scalar_t &tdiff) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - tdiff = tdiff - val * lhs(colid); - } - }, - diff); + // SingleBlock: Only one block (or league) executing; team_rank used to map + // thread to row - team.team_barrier(); + template + KOKKOS_INLINE_FUNCTION void common_impl(const member_type &team) const { + auto mut_node_count = node_count; - // At end, finalize rowid == colid - // only one thread should do this; can also use Kokkos::single - if (my_rank == 0) { - // lhs(rowid) = is_lowertri ? (rhs_rowid+diff)/values(eoffset-1) : - // (rhs_rowid+diff)/values(soffset); - lhs(rowid) = (rhs_rowid + diff) / diagonal_values(rowid); + for (auto lvl = lvl_start; lvl < lvl_end; ++lvl) { + const auto nodes_this_lvl = nodes_per_level(lvl); + const auto my_team_rank = team.team_rank(); + const auto loop_cutoff = LargerCutoff ? cutoff : my_team_rank + 1; + // If cutoff > team_size, then a thread will be responsible for multiple + // rows - this may be a helpful scenario depending on occupancy etc. + for (int my_rank = my_team_rank; my_rank < loop_cutoff; + my_rank += team.team_size()) { + if (my_rank < nodes_this_lvl) { + Base::template solve_impl( + &team, my_rank, mut_node_count); + } + } + mut_node_count += nodes_this_lvl; + team.team_barrier(); } } - }; - - template - struct TriLvlSchedTP2SolverFunctor { - RowMapType row_map; - EntriesType entries; - ValuesType values; - LHSType lhs; - RHSType rhs; - entries_t nodes_grouped_by_level; - - long node_count; // like "block" offset into ngbl, my_league is the "local" - // offset - long node_groups; - long dense_nrows; - - TriLvlSchedTP2SolverFunctor(const RowMapType &row_map_, - const EntriesType &entries_, - const ValuesType &values_, LHSType &lhs_, - const RHSType &rhs_, - const entries_t &nodes_grouped_by_level_, - long node_count_, - long node_groups_ = 0, long dense_nrows_ = 0) - : row_map(row_map_), - entries(entries_), - values(values_), - lhs(lhs_), - rhs(rhs_), - nodes_grouped_by_level(nodes_grouped_by_level_), - node_count(node_count_), - node_groups(node_groups_), - dense_nrows(dense_nrows_) {} KOKKOS_INLINE_FUNCTION void operator()(const member_type &team) const { - auto my_league = team.league_rank(); // map to rowid - - size_t nrows = row_map.extent(0) - 1; - - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, 0, node_groups), [&](const long ng) { - auto rowid = nodes_grouped_by_level(node_count + - my_league * node_groups + ng); - if (size_t(rowid) < nrows) { - auto soffset = row_map(rowid); - auto eoffset = row_map(rowid + 1); - auto rhs_rowid = rhs(rowid); - scalar_t diff = scalar_t(0.0); - - Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange(team, soffset, eoffset), - [&](const long ptr, scalar_t &tdiff) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - tdiff = tdiff - val * lhs(colid); - } - }, - diff); - - // ASSUMPTION: sorted diagonal value located at eoffset - 1 - lhs(rowid) = IsLower - ? (rhs_rowid + diff) / values(eoffset - 1) - : (rhs_rowid + diff) / values(soffset); - } // end if - }); // end TeamThreadRange - - team.team_barrier(); + common_impl(team); } KOKKOS_INLINE_FUNCTION void operator()(const UnsortedTag &, const member_type &team) const { - auto my_league = team.league_rank(); // map to rowid - - size_t nrows = row_map.extent(0) - 1; - - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, 0, node_groups), [&](const long ng) { - auto rowid = nodes_grouped_by_level(node_count + - my_league * node_groups + ng); - if (size_t(rowid) < nrows) { - auto soffset = row_map(rowid); - auto eoffset = row_map(rowid + 1); - auto rhs_rowid = rhs(rowid); - scalar_t diff = scalar_t(0.0); - - auto diag = -1; - Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange(team, soffset, eoffset), - [&](const long ptr, scalar_t &tdiff) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - tdiff = tdiff - val * lhs(colid); - } else { - diag = ptr; - } - }, - diff); - - lhs(rowid) = (rhs_rowid + diff) / values(diag); - } // end if - }); // end TeamThreadRange - - team.team_barrier(); + common_impl(team); } - }; - - // Lower vs Upper Multi-block Functors - - template - struct TriLvlSchedRPSolverFunctor : - public Common - { - using Base = Common; - - TriLvlSchedRPSolverFunctor(const RowMapType &row_map_, - const EntriesType &entries_, - const ValuesType &values_, LHSType &lhs_, - const RHSType &rhs_, - const entries_t &nodes_grouped_by_level_, - const size_type block_size_ = 0) - : Base(row_map_, entries_, values_, lhs_, rhs_, nodes_grouped_by_level_, block_size_) - {} KOKKOS_INLINE_FUNCTION - void operator()(const lno_t i) const { - // Thread-local buffers. Use for Serial (non-team) work - scalar_t buff1[Base::BUFF_SIZE]; - - const auto rowid = Base::nodes_grouped_by_level(i); - const auto soffset = Base::row_map(rowid); - const auto eoffset = Base::row_map(rowid + 1); - const auto rhs_rowid = Base::rget(rowid); - - typename Base::reftype lhs_rowid = Base::lget(rowid); - - for (auto ptr = soffset + (IsLower ? 0 : 1); ptr < eoffset - (IsLower ? 1 : 0); ++ptr) { - const auto colid = Base::entries(ptr); - auto val = Base::vget(ptr); - auto lhs_colid = Base::lget(colid); - // lhs_rowid -= val * lhs_colid - Base::multiply_subtract(val, lhs_colid, lhs_rowid); - } // end for ptr - - Base::add(rhs_rowid, lhs_rowid); - auto diag = IsLower ? Base::vget(eoffset - 1) : Base::vget(soffset); - Base::divide(lhs_rowid, diag, &buff1[0]); + void operator()(const LargerCutoffTag &, const member_type &team) const { + common_impl(team); } KOKKOS_INLINE_FUNCTION - void operator()(const UnsortedTag &, const lno_t i) const { - // Thread-local buffers. Use for Serial (non-team) work - scalar_t buff1[Base::BUFF_SIZE]; - - auto rowid = Base::nodes_grouped_by_level(i); - long soffset = Base::row_map(rowid); - long eoffset = Base::row_map(rowid + 1); - auto rhs_rowid = Base::rget(rowid); - - typename Base::reftype lhs_rowid = Base::lget(rowid); - - size_type diag = 0; - for (auto ptr = soffset; ptr < eoffset; ++ptr) { - const auto colid = Base::entries(ptr); - if (colid != rowid) { - auto val = Base::values(ptr); - auto lhs_colid = Base::lget(colid); - Base::multiply_subtract(val, lhs_colid, lhs_rowid); - } else { - diag = ptr; - } - } // end for ptr - Base::add(rhs_rowid, lhs_rowid); // lhs_rowid += rhs(rowid) - Base::divide(lhs_rowid, Base::vget(diag), &buff1[0]); + void operator()(const UnsortedLargerCutoffTag &, + const member_type &team) const { + common_impl(team); } }; @@ -1500,301 +996,6 @@ struct SptrsvWrap { }; #endif - // -------------------------------- - // Single-block functors - // -------------------------------- - - template - struct TriLvlSchedTP1SingleBlockFunctor { - RowMapType row_map; - EntriesType entries; - ValuesType values; - LHSType lhs; - RHSType rhs; - entries_t nodes_grouped_by_level; - entries_t nodes_per_level; - - long node_count; // like "block" offset into ngbl, my_league is the "local" - // offset - long lvl_start; - long lvl_end; - const int dense_nrows; - const int cutoff; - // team_size: each team can be assigned a row, if there are enough rows... - - TriLvlSchedTP1SingleBlockFunctor( - const RowMapType &row_map_, const EntriesType &entries_, - const ValuesType &values_, LHSType &lhs_, const RHSType &rhs_, - const entries_t &nodes_grouped_by_level_, entries_t &nodes_per_level_, - long node_count_, long lvl_start_, long lvl_end_, - const int dense_nrows_ = 0, const int cutoff_ = 0) - : row_map(row_map_), - entries(entries_), - values(values_), - lhs(lhs_), - rhs(rhs_), - nodes_grouped_by_level(nodes_grouped_by_level_), - nodes_per_level(nodes_per_level_), - node_count(node_count_), - lvl_start(lvl_start_), - lvl_end(lvl_end_), - dense_nrows(dense_nrows_), - cutoff(cutoff_) {} - - // SingleBlock: Only one block (or league) executing; team_rank used to map - // thread to row - - KOKKOS_INLINE_FUNCTION - void operator()(const member_type &team) const { - long mut_node_count = node_count; - typename entries_t::non_const_value_type rowid{0}; - typename RowMapType::non_const_value_type soffset{0}; - typename RowMapType::non_const_value_type eoffset{0}; - typename RHSType::non_const_value_type rhs_val{0}; - scalar_t diff = scalar_t(0.0); - - for (auto lvl = lvl_start; lvl < lvl_end; ++lvl) { - auto nodes_this_lvl = nodes_per_level(lvl); - int my_rank = team.team_rank(); - diff = scalar_t(0.0); - - if (my_rank < nodes_this_lvl) { - // THIS is where the mapping of threadid to rowid happens - rowid = nodes_grouped_by_level(my_rank + mut_node_count); - soffset = row_map(rowid); - eoffset = row_map(rowid + 1); - rhs_val = rhs(rowid); - -#ifdef SERIAL_FOR_LOOP - for (auto ptr = soffset; ptr < eoffset; ++ptr) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - diff -= val * lhs(colid); - } - } -#else - auto trange = eoffset - soffset; - Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange(team, trange), - [&](const int loffset, scalar_t &tdiff) { - auto ptr = soffset + loffset; - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - tdiff -= val * lhs(colid); - } - }, - diff); -#endif - - // ASSUMPTION: sorted diagonal value located at eoffset - 1 for lower - // tri, soffset for upper tri - if (IsLower) - lhs(rowid) = (rhs_val + diff) / values(eoffset - 1); - else - lhs(rowid) = (rhs_val + diff) / values(soffset); - } // end if team.team_rank() < nodes_this_lvl - { - // Update mut_node_count from nodes_per_level(lvl) each iteration of - // lvl per thread - mut_node_count += nodes_this_lvl; - } - team.team_barrier(); - } // end for lvl - } // end operator - - KOKKOS_INLINE_FUNCTION - void operator()(const UnsortedTag &, const member_type &team) const { - long mut_node_count = node_count; - typename entries_t::non_const_value_type rowid{0}; - typename RowMapType::non_const_value_type soffset{0}; - typename RowMapType::non_const_value_type eoffset{0}; - typename RHSType::non_const_value_type rhs_val{0}; - scalar_t diff = scalar_t(0.0); - - for (auto lvl = lvl_start; lvl < lvl_end; ++lvl) { - auto nodes_this_lvl = nodes_per_level(lvl); - int my_rank = team.team_rank(); - diff = scalar_t(0.0); - - if (my_rank < nodes_this_lvl) { - // THIS is where the mapping of threadid to rowid happens - rowid = nodes_grouped_by_level(my_rank + mut_node_count); - soffset = row_map(rowid); - eoffset = row_map(rowid + 1); - rhs_val = rhs(rowid); - -#ifdef SERIAL_FOR_LOOP - auto diag = -1; - for (auto ptr = soffset; ptr < eoffset; ++ptr) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - diff -= val * lhs(colid); - } else { - diag = ptr; - } - } -#else - auto trange = eoffset - soffset; - auto diag = -1; - Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange(team, trange), - [&](const int loffset, scalar_t &tdiff) { - auto ptr = soffset + loffset; - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - tdiff -= val * lhs(colid); - } else { - diag = ptr; - } - }, - diff); -#endif - lhs(rowid) = (rhs_val + diff) / values(diag); - } // end if team.team_rank() < nodes_this_lvl - { - // Update mut_node_count from nodes_per_level(lvl) each iteration of - // lvl per thread - mut_node_count += nodes_this_lvl; - } - team.team_barrier(); - } // end for lvl - } // end operator - - KOKKOS_INLINE_FUNCTION - void operator()(const LargerCutoffTag &, const member_type &team) const { - long mut_node_count = node_count; - typename entries_t::non_const_value_type rowid{0}; - typename RowMapType::non_const_value_type soffset{0}; - typename RowMapType::non_const_value_type eoffset{0}; - typename RHSType::non_const_value_type rhs_val{0}; - scalar_t diff = scalar_t(0.0); - - for (auto lvl = lvl_start; lvl < lvl_end; ++lvl) { - auto nodes_this_lvl = nodes_per_level(lvl); - int my_team_rank = team.team_rank(); - // If cutoff > team_size, then a thread will be responsible for multiple - // rows - this may be a helpful scenario depending on occupancy etc. - for (int my_rank = my_team_rank; my_rank < cutoff; - my_rank += team.team_size()) { - diff = scalar_t(0.0); - if (my_rank < nodes_this_lvl) { - // THIS is where the mapping of threadid to rowid happens - rowid = nodes_grouped_by_level(my_rank + mut_node_count); - soffset = row_map(rowid); - eoffset = row_map(rowid + 1); - rhs_val = rhs(rowid); - -#ifdef SERIAL_FOR_LOOP - for (auto ptr = soffset; ptr < eoffset; ++ptr) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - diff -= val * lhs(colid); - } - } -#else - auto trange = eoffset - soffset; - Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange(team, trange), - [&](const int loffset, scalar_t &tdiff) { - auto ptr = soffset + loffset; - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - tdiff -= val * lhs(colid); - } - }, - diff); -#endif - - // ASSUMPTION: sorted diagonal value located at eoffset - 1 for - // lower tri, soffset for upper tri - if (IsLower) - lhs(rowid) = (rhs_val + diff) / values(eoffset - 1); - else - lhs(rowid) = (rhs_val + diff) / values(soffset); - } // end if team.team_rank() < nodes_this_lvl - } // end for my_rank loop - { - // Update mut_node_count from nodes_per_level(lvl) each iteration of - // lvl per thread - mut_node_count += nodes_this_lvl; - } - team.team_barrier(); - } // end for lvl - } // end tagged operator - - KOKKOS_INLINE_FUNCTION - void operator()(const UnsortedLargerCutoffTag &, - const member_type &team) const { - long mut_node_count = node_count; - typename entries_t::non_const_value_type rowid{0}; - typename RowMapType::non_const_value_type soffset{0}; - typename RowMapType::non_const_value_type eoffset{0}; - typename RHSType::non_const_value_type rhs_val{0}; - scalar_t diff = scalar_t(0.0); - - for (auto lvl = lvl_start; lvl < lvl_end; ++lvl) { - auto nodes_this_lvl = nodes_per_level(lvl); - int my_team_rank = team.team_rank(); - // If cutoff > team_size, then a thread will be responsible for multiple - // rows - this may be a helpful scenario depending on occupancy etc. - for (int my_rank = my_team_rank; my_rank < cutoff; - my_rank += team.team_size()) { - diff = scalar_t(0.0); - if (my_rank < nodes_this_lvl) { - // THIS is where the mapping of threadid to rowid happens - rowid = nodes_grouped_by_level(my_rank + mut_node_count); - soffset = row_map(rowid); - eoffset = row_map(rowid + 1); - rhs_val = rhs(rowid); - -#ifdef SERIAL_FOR_LOOP - auto diag = -1; - for (auto ptr = soffset; ptr < eoffset; ++ptr) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - diff -= val * lhs(colid); - } else { - diag = ptr; - } - } -#else - auto trange = eoffset - soffset; - auto diag = -1; - Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange(team, trange), - [&](const int loffset, scalar_t &tdiff) { - auto ptr = soffset + loffset; - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - tdiff -= val * lhs(colid); - } else { - diag = ptr; - } - }, - diff); -#endif - lhs(rowid) = (rhs_val + diff) / values(diag); - } // end if team.team_rank() < nodes_this_lvl - } // end for my_rank loop - { - // Update mut_node_count from nodes_per_level(lvl) each iteration of - // lvl per thread - mut_node_count += nodes_this_lvl; - } - team.team_barrier(); - } // end for lvl - } // end tagged operator - }; - // // End of functors, begin external API // @@ -1802,11 +1003,9 @@ struct SptrsvWrap { #ifdef KOKKOSKERNELS_SPTRSV_CUDAGRAPHSUPPORT template - static void tri_solve_cg(TriSolveHandle &thandle, - const RowMapType row_map, - const EntriesType entries, - const ValuesType values, const RHSType &rhs, - LHSType &lhs) { + static void tri_solve_cg(TriSolveHandle &thandle, const RowMapType row_map, + const EntriesType entries, const ValuesType values, + const RHSType &rhs, LHSType &lhs) { typename TriSolveHandle::SPTRSVcudaGraphWrapperType *lcl_cudagraph = thandle.get_sptrsvCudaGraph(); @@ -1846,8 +1045,8 @@ struct SptrsvWrap { Kokkos::Experimental::require( policy, Kokkos::Experimental::WorkItemProperty::HintLightWeight), - TriLvlSchedTP1SolverFunctor( + TriLvlSchedTP1SolverFunctor( row_map, entries, values, lhs, rhs, nodes_grouped_by_level, node_count)); @@ -1871,10 +1070,11 @@ struct SptrsvWrap { #endif -#define FunctorTypeMacro(Functor, IsLower, BlockEnabled) \ - Functor +#define FunctorTypeMacro(Functor, IsLower, BlockEnabled) \ + Functor - template static void lower_tri_solve(execution_space &space, TriSolveHandle &thandle, const RowMapType row_map, @@ -1892,12 +1092,14 @@ struct SptrsvWrap { const auto hnodes_per_level = thandle.get_host_nodes_per_level(); const auto nodes_grouped_by_level = thandle.get_nodes_grouped_by_level(); const auto block_size = thandle.get_block_size(); - const auto block_enabled = thandle.is_block_enabled(); - assert(block_enabled == BlockEnabled); + const auto block_enabled = false; // thandle.is_block_enabled(); + assert(block_size == 0); // Set up functor types - using LowerRPFunc = FunctorTypeMacro(TriLvlSchedRPSolverFunctor, true, BlockEnabled); - using LowerTPFunc = FunctorTypeMacro(TriLvlSchedTP1SolverFunctor, true, BlockEnabled); + using LowerRPFunc = + FunctorTypeMacro(TriLvlSchedRPSolverFunctor, true, false); + using LowerTPFunc = + FunctorTypeMacro(TriLvlSchedTP1SolverFunctor, true, false); #if defined(KOKKOSKERNELS_ENABLE_SUPERNODAL_SPTRSV) using namespace KokkosSparse::Experimental; @@ -1963,28 +1165,29 @@ struct SptrsvWrap { #endif if (thandle.get_algorithm() == KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_RP) { - LowerRPFunc lrpp(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, block_size); + LowerRPFunc lrpp(row_map, entries, values, lhs, rhs, + nodes_grouped_by_level, block_size); Kokkos::parallel_for( "parfor_fixed_lvl", Kokkos::Experimental::require( - range_policy(space, node_count, node_count + lvl_nodes), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), + range_policy(space, node_count, node_count + lvl_nodes), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), lrpp); } else if (thandle.get_algorithm() == KokkosSparse::Experimental::SPTRSVAlgorithm:: SEQLVLSCHD_TP1) { - LowerTPFunc ltpp(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, node_count, block_size); + LowerTPFunc ltpp(row_map, entries, values, lhs, rhs, + nodes_grouped_by_level, node_count, block_size); int team_size = thandle.get_team_size(); - auto tp = team_size == -1 ? team_policy(space, lvl_nodes, Kokkos::AUTO) : team_policy(space, lvl_nodes, team_size); - const int scratch_size = LowerTPFunc::SBlock::shmem_size(block_size, block_size); - tp = tp.set_scratch_size(0, Kokkos::PerTeam(scratch_size)); + auto tp = team_size == -1 + ? team_policy(space, lvl_nodes, Kokkos::AUTO) + : team_policy(space, lvl_nodes, team_size); Kokkos::parallel_for( - "parfor_l_team", - Kokkos::Experimental::require( - tp, - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - ltpp); + "parfor_l_team", + Kokkos::Experimental::require( + tp, Kokkos::Experimental::WorkItemProperty::HintLightWeight), + ltpp); } // TP2 algorithm has issues with some offset-ordinal combo to be // addressed @@ -2031,7 +1234,8 @@ struct SptrsvWrap { else if (thandle.get_algorithm() == SPTRSVAlgorithm::SUPERNODAL_NAIVE || thandle.get_algorithm() == SPTRSVAlgorithm::SUPERNODAL_ETREE || thandle.get_algorithm() == SPTRSVAlgorithm::SUPERNODAL_DAG) { - KK_REQUIRE_MSG(!block_enabled, "Block matrices not yet supported for supernodal"); + KK_REQUIRE_MSG(!block_enabled, + "Block matrices not yet supported for supernodal"); #ifdef profile_supernodal_etree size_t flops = 0; @@ -2180,7 +1384,8 @@ struct SptrsvWrap { SPTRSVAlgorithm::SUPERNODAL_SPMV || thandle.get_algorithm() == SPTRSVAlgorithm::SUPERNODAL_SPMV_DAG) { - KK_REQUIRE_MSG(!block_enabled, "Block matrices not yet supported for supernodal"); + KK_REQUIRE_MSG(!block_enabled, + "Block matrices not yet supported for supernodal"); #ifdef profile_supernodal_etree Kokkos::Timer timer; timer.reset(); @@ -2265,7 +1470,7 @@ struct SptrsvWrap { #endif } // end lower_tri_solve - template static void upper_tri_solve(execution_space &space, TriSolveHandle &thandle, const RowMapType row_map, @@ -2281,16 +1486,18 @@ struct SptrsvWrap { // Keep this a host View, create device version and copy to back to host // during scheduling This requires making sure the host view in the handle // is properly updated after the symbolic phase - auto nodes_per_level = thandle.get_nodes_per_level(); - auto hnodes_per_level = thandle.get_host_nodes_per_level(); + auto nodes_per_level = thandle.get_nodes_per_level(); + auto hnodes_per_level = thandle.get_host_nodes_per_level(); auto nodes_grouped_by_level = thandle.get_nodes_grouped_by_level(); - const auto block_size = thandle.get_block_size(); - const auto block_enabled = thandle.is_block_enabled(); - assert(block_enabled == BlockEnabled); + const auto block_size = thandle.get_block_size(); + const auto block_enabled = false; // thandle.is_block_enabled(); + assert(block_size == 0); // Set up functor types - using UpperRPFunc = FunctorTypeMacro(TriLvlSchedRPSolverFunctor, false, BlockEnabled); - using UpperTPFunc = FunctorTypeMacro(TriLvlSchedTP1SolverFunctor, false, BlockEnabled); + using UpperRPFunc = + FunctorTypeMacro(TriLvlSchedRPSolverFunctor, false, false); + using UpperTPFunc = + FunctorTypeMacro(TriLvlSchedTP1SolverFunctor, false, false); #if defined(KOKKOSKERNELS_ENABLE_SUPERNODAL_SPTRSV) using namespace KokkosSparse::Experimental; @@ -2356,7 +1563,8 @@ struct SptrsvWrap { if (thandle.get_algorithm() == KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_RP) { - UpperRPFunc urpp(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, block_size); + UpperRPFunc urpp(row_map, entries, values, lhs, rhs, + nodes_grouped_by_level, block_size); Kokkos::parallel_for( "parfor_fixed_lvl", Kokkos::Experimental::require( @@ -2366,17 +1574,17 @@ struct SptrsvWrap { } else if (thandle.get_algorithm() == KokkosSparse::Experimental::SPTRSVAlgorithm:: SEQLVLSCHD_TP1) { - UpperTPFunc utpp(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, node_count, block_size); + UpperTPFunc utpp(row_map, entries, values, lhs, rhs, + nodes_grouped_by_level, node_count, block_size); int team_size = thandle.get_team_size(); - auto tp = team_size == -1 ? team_policy(space, lvl_nodes, Kokkos::AUTO) : team_policy(space, lvl_nodes, team_size); - const int scratch_size = UpperTPFunc::SBlock::shmem_size(block_size, block_size); - tp = tp.set_scratch_size(0, Kokkos::PerTeam(scratch_size)); + auto tp = team_size == -1 + ? team_policy(space, lvl_nodes, Kokkos::AUTO) + : team_policy(space, lvl_nodes, team_size); Kokkos::parallel_for( - "parfor_u_team", - Kokkos::Experimental::require( - tp, - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - utpp); + "parfor_u_team", + Kokkos::Experimental::require( + tp, Kokkos::Experimental::WorkItemProperty::HintLightWeight), + utpp); } // TP2 algorithm has issues with some offset-ordinal combo to be // addressed @@ -2422,7 +1630,8 @@ tstf); } // end elseif else if (thandle.get_algorithm() == SPTRSVAlgorithm::SUPERNODAL_NAIVE || thandle.get_algorithm() == SPTRSVAlgorithm::SUPERNODAL_ETREE || thandle.get_algorithm() == SPTRSVAlgorithm::SUPERNODAL_DAG) { - KK_REQUIRE_MSG(!block_enabled, "Block matrices not yet supported for supernodal"); + KK_REQUIRE_MSG(!block_enabled, + "Block matrices not yet supported for supernodal"); #ifdef profile_supernodal_etree size_t flops = 0; @@ -2676,7 +1885,8 @@ tstf); } // end elseif SPTRSVAlgorithm::SUPERNODAL_SPMV || thandle.get_algorithm() == SPTRSVAlgorithm::SUPERNODAL_SPMV_DAG) { - KK_REQUIRE_MSG(!block_enabled, "Block matrices not yet supported for supernodal"); + KK_REQUIRE_MSG(!block_enabled, + "Block matrices not yet supported for supernodal"); #ifdef profile_supernodal_etree Kokkos::Timer timer; @@ -2841,8 +2051,7 @@ tstf); } // end elseif // team_size_singleblock is unimportant } - for (size_type chainlink = 0; chainlink < num_chain_entries; - ++chainlink) { + for (size_type chainlink = 0; chainlink < num_chain_entries; ++chainlink) { size_type schain = h_chain_ptr(chainlink); size_type echain = h_chain_ptr(chainlink + 1); @@ -2850,21 +2059,21 @@ tstf); } // end elseif // if team_size is -1 (unset), get recommended size from Kokkos TriLvlSchedTP1SolverFunctor - tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, - node_count); + tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, + node_count); if (team_size == -1) { team_size = - team_policy(space, 1, 1, vector_size) - .team_size_recommended(tstf, Kokkos::ParallelForTag()); + team_policy(space, 1, 1, vector_size) + .team_size_recommended(tstf, Kokkos::ParallelForTag()); } size_type lvl_nodes = hnodes_per_level(schain); // lvl == echain???? Kokkos::parallel_for( - "parfor_l_team_chain1", - Kokkos::Experimental::require( - team_policy(space, lvl_nodes, team_size, vector_size), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - tstf); + "parfor_l_team_chain1", + Kokkos::Experimental::require( + team_policy(space, lvl_nodes, team_size, vector_size), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + tstf); node_count += lvl_nodes; } else { @@ -2876,41 +2085,43 @@ tstf); } // end elseif if (team_size_singleblock <= 0) { team_size_singleblock = - team_policy(space, 1, 1, vector_size) - .team_size_recommended( - SingleBlockFunctor(row_map, entries, values, lhs, rhs, - nodes_grouped_by_level, - nodes_per_level, node_count, schain, - echain), - Kokkos::ParallelForTag()); + team_policy(space, 1, 1, vector_size) + .team_size_recommended( + SingleBlockFunctor(row_map, entries, values, lhs, rhs, + nodes_grouped_by_level, + nodes_per_level, node_count, schain, + echain), + Kokkos::ParallelForTag()); } if (cutoff <= team_size_singleblock) { - SingleBlockFunctor - tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, - nodes_per_level, node_count, schain, echain); + SingleBlockFunctor tstf(row_map, entries, values, lhs, rhs, + nodes_grouped_by_level, nodes_per_level, + node_count, schain, echain); Kokkos::parallel_for( - "parfor_l_team_chainmulti", - Kokkos::Experimental::require( - team_policy(space, 1, team_size_singleblock, vector_size), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - tstf); + "parfor_l_team_chainmulti", + Kokkos::Experimental::require( + team_policy(space, 1, team_size_singleblock, vector_size), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + tstf); } else { // team_size_singleblock < cutoff => kernel must allow for a // block-stride internally - SingleBlockFunctor - tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, - nodes_per_level, node_count, schain, echain, 0, - cutoff); + SingleBlockFunctor tstf(row_map, entries, values, lhs, rhs, + nodes_grouped_by_level, nodes_per_level, + node_count, schain, echain, 0, cutoff); Kokkos::parallel_for( - "parfor_l_team_chainmulti_cutoff", - Kokkos::Experimental::require( - large_cutoff_policy_type(1, team_size_singleblock, - vector_size), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - tstf); + "parfor_l_team_chainmulti_cutoff", + Kokkos::Experimental::require( + large_cutoff_policy_type(1, team_size_singleblock, + vector_size), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + tstf); } - node_count += lvl_nodes; + // TODO: space.fence() + Kokkos::fence(); // TODO - is this necessary? that is, can the + // parallel_for launch before the s/echain values have + // been updated? } // TODO: space.fence() Kokkos::fence(); // TODO - is this necessary? that is, can the @@ -2924,19 +2135,21 @@ tstf); } // end elseif // -------------------------------- template - static void tri_solve_streams( - const std::vector &execspace_v, - const std::vector &thandle_v, - const std::vector &row_map_v, - const std::vector &entries_v, - const std::vector &values_v, - const std::vector &rhs_v, std::vector &lhs_v) { + static void tri_solve_streams(const std::vector &execspace_v, + const std::vector &thandle_v, + const std::vector &row_map_v, + const std::vector &entries_v, + const std::vector &values_v, + const std::vector &rhs_v, + std::vector &lhs_v) { // NOTE: Only support SEQLVLSCHD_RP and SEQLVLSCHD_TP1 at this moment using nodes_per_level_type = typename TriSolveHandle::hostspace_nnz_lno_view_t; using nodes_grouped_by_level_type = typename TriSolveHandle::nnz_lno_view_t; - using RPPointFunctor = FunctorTypeMacro(TriLvlSchedRPSolverFunctor, IsLower, false); - using TPPointFunctor = FunctorTypeMacro(TriLvlSchedTP1SolverFunctor, IsLower, false); + using RPPointFunctor = + FunctorTypeMacro(TriLvlSchedRPSolverFunctor, IsLower, false); + using TPPointFunctor = + FunctorTypeMacro(TriLvlSchedTP1SolverFunctor, IsLower, false); // Create vectors for handles' data in streams int nstreams = execspace_v.size(); @@ -2969,17 +2182,20 @@ tstf); } // end elseif "parfor_fixed_lvl", range_policy(execspace_v[i], node_count_v[i], node_count_v[i] + lvl_nodes), - RPPointFunctor( - row_map_v[i], entries_v[i], values_v[i], lhs_v[i], - rhs_v[i], nodes_grouped_by_level_v[i])); + RPPointFunctor(row_map_v[i], entries_v[i], values_v[i], + lhs_v[i], rhs_v[i], + nodes_grouped_by_level_v[i])); } else if (thandle_v[i]->get_algorithm() == KokkosSparse::Experimental::SPTRSVAlgorithm:: SEQLVLSCHD_TP1) { int team_size = thandle_v[i]->get_team_size(); - auto tp = team_size == -1 ? team_policy(execspace_v[i], lvl_nodes, Kokkos::AUTO) : team_policy(execspace_v[i], lvl_nodes, team_size); - TPPointFunctor - tstf(row_map_v[i], entries_v[i], values_v[i], lhs_v[i], - rhs_v[i], nodes_grouped_by_level_v[i], node_count_v[i]); + auto tp = + team_size == -1 + ? team_policy(execspace_v[i], lvl_nodes, Kokkos::AUTO) + : team_policy(execspace_v[i], lvl_nodes, team_size); + TPPointFunctor tstf(row_map_v[i], entries_v[i], values_v[i], + lhs_v[i], rhs_v[i], + nodes_grouped_by_level_v[i], node_count_v[i]); Kokkos::parallel_for("parfor_l_team", tp, tstf); } node_count_v[i] += lvl_nodes; diff --git a/sparse/impl/KokkosSparse_sptrsv_solve_spec.hpp b/sparse/impl/KokkosSparse_sptrsv_solve_spec.hpp index d4418b7bf8..d8deee382c 100644 --- a/sparse/impl/KokkosSparse_sptrsv_solve_spec.hpp +++ b/sparse/impl/KokkosSparse_sptrsv_solve_spec.hpp @@ -136,8 +136,8 @@ struct SPTRSV_SOLVEget_algorithm() == KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1CHAIN) { - Sptrsv::template tri_solve_chain(space, *sptrsv_handle, row_map, entries, values, - b, x); + Sptrsv::template tri_solve_chain(space, *sptrsv_handle, row_map, + entries, values, b, x); } else { #ifdef KOKKOSKERNELS_SPTRSV_CUDAGRAPHSUPPORT using ExecSpace = typename RowMapType::memory_space::execution_space; @@ -165,8 +165,8 @@ struct SPTRSV_SOLVEget_algorithm() == KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1CHAIN) { - Sptrsv::template tri_solve_chain(space, *sptrsv_handle, row_map, entries, values, - b, x); + Sptrsv::template tri_solve_chain(space, *sptrsv_handle, row_map, + entries, values, b, x); } else { #ifdef KOKKOSKERNELS_SPTRSV_CUDAGRAPHSUPPORT using ExecSpace = typename RowMapType::memory_space::execution_space; @@ -219,8 +219,9 @@ struct SPTRSV_SOLVE(execspace_v, sptrsv_handle_v, row_map_v, - entries_v, values_v, b_v, x_v); + Sptrsv::template tri_solve_streams(execspace_v, sptrsv_handle_v, + row_map_v, entries_v, values_v, + b_v, x_v); } else { for (int i = 0; i < static_cast(execspace_v.size()); i++) { if (sptrsv_handle_v[i]->is_symbolic_complete() == false) { @@ -229,8 +230,9 @@ struct SPTRSV_SOLVE(execspace_v, sptrsv_handle_v, row_map_v, - entries_v, values_v, b_v, x_v); + Sptrsv::template tri_solve_streams(execspace_v, sptrsv_handle_v, + row_map_v, entries_v, values_v, + b_v, x_v); } Kokkos::Profiling::popRegion(); } diff --git a/sparse/src/KokkosSparse_par_ilut.hpp b/sparse/src/KokkosSparse_par_ilut.hpp index 8ded6209ec..edaae8192f 100644 --- a/sparse/src/KokkosSparse_par_ilut.hpp +++ b/sparse/src/KokkosSparse_par_ilut.hpp @@ -44,9 +44,11 @@ namespace KokkosSparse { namespace Experimental { -#define KOKKOSKERNELS_PAR_ILUT_SAME_TYPE(A, B) \ - std::is_same::type, \ - typename std::remove_const::type>::value +// Two types are the same (ignoring const) +template +constexpr bool parilut_same_type = + std::is_same_v, + typename std::remove_const_t>; /// @brief Performs the symbolic phase of par_ilut. /// This is a non-blocking function. @@ -78,24 +80,24 @@ void par_ilut_symbolic(KernelHandle* handle, ARowMapType& A_rowmap, using size_type = typename KernelHandle::size_type; using ordinal_type = typename KernelHandle::nnz_lno_t; - static_assert(KOKKOSKERNELS_PAR_ILUT_SAME_TYPE( - typename ARowMapType::non_const_value_type, size_type), - "par_ilut_symbolic: A size_type must match KernelHandle " - "size_type (const doesn't matter)"); - static_assert(KOKKOSKERNELS_PAR_ILUT_SAME_TYPE( - typename AEntriesType::non_const_value_type, ordinal_type), + static_assert( + parilut_same_type, + "par_ilut_symbolic: A size_type must match KernelHandle " + "size_type (const doesn't matter)"); + static_assert(parilut_same_type, "par_ilut_symbolic: A entry type must match KernelHandle entry " "type (aka nnz_lno_t, and const doesn't matter)"); - static_assert(KOKKOSKERNELS_PAR_ILUT_SAME_TYPE( - typename LRowMapType::non_const_value_type, size_type), - "par_ilut_symbolic: L size_type must match KernelHandle " - "size_type (const doesn't matter)"); + static_assert( + parilut_same_type, + "par_ilut_symbolic: L size_type must match KernelHandle " + "size_type (const doesn't matter)"); - static_assert(KOKKOSKERNELS_PAR_ILUT_SAME_TYPE( - typename URowMapType::non_const_value_type, size_type), - "par_ilut_symbolic: U size_type must match KernelHandle " - "size_type (const doesn't matter)"); + static_assert( + parilut_same_type, + "par_ilut_symbolic: U size_type must match KernelHandle " + "size_type (const doesn't matter)"); static_assert(Kokkos::is_view::value, "par_ilut_symbolic: A_rowmap is not a Kokkos::View."); @@ -118,25 +120,25 @@ void par_ilut_symbolic(KernelHandle* handle, ARowMapType& A_rowmap, "par_ilut_symbolic: A_rowmap, L_rowmap and U_rowmap must all " "have rank 1."); - static_assert(std::is_same::value, + static_assert(std::is_same_v, "par_ilut_symbolic: The output L_rowmap must be nonconst."); - static_assert(std::is_same::value, + static_assert(std::is_same_v, "par_ilut_symbolic: The output U_rowmap must be nonconst."); - static_assert(std::is_same::value, + static_assert(std::is_same_v, "par_ilut_symbolic: Views LRowMapType and ARowMapType have " "different device_types."); - static_assert(std::is_same::value, + static_assert(std::is_same_v, "par_ilut_symbolic: Views LRowMapType and URowMapType have " "different device_types."); static_assert( - std::is_same< + std::is_same_v< typename LRowMapType::device_type::execution_space, - typename KernelHandle::PAR_ILUTHandleType::execution_space>::value, + typename KernelHandle::PAR_ILUTHandleType::execution_space>, "par_ilut_symbolic: KernelHandle and Views have different execution " "spaces."); @@ -165,26 +167,26 @@ void par_ilut_symbolic(KernelHandle* handle, ARowMapType& A_rowmap, typename ARowMapType::const_value_type*, typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, typename ARowMapType::device_type, - Kokkos::MemoryTraits >; + Kokkos::MemoryTraits>; using AEntries_Internal = Kokkos::View< typename AEntriesType::const_value_type*, typename KokkosKernels::Impl::GetUnifiedLayout< AEntriesType>::array_layout, typename AEntriesType::device_type, - Kokkos::MemoryTraits >; + Kokkos::MemoryTraits>; using LRowMap_Internal = Kokkos::View< typename LRowMapType::non_const_value_type*, typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, typename LRowMapType::device_type, - Kokkos::MemoryTraits >; + Kokkos::MemoryTraits>; using URowMap_Internal = Kokkos::View< typename URowMapType::non_const_value_type*, typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, typename URowMapType::device_type, - Kokkos::MemoryTraits >; + Kokkos::MemoryTraits>; ARowMap_Internal A_rowmap_i = A_rowmap; AEntries_Internal A_entries_i = A_entries; @@ -240,46 +242,43 @@ void par_ilut_numeric(KernelHandle* handle, ARowMapType& A_rowmap, using scalar_type = typename KernelHandle::nnz_scalar_t; static_assert( - KOKKOSKERNELS_PAR_ILUT_SAME_TYPE( - typename ARowMapType::non_const_value_type, size_type), + parilut_same_type, "par_ilut_numeric: A size_type must match KernelHandle size_type " "(const doesn't matter)"); - static_assert(KOKKOSKERNELS_PAR_ILUT_SAME_TYPE( - typename AEntriesType::non_const_value_type, ordinal_type), + static_assert(parilut_same_type, "par_ilut_numeric: A entry type must match KernelHandle entry " "type (aka nnz_lno_t, and const doesn't matter)"); - static_assert(KOKKOSKERNELS_PAR_ILUT_SAME_TYPE( - typename AValuesType::value_type, scalar_type), - "par_ilut_numeric: A scalar type must match KernelHandle entry " - "type (aka nnz_scalar_t, and const doesn't matter)"); + static_assert( + parilut_same_type, + "par_ilut_numeric: A scalar type must match KernelHandle entry " + "type (aka nnz_scalar_t, and const doesn't matter)"); static_assert( - KOKKOSKERNELS_PAR_ILUT_SAME_TYPE( - typename LRowMapType::non_const_value_type, size_type), + parilut_same_type, "par_ilut_numeric: L size_type must match KernelHandle size_type " "(const doesn't matter)"); - static_assert(KOKKOSKERNELS_PAR_ILUT_SAME_TYPE( - typename LEntriesType::non_const_value_type, ordinal_type), + static_assert(parilut_same_type, "par_ilut_numeric: L entry type must match KernelHandle entry " "type (aka nnz_lno_t, and const doesn't matter)"); - static_assert(KOKKOSKERNELS_PAR_ILUT_SAME_TYPE( - typename LValuesType::value_type, scalar_type), - "par_ilut_numeric: L scalar type must match KernelHandle entry " - "type (aka nnz_scalar_t, and const doesn't matter)"); + static_assert( + parilut_same_type, + "par_ilut_numeric: L scalar type must match KernelHandle entry " + "type (aka nnz_scalar_t, and const doesn't matter)"); static_assert( - KOKKOSKERNELS_PAR_ILUT_SAME_TYPE( - typename URowMapType::non_const_value_type, size_type), + parilut_same_type, "par_ilut_numeric: U size_type must match KernelHandle size_type " "(const doesn't matter)"); - static_assert(KOKKOSKERNELS_PAR_ILUT_SAME_TYPE( - typename UEntriesType::non_const_value_type, ordinal_type), + static_assert(parilut_same_type, "par_ilut_numeric: U entry type must match KernelHandle entry " "type (aka nnz_lno_t, and const doesn't matter)"); - static_assert(KOKKOSKERNELS_PAR_ILUT_SAME_TYPE( - typename UValuesType::value_type, scalar_type), - "par_ilut_numeric: U scalar type must match KernelHandle entry " - "type (aka nnz_scalar_t, and const doesn't matter)"); + static_assert( + parilut_same_type, + "par_ilut_numeric: U scalar type must match KernelHandle entry " + "type (aka nnz_scalar_t, and const doesn't matter)"); static_assert(Kokkos::is_view::value, "par_ilut_numeric: A_rowmap is not a Kokkos::View."); @@ -330,73 +329,71 @@ void par_ilut_numeric(KernelHandle* handle, ARowMapType& A_rowmap, "par_ilut_numeric: A_values, L_values and U_values must all " "have rank 1."); - static_assert( - std::is_same::value, - "par_ilut_numeric: The output L_entries must be nonconst."); - static_assert(std::is_same::value, + static_assert(std::is_same_v, + "par_ilut_numeric: The output L_entries must be nonconst."); + static_assert(std::is_same_v, "par_ilut_numeric: The output L_values must be nonconst."); - static_assert( - std::is_same::value, - "par_ilut_numeric: The output U_entries must be nonconst."); - static_assert(std::is_same::value, + static_assert(std::is_same_v, + "par_ilut_numeric: The output U_entries must be nonconst."); + static_assert(std::is_same_v, "par_ilut_numeric: The output U_values must be nonconst."); - static_assert(std::is_same::value, + static_assert(std::is_same_v, "par_ilut_numeric: Views LRowMapType and ARowMapType have " "different device_types."); - static_assert(std::is_same::value, + static_assert(std::is_same_v, "par_ilut_numeric: Views LEntriesType and AEntriesType have " "different device_types."); - static_assert(std::is_same::value, + static_assert(std::is_same_v, "par_ilut_numeric: Views LValuesType and AValuesType have " "different device_types."); - static_assert(std::is_same::value, + static_assert(std::is_same_v, "par_ilut_numeric: Views LRowMapType and URowMapType have " "different device_types."); - static_assert(std::is_same::value, + static_assert(std::is_same_v, "par_ilut_numeric: Views LEntriesType and UEntriesType have " "different device_types."); - static_assert(std::is_same::value, + static_assert(std::is_same_v, "par_ilut_numeric: Views LValuesType and UValuesType have " "different device_types."); static_assert( - std::is_same< + std::is_same_v< typename LRowMapType::device_type::execution_space, - typename KernelHandle::PAR_ILUTHandleType::execution_space>::value, + typename KernelHandle::PAR_ILUTHandleType::execution_space>, "par_ilut_numeric: KernelHandle and Views have different execution " "spaces."); static_assert( - std::is_same< + std::is_same_v< typename LEntriesType::device_type::execution_space, - typename KernelHandle::PAR_ILUTHandleType::execution_space>::value, + typename KernelHandle::PAR_ILUTHandleType::execution_space>, "par_ilut_numeric: KernelHandle and Views have different execution " "spaces."); static_assert( - std::is_same< + std::is_same_v< typename LValuesType::device_type::execution_space, - typename KernelHandle::PAR_ILUTHandleType::execution_space>::value, + typename KernelHandle::PAR_ILUTHandleType::execution_space>, "par_ilut_numeric: KernelHandle and Views have different execution " "spaces."); static_assert( - std::is_same::value, + std::is_same_v, "par_ilut_numeric: rowmap and entries have different device types."); static_assert( - std::is_same::value, + std::is_same_v, "par_ilut_numeric: rowmap and values have different device types."); // Check if symbolic has been called @@ -431,58 +428,58 @@ void par_ilut_numeric(KernelHandle* handle, ARowMapType& A_rowmap, typename ARowMapType::const_value_type*, typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, typename ARowMapType::device_type, - Kokkos::MemoryTraits >; + Kokkos::MemoryTraits>; using AEntries_Internal = Kokkos::View< typename AEntriesType::const_value_type*, typename KokkosKernels::Impl::GetUnifiedLayout< AEntriesType>::array_layout, typename AEntriesType::device_type, - Kokkos::MemoryTraits >; + Kokkos::MemoryTraits>; using AValues_Internal = Kokkos::View< typename AValuesType::const_value_type*, typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, typename AValuesType::device_type, - Kokkos::MemoryTraits >; + Kokkos::MemoryTraits>; using LRowMap_Internal = Kokkos::View< typename LRowMapType::non_const_value_type*, typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, typename LRowMapType::device_type, - Kokkos::MemoryTraits >; + Kokkos::MemoryTraits>; using LEntries_Internal = Kokkos::View::array_layout, typename LEntriesType::device_type, - Kokkos::MemoryTraits >; + Kokkos::MemoryTraits>; using LValues_Internal = Kokkos::View< typename LValuesType::non_const_value_type*, typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, typename LValuesType::device_type, - Kokkos::MemoryTraits >; + Kokkos::MemoryTraits>; using URowMap_Internal = Kokkos::View< typename URowMapType::non_const_value_type*, typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, typename URowMapType::device_type, - Kokkos::MemoryTraits >; + Kokkos::MemoryTraits>; using UEntries_Internal = Kokkos::View::array_layout, typename UEntriesType::device_type, - Kokkos::MemoryTraits >; + Kokkos::MemoryTraits>; using UValues_Internal = Kokkos::View< typename UValuesType::non_const_value_type*, typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, typename UValuesType::device_type, - Kokkos::MemoryTraits >; + Kokkos::MemoryTraits>; ARowMap_Internal A_rowmap_i = A_rowmap; AEntries_Internal A_entries_i = A_entries; @@ -519,6 +516,4 @@ void par_ilut_numeric(KernelHandle* handle, ARowMapType& A_rowmap, } // namespace Experimental } // namespace KokkosSparse -#undef KOKKOSKERNELS_PAR_ILUT_SAME_TYPE - #endif // KOKKOSSPARSE_PAR_ILUT_HPP_ diff --git a/sparse/src/KokkosSparse_spmv.hpp b/sparse/src/KokkosSparse_spmv.hpp index ddbef56504..de98701b7c 100644 --- a/sparse/src/KokkosSparse_spmv.hpp +++ b/sparse/src/KokkosSparse_spmv.hpp @@ -247,13 +247,6 @@ void spmv(const ExecutionSpace& space, Handle* handle, const char mode[], YVector_Internal y_i(y); bool useNative = is_spmv_algorithm_native(handle->get_algorithm()); - // Also use the native algorithm if SPMV_FAST_SETUP was selected and - // rocSPARSE is the possible TPL to use. Native is faster in this case. -#ifdef KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE - if (handle->get_algorithm() == SPMV_FAST_SETUP && - std::is_same_v) - useNative = true; -#endif // Now call the proper implementation depending on isBSR and the rank of X/Y if constexpr (!isBSR) { @@ -288,7 +281,7 @@ void spmv(const ExecutionSpace& space, Handle* handle, const char mode[], #ifdef KOKKOS_ENABLE_SYCL if constexpr (std::is_same_v) { - useNative = useNative || (mode[0] == Conjugate[0]); + useNative = useNative || (mode[0] != NoTranspose[0]); } #endif #endif diff --git a/sparse/src/KokkosSparse_spmv_deprecated.hpp b/sparse/src/KokkosSparse_spmv_deprecated.hpp index f29caaec0c..0faef2ef4e 100644 --- a/sparse/src/KokkosSparse_spmv_deprecated.hpp +++ b/sparse/src/KokkosSparse_spmv_deprecated.hpp @@ -191,20 +191,18 @@ spmv(const ExecutionSpace& space, // Default to fast setup, since this handle can't be reused SPMVAlgorithm algo = SPMV_FAST_SETUP; // Translate the Controls algorithm selection to the SPMVHandle algorithm. - // This maintains the old behavior, where any manually set name that isn't - // "tpl" gives native. - // - // This also uses the behavior set by #2021: "merge" was a hint to use - // cuSPARSE merge path, but that path is gone so just use the normal TPL. - // "merge-path" means to use the KK merge-path implementation. // // And also support the 3 different BSR algorithms by their old names. if (controls.isParameter("algorithm")) { std::string algoName = controls.getParameter("algorithm"); - if (algoName == "merge" || algoName == "tpl") + if (algoName == "tpl") algo = SPMV_FAST_SETUP; - else if (algoName == "native-merge") + else if (algoName == "native") + algo = SPMV_NATIVE; + else if (algoName == "merge") algo = SPMV_MERGE_PATH; + else if (algoName == "native-merge") + algo = SPMV_NATIVE_MERGE_PATH; else if (algoName == "v4.1") algo = SPMV_BSR_V41; else if (algoName == "v4.2") diff --git a/sparse/src/KokkosSparse_spmv_handle.hpp b/sparse/src/KokkosSparse_spmv_handle.hpp index 6d23d2bde1..38f6056615 100644 --- a/sparse/src/KokkosSparse_spmv_handle.hpp +++ b/sparse/src/KokkosSparse_spmv_handle.hpp @@ -36,8 +36,11 @@ enum SPMVAlgorithm { /// is only used once. SPMV_NATIVE, /// Use the best KokkosKernels implementation, even if a TPL /// implementation is available. - SPMV_MERGE_PATH, /// Use load-balancing merge path algorithm (for CrsMatrix - /// only) + SPMV_MERGE_PATH, /// Use algorithm optimized for matrices with + /// imbalanced/irregular sparsity patterns (merge path or + /// similar). May call a TPL. For CrsMatrix only. + SPMV_NATIVE_MERGE_PATH, /// Use the KokkosKernels implementation of merge + /// path. For CrsMatrix only. SPMV_BSR_V41, /// Use experimental version 4.1 algorithm (for BsrMatrix only) SPMV_BSR_V42, /// Use experimental version 4.2 algorithm (for BsrMatrix only) SPMV_BSR_TC /// Use experimental tensor core algorithm (for BsrMatrix only) @@ -59,6 +62,7 @@ inline const char* get_spmv_algorithm_name(SPMVAlgorithm a) { case SPMV_FAST_SETUP: return "SPMV_FAST_SETUP"; case SPMV_NATIVE: return "SPMV_NATIVE"; case SPMV_MERGE_PATH: return "SPMV_MERGE_PATH"; + case SPMV_NATIVE_MERGE_PATH: return "SPMV_NATIVE_MERGE_PATH"; case SPMV_BSR_V41: return "SPMV_BSR_V41"; case SPMV_BSR_V42: return "SPMV_BSR_V42"; case SPMV_BSR_TC: return "SPMV_BSR_TC"; @@ -73,10 +77,11 @@ inline const char* get_spmv_algorithm_name(SPMVAlgorithm a) { inline bool is_spmv_algorithm_native(SPMVAlgorithm a) { switch (a) { case SPMV_NATIVE: - case SPMV_MERGE_PATH: + case SPMV_NATIVE_MERGE_PATH: case SPMV_BSR_V41: case SPMV_BSR_V42: case SPMV_BSR_TC: return true; + // DEFAULT, FAST_SETUP and MERGE_PATH may call TPLs default: return false; } } @@ -351,6 +356,7 @@ struct SPMVHandle } else { switch (get_algorithm()) { case SPMV_MERGE_PATH: + case SPMV_NATIVE_MERGE_PATH: throw std::invalid_argument(std::string("SPMVHandle: algorithm ") + get_spmv_algorithm_name(get_algorithm()) + " cannot be used if A is a BsrMatrix"); diff --git a/sparse/src/KokkosSparse_spmv_team.hpp b/sparse/src/KokkosSparse_spmv_team.hpp index 6c68478501..c3f2bfa49f 100644 --- a/sparse/src/KokkosSparse_spmv_team.hpp +++ b/sparse/src/KokkosSparse_spmv_team.hpp @@ -62,7 +62,7 @@ int KOKKOS_INLINE_FUNCTION team_spmv( return 1; } - if (x.extent(0) != y.extent(0) || (x.extent(0) + 1) != row_ptr.extent(0)) { + if ((x.extent(0) + 1) != row_ptr.extent(0)) { Kokkos::printf( "KokkosSparse::spmv: Dimensions of x, y, and row_ptr do not match: " "x: %d, y: %d, row_ptr: %d", @@ -116,7 +116,7 @@ int KOKKOS_INLINE_FUNCTION team_vector_spmv( return 1; } - if (x.extent(0) != y.extent(0) || (x.extent(0) + 1) != row_ptr.extent(0)) { + if ((x.extent(0) + 1) != row_ptr.extent(0)) { Kokkos::printf( "KokkosSparse::spmv: Dimensions of x, y, and row_ptr do not match: " "x: %d, y: %d, row_ptr: %d", diff --git a/sparse/src/KokkosSparse_sptrsv_handle.hpp b/sparse/src/KokkosSparse_sptrsv_handle.hpp index 53e9926f62..6c0cef4f6e 100644 --- a/sparse/src/KokkosSparse_sptrsv_handle.hpp +++ b/sparse/src/KokkosSparse_sptrsv_handle.hpp @@ -1022,7 +1022,6 @@ class SPTRSVHandle { } bool is_block_enabled() const { return block_size > 0; } - void set_symbolic_complete() { this->symbolic_complete = true; } void set_symbolic_incomplete() { this->symbolic_complete = false; } diff --git a/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp b/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp index be2588483f..33eb052135 100644 --- a/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp +++ b/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp @@ -392,7 +392,15 @@ void spmv_rocsparse(const Kokkos::HIP& exec, Handle* handle, const char mode[], &vecY, y.extent_int(0), y_data, rocsparse_compute_type())); - rocsparse_spmv_alg alg = rocsparse_spmv_alg_default; + // Default to using the "stream" algorithm which has almost no setup cost, + // and performs well for reasonably balanced matrices + rocsparse_spmv_alg alg = rocsparse_spmv_alg_csr_stream; + if (handle->get_algorithm() == SPMV_MERGE_PATH) { + // Only use the "adaptive" algorithm if the user has indicated that the + // matrix is very imbalanced, by asking for merge path. This algorithm + // has fairly expensive setup + alg = rocsparse_spmv_alg_csr_adaptive; + } KokkosSparse::Impl::RocSparse_CRS_SpMV_Data* subhandle; if (handle->tpl_rank1) { diff --git a/sparse/unit_test/Test_Sparse_spgemm.hpp b/sparse/unit_test/Test_Sparse_spgemm.hpp index bd1e68c370..139e47dcef 100644 --- a/sparse/unit_test/Test_Sparse_spgemm.hpp +++ b/sparse/unit_test/Test_Sparse_spgemm.hpp @@ -69,7 +69,10 @@ void randomize_matrix_values(const Values &v) { ScalarType randStart, randEnd; KokkosKernels::Impl::getRandomBounds(50.0, randStart, randEnd); Kokkos::Random_XorShift64_Pool pool(13718); - Kokkos::fill_random(v, pool, randStart, randEnd); + // Instead of sampling from [-50, 50] or [-50-50i, 50+50i], + // sample from [1, 50] or [1+i, 50+50i]. That way relative + // error between values can't become large if values happen to sum close to 0. + Kokkos::fill_random(v, pool, randEnd / 50.0, randEnd); } template @@ -254,6 +257,8 @@ void test_spgemm(lno_t m, lno_t k, lno_t n, size_type nnz, lno_t bandwidth, m, k, nnz, row_size_variance, bandwidth); crsMat_t B = KokkosSparse::Impl::kk_generate_sparse_matrix( k, n, nnz, row_size_variance, bandwidth); + randomize_matrix_values(A.values); + randomize_matrix_values(B.values); KokkosSparse::sort_crs_matrix(A); KokkosSparse::sort_crs_matrix(B); diff --git a/sparse/unit_test/Test_Sparse_spmv.hpp b/sparse/unit_test/Test_Sparse_spmv.hpp index 2057a8ba14..0921a1b45a 100644 --- a/sparse/unit_test/Test_Sparse_spmv.hpp +++ b/sparse/unit_test/Test_Sparse_spmv.hpp @@ -518,7 +518,11 @@ template (algo, numRows, nnz, bandwidth, row_size_variance, heavy); } diff --git a/sparse/unit_test/Test_Sparse_spmv_bsr.hpp b/sparse/unit_test/Test_Sparse_spmv_bsr.hpp index e9b23298f9..699afb2510 100644 --- a/sparse/unit_test/Test_Sparse_spmv_bsr.hpp +++ b/sparse/unit_test/Test_Sparse_spmv_bsr.hpp @@ -383,9 +383,9 @@ void test_spmv_combos(const char *mode, const Bsr &a, const Crs &acrs, using handle_t = SPMVHandle; // cover a variety of algorithms - std::vector handles; + std::vector> handles; for (SPMVAlgorithm algo : {SPMV_DEFAULT, SPMV_NATIVE, SPMV_BSR_V41}) - handles.push_back(new handle_t(algo)); + handles.push_back(std::make_unique(algo)); // Tensor core algorithm temporarily disabled, fails on V100 /* @@ -405,14 +405,14 @@ void test_spmv_combos(const char *mode, const Bsr &a, const Crs &acrs, } */ - for (handle_t *handle : handles) { + for (std::unique_ptr &handle : handles) { for (scalar_type alpha : {scalar_type(0), scalar_type(1), scalar_type(-1), scalar_type(3.7)}) { for (scalar_type beta : {scalar_type(0), scalar_type(1), scalar_type(-1), scalar_type(-1.5)}) { - test_spmv(handle, mode, alpha, beta, a, acrs, maxNnzPerRow, x, y); + test_spmv(handle.get(), mode, alpha, beta, a, acrs, maxNnzPerRow, x, y); if (beta == scalar_type(0)) { - test_spmv(handle, mode, alpha, beta, a, acrs, maxNnzPerRow, + test_spmv(handle.get(), mode, alpha, beta, a, acrs, maxNnzPerRow, x_with_nans, y_with_nans); } } @@ -644,9 +644,9 @@ void test_spm_mv_combos(const char *mode, const Bsr &a, const Crs &acrs, SPMVHandle; // cover a variety of algorithms - std::vector handles; + std::vector> handles; for (SPMVAlgorithm algo : {SPMV_DEFAULT, SPMV_NATIVE, SPMV_BSR_V41}) - handles.push_back(new handle_t(algo)); + handles.push_back(std::make_unique(algo)); // Tensor core algorithm temporarily disabled, fails on V100 /* @@ -670,14 +670,15 @@ void test_spm_mv_combos(const char *mode, const Bsr &a, const Crs &acrs, auto [x, y] = random_multivecs_for_spm_mv(mode, a, numVecs); auto [x_with_nans, y_with_nans] = random_multivecs_for_spm_mv(mode, a, numVecs, true); - for (handle_t *handle : handles) { + for (std::unique_ptr &handle : handles) { for (scalar_type alpha : {scalar_type(0), scalar_type(1), scalar_type(-1), scalar_type(3.7)}) { for (scalar_type beta : {scalar_type(0), scalar_type(1), scalar_type(-1), scalar_type(-1.5)}) { - test_spm_mv(handle, mode, alpha, beta, a, acrs, maxNnzPerRow, x, y); + test_spm_mv(handle.get(), mode, alpha, beta, a, acrs, maxNnzPerRow, x, + y); if (beta == scalar_type(0)) { - test_spm_mv(handle, mode, alpha, beta, a, acrs, maxNnzPerRow, + test_spm_mv(handle.get(), mode, alpha, beta, a, acrs, maxNnzPerRow, x_with_nans, y_with_nans); } } diff --git a/sparse/unit_test/Test_Sparse_sptrsv.hpp b/sparse/unit_test/Test_Sparse_sptrsv.hpp index 35e7d57572..2c23a5ae67 100644 --- a/sparse/unit_test/Test_Sparse_sptrsv.hpp +++ b/sparse/unit_test/Test_Sparse_sptrsv.hpp @@ -47,12 +47,12 @@ template struct SptrsvTest { // Define useful types - using RowMapType = Kokkos::View; - using EntriesType = Kokkos::View; - using ValuesType = Kokkos::View; - using execution_space = typename device::execution_space; - using memory_space = typename device::memory_space; - using KernelHandle = KokkosKernels::Experimental::KokkosKernelsHandle< + using RowMapType = Kokkos::View; + using EntriesType = Kokkos::View; + using ValuesType = Kokkos::View; + using execution_space = typename device::execution_space; + using memory_space = typename device::memory_space; + using KernelHandle = KokkosKernels::Experimental::KokkosKernelsHandle< size_type, lno_t, scalar_t, execution_space, memory_space, memory_space>; using Crs = CrsMatrix; @@ -123,9 +123,28 @@ struct SptrsvTest { return A; } - static std::tuple - create_crs_lhs_rhs(const std::vector>& fixture) - { + static bool do_cusparse() { +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE + return ( + std::is_same::value && + std::is_same::value && + std::is_same::value); +#else + return false; +#endif + } + + struct ReductionCheck { + ValuesType lhs; + + ReductionCheck(const ValuesType &lhs_) : lhs(lhs_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(lno_t i, scalar_t &tsum) const { tsum += lhs(i); } + }; + + static std::tuple create_crs_lhs_rhs( + const std::vector> &fixture) { RowMapType row_map; EntriesType entries; ValuesType values; @@ -153,25 +172,24 @@ struct SptrsvTest { } template - static void basic_check(const SpMatrix& triMtx, const ValuesType& lhs, const ValuesType& rhs, const bool is_lower, const size_type block_size=0) - { - // FIXME Issues with some integral type combos for SEQLVLSCHED_TP2, currently unavailable - std::vector algs = {SPTRSVAlgorithm::SEQLVLSCHD_RP, SPTRSVAlgorithm::SEQLVLSCHD_TP1}; + static void basic_check(const SpMatrix &triMtx, const ValuesType &lhs, + const ValuesType &rhs, const bool is_lower, + const size_type block_size = 0) { + // FIXME Issues with some integral type combos for SEQLVLSCHED_TP2, + // currently unavailable + std::vector algs = {SPTRSVAlgorithm::SEQLVLSCHD_RP, + SPTRSVAlgorithm::SEQLVLSCHD_TP1}; if (block_size == 0) { // SEQLVLSCHD_TP1CHAIN and SPTRSV_CUSPARSE are not supported for blocks algs.push_back(SPTRSVAlgorithm::SEQLVLSCHD_TP1CHAIN); -#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE - if (std::is_same::value && - std::is_same::value && - std::is_same::value) { + if (do_cusparse()) { algs.push_back(SPTRSVAlgorithm::SPTRSV_CUSPARSE); } -#endif } - auto row_map = triMtx.graph.row_map; - auto entries = triMtx.graph.entries; - auto values = triMtx.values; + auto row_map = triMtx.graph.row_map; + auto entries = triMtx.graph.entries; + auto values = triMtx.values; const size_type nrows = row_map.size() - 1; @@ -183,7 +201,7 @@ struct SptrsvTest { kh.get_sptrsv_handle()->reset_chain_threshold(chain_threshold); } - sptrsv_symbolic(&kh, row_map, entries); + sptrsv_symbolic(&kh, row_map, entries, values); Kokkos::fence(); sptrsv_solve(&kh, row_map, entries, values, rhs, lhs); @@ -200,15 +218,6 @@ struct SptrsvTest { } } - struct ReductionCheck { - ValuesType lhs; - - ReductionCheck(const ValuesType &lhs_) : lhs(lhs_) {} - - KOKKOS_INLINE_FUNCTION - void operator()(lno_t i, scalar_t &tsum) const { tsum += lhs(i); } - }; - static void run_test_sptrsv() { const size_type nrows = 5; @@ -236,7 +245,8 @@ struct SptrsvTest { // Upper tri { { - const auto [triMtx, lhs, rhs] = create_crs_lhs_rhs(get_5x5_ut_ones_fixture()); + const auto [triMtx, lhs, rhs] = + create_crs_lhs_rhs(get_5x5_ut_ones_fixture()); basic_check(triMtx, lhs, rhs, false); } @@ -352,7 +362,8 @@ struct SptrsvTest { // Lower tri { { - const auto [triMtx, lhs, rhs] = create_crs_lhs_rhs(get_5x5_lt_ones_fixture()); + const auto [triMtx, lhs, rhs] = + create_crs_lhs_rhs(get_5x5_lt_ones_fixture()); basic_check(triMtx, lhs, rhs, true); } @@ -519,7 +530,8 @@ struct SptrsvTest { } } - static void run_test_sptrsv_streams(SPTRSVAlgorithm test_algo, int nstreams, const bool is_lower) { + static void run_test_sptrsv_streams(SPTRSVAlgorithm test_algo, int nstreams, + const bool is_lower) { // Workaround for OpenMP: skip tests if concurrency < nstreams because of // not enough resource to partition bool run_streams_test = true; @@ -549,12 +561,13 @@ struct SptrsvTest { std::vector rhs_v(nstreams); std::vector lhs_v(nstreams); - auto fixture = is_lower ? get_5x5_lt_ones_fixture() : get_5x5_ut_ones_fixture(); + auto fixture = + is_lower ? get_5x5_lt_ones_fixture() : get_5x5_ut_ones_fixture(); const auto [triMtx, lhs, rhs] = create_crs_lhs_rhs(fixture); - auto row_map = triMtx.graph.row_map; - auto entries = triMtx.graph.entries; - auto values = triMtx.values; + auto row_map = triMtx.graph.row_map; + auto entries = triMtx.graph.entries; + auto values = triMtx.values; for (int i = 0; i < nstreams; i++) { // Allocate @@ -583,7 +596,7 @@ struct SptrsvTest { Kokkos::fence(); // Create handle - kh_v[i] = KernelHandle(); + kh_v[i] = KernelHandle(); kh_v[i].create_sptrsv_handle(test_algo, nrows, is_lower); kh_ptr_v[i] = &kh_v[i]; @@ -623,13 +636,11 @@ template void test_sptrsv_streams() { using TestStruct = Test::SptrsvTest; - std::vector algs = {SPTRSVAlgorithm::SEQLVLSCHD_RP, SPTRSVAlgorithm::SEQLVLSCHD_TP1}; -#if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) - if (std::is_same::value && - std::is_same::value) { + std::vector algs = {SPTRSVAlgorithm::SEQLVLSCHD_RP, + SPTRSVAlgorithm::SEQLVLSCHD_TP1}; + if (TestStruct::do_cusparse()) { algs.push_back(SPTRSVAlgorithm::SPTRSV_CUSPARSE); } -#endif for (auto alg : algs) { for (int nstreams = 1; nstreams <= 4; ++nstreams) {