From 5af9da9812e61587e7835d90343d92aef88dd263 Mon Sep 17 00:00:00 2001
From: Ernesto Prudencio <eeprude@sandia.gov>
Date: Sat, 25 May 2024 17:00:11 -0600
Subject: [PATCH] Backup

---
 lapack/src/KokkosLapack_geqrf.hpp             |  44 +--
 .../tpls/KokkosLapack_geqrf_tpl_spec_decl.hpp |   4 +-
 lapack/unit_test/Test_Lapack_geqrf.hpp        | 370 ++++++++++++++----
 3 files changed, 311 insertions(+), 107 deletions(-)
diff --git a/lapack/src/KokkosLapack_geqrf.hpp b/lapack/src/KokkosLapack_geqrf.hpp
index 4c920e9a74..a81ae2a436 100644
--- a/lapack/src/KokkosLapack_geqrf.hpp
+++ b/lapack/src/KokkosLapack_geqrf.hpp
@@ -34,7 +34,7 @@ namespace KokkosLapack {
 ///
 /// \tparam ExecutionSpace The space where the kernel will run.
 /// \tparam AMatrix        Type of matrix A, as a 2-D Kokkos::View.
-/// \tparam TArray         Type of array Tau, as a 1-D Kokkos::View.
+/// \tparam TauArray       Type of array Tau, as a 1-D Kokkos::View.
 /// \tparam InfoArray      Type of array Info, as a 1-D Kokkos::View.
 ///
 /// \param space [in] Execution space instance used to specified how to execute
@@ -48,10 +48,9 @@ namespace KokkosLapack {
 ///                   is represented as a product of elementary reflectors
 ///                     Q = H(1) H(2) . . . H(k), where k = min(M,N).
 ///                   Each H(i) has the form
-///                     H(i) = I - Tau * v * v**H
-///                   where tau is a complex scalar, and v is a complex vector
-///                   with v(1:i-1) = 0 and v(i) = 1; v(i+1:M) is stored on
-///                   exit in A(i+1:M,i), and tau in Tau(i).
+///                     H(i) = I - Tau(i) * v * v**H,
+///                   where v is a vector with v(1:i-1) = 0 and v(i) = 1;
+///                   v(i+1:M) is stored on exit in A(i+1:M,i).
 /// \param Tau [out]  One-dimensional array of size min(M,N) that contains the
 ///                   scalar factors of the elementary reflectors.
 /// \param Info [out] One-dimensional array of integers and of size 1:
@@ -59,8 +58,8 @@ namespace KokkosLapack {
 ///                   Info[0] < 0: if equal to '-i', the i-th argument had an
 ///                                illegal value
 ///
-template <class ExecutionSpace, class AMatrix, class TArray, class InfoArray>
-void geqrf(const ExecutionSpace& space, const AMatrix& A, const TArray& Tau,
+template <class ExecutionSpace, class AMatrix, class TauArray, class InfoArray>
+void geqrf(const ExecutionSpace& space, const AMatrix& A, const TauArray& Tau,
            const InfoArray& Info) {
   // NOTE: Currently, KokkosLapack::geqrf only supports LAPACK, MAGMA and
   // rocSOLVER TPLs.
@@ -73,21 +72,21 @@ void geqrf(const ExecutionSpace& space, const AMatrix& A, const TArray& Tau,
                                  typename AMatrix::memory_space>::accessible);
   static_assert(
       Kokkos::SpaceAccessibility<ExecutionSpace,
-                                 typename TArray::memory_space>::accessible);
+                                 typename TauArray::memory_space>::accessible);
   static_assert(
       Kokkos::SpaceAccessibility<ExecutionSpace,
                                  typename InfoArray::memory_space>::accessible);
 
   static_assert(Kokkos::is_view<AMatrix>::value,
                 "KokkosLapack::geqrf: A must be a Kokkos::View.");
-  static_assert(Kokkos::is_view<TArray>::value,
+  static_assert(Kokkos::is_view<TauArray>::value,
                 "KokkosLapack::geqrf: Tau must be Kokkos::View.");
   static_assert(Kokkos::is_view<InfoArray>::value,
                 "KokkosLapack::geqrf: Info must be Kokkos::View.");
 
   static_assert(static_cast<int>(AMatrix::rank) == 2,
                 "KokkosLapack::geqrf: A must have rank 2.");
-  static_assert(static_cast<int>(TArray::rank) == 1,
+  static_assert(static_cast<int>(TauArray::rank) == 1,
                 "KokkosLapack::geqrf: Tau must have rank 1.");
   static_assert(static_cast<int>(InfoArray::rank) == 1,
                 "KokkosLapack::geqrf: Info must have rank 1.");
@@ -118,9 +117,9 @@ void geqrf(const ExecutionSpace& space, const AMatrix& A, const TArray& Tau,
   using AMatrix_Internal = Kokkos::View<
       typename AMatrix::non_const_value_type**, typename AMatrix::array_layout,
       typename AMatrix::device_type, Kokkos::MemoryTraits<Kokkos::Unmanaged>>;
-  using TArray_Internal =
-      Kokkos::View<typename TArray::non_const_value_type*,
-                   typename TArray::array_layout, typename TArray::device_type,
+  using TauArray_Internal =
+      Kokkos::View<typename TauArray::non_const_value_type*,
+                   typename TauArray::array_layout, typename TauArray::device_type,
                    Kokkos::MemoryTraits<Kokkos::Unmanaged>>;
   using InfoArray_Internal =
       Kokkos::View<typename InfoArray::non_const_value_type*,
@@ -128,11 +127,11 @@ void geqrf(const ExecutionSpace& space, const AMatrix& A, const TArray& Tau,
                    typename InfoArray::device_type,
                    Kokkos::MemoryTraits<Kokkos::Unmanaged>>;
 
-  AMatrix_Internal A_i      = A;
-  TArray_Internal Tau_i     = Tau;
+  AMatrix_Internal   A_i    = A;
+  TauArray_Internal  Tau_i  = Tau;
   InfoArray_Internal Info_i = Info;
 
-  KokkosLapack::Impl::GEQRF<ExecutionSpace, AMatrix_Internal, TArray_Internal,
+  KokkosLapack::Impl::GEQRF<ExecutionSpace, AMatrix_Internal, TauArray_Internal,
                             InfoArray_Internal>::geqrf(space, A_i, Tau_i,
                                                        Info_i);
 }
@@ -140,7 +139,7 @@ void geqrf(const ExecutionSpace& space, const AMatrix& A, const TArray& Tau,
 /// \brief Computes a QR factorization of a matrix A
 ///
 /// \tparam AMatrix   Type of matrix A, as a 2-D Kokkos::View.
-/// \tparam TArray    Type of array Tau, as a 1-D Kokkos::View.
+/// \tparam TauArray  Type of array Tau, as a 1-D Kokkos::View.
 /// \tparam InfoArray Type of array Info, as a 1-D Kokkos::View.
 ///
 /// \param A [in,out] On entry, the M-by-N matrix to be factorized.
@@ -152,10 +151,9 @@ void geqrf(const ExecutionSpace& space, const AMatrix& A, const TArray& Tau,
 ///                   is represented as a product of elementary reflectors
 ///                     Q = H(1) H(2) . . . H(k), where k = min(M,N).
 ///                   Each H(i) has the form
-///                     H(i) = I - Tau * v * v**H
-///                   where tau is a complex scalar, and v is a complex vector
-///                   with v(1:i-1) = 0 and v(i) = 1; v(i+1:M) is stored on
-///                   exit in A(i+1:M,i), and tau in Tau(i).
+///                     H(i) = I - Tau(i) * v * v**H,
+///                   where v is a vector with v(1:i-1) = 0 and v(i) = 1;
+///                   v(i+1:M) is stored on exit in A(i+1:M,i).
 /// \param Tau [out]  One-dimensional array of size min(M,N) that contains the
 ///                   scalar factors of the elementary reflectors.
 /// \param Info [out] One-dimensional array of integers and of size 1:
@@ -163,8 +161,8 @@ void geqrf(const ExecutionSpace& space, const AMatrix& A, const TArray& Tau,
 ///                   Info[0] < 0: if equal to '-i', the i-th argument had an
 ///                                illegal value
 ///
-template <class AMatrix, class TArray, class InfoArray>
-void geqrf(const AMatrix& A, const TArray& Tau, const InfoArray& Info) {
+template <class AMatrix, class TauArray, class InfoArray>
+void geqrf(const AMatrix& A, const TauArray& Tau, const InfoArray& Info) {
   typename AMatrix::execution_space space{};
   geqrf(space, A, Tau, Info);
 }
diff --git a/lapack/tpls/KokkosLapack_geqrf_tpl_spec_decl.hpp b/lapack/tpls/KokkosLapack_geqrf_tpl_spec_decl.hpp
index d9f88549aa..c7630cc783 100644
--- a/lapack/tpls/KokkosLapack_geqrf_tpl_spec_decl.hpp
+++ b/lapack/tpls/KokkosLapack_geqrf_tpl_spec_decl.hpp
@@ -172,7 +172,7 @@ KOKKOSLAPACK_GEQRF_LAPACK(Kokkos::complex<double>, Kokkos::LayoutLeft,
 }  // namespace KokkosLapack
 #endif  // KOKKOSKERNELS_ENABLE_TPL_LAPACK
 
-#if 0  // AquiEEP
+#if 0  // TO DO
 
 // MAGMA
 #ifdef KOKKOSKERNELS_ENABLE_TPL_MAGMA
@@ -270,7 +270,7 @@ KOKKOSLAPACK_GEQRF_MAGMA(Kokkos::complex<double>, Kokkos::LayoutLeft,
 }  // namespace KokkosLapack
 #endif  // KOKKOSKERNELS_ENABLE_TPL_MAGMA
 
-#endif  // AquiEEP
+#endif  // TO DO
 
 // CUSOLVER
 #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSOLVER
diff --git a/lapack/unit_test/Test_Lapack_geqrf.hpp b/lapack/unit_test/Test_Lapack_geqrf.hpp
index f619c8fba3..2a4533b8bc 100644
--- a/lapack/unit_test/Test_Lapack_geqrf.hpp
+++ b/lapack/unit_test/Test_Lapack_geqrf.hpp
@@ -24,63 +24,127 @@
      (defined(TEST_OPENMP_LAPACK_CPP) || defined(TEST_SERIAL_LAPACK_CPP) || \
       defined(TEST_THREADS_LAPACK_CPP)))
 
-// AquiEEP
-
 #include <gtest/gtest.h>
 #include <Kokkos_Core.hpp>
 #include <Kokkos_Random.hpp>
 
+#include <KokkosBlas2_ger.hpp>
+#include <KokkosBlas3_gemm.hpp>
 #include <KokkosLapack_geqrf.hpp>
-//#include <KokkosBlas2_gemv.hpp>
-//#include <KokkosBlas3_gemm.hpp>
 #include <KokkosKernels_TestUtils.hpp>
 
 namespace Test {
 
 template <class ViewTypeA, class ViewTypeTau>
 void getQR(int const m, int const n,
-           typename ViewTypeA::HostMirror const&  // h_A
-           ,
-           typename ViewTypeTau::HostMirror const&  // h_tau
-           ,
-           typename ViewTypeA::HostMirror&  // h_Q
-           ,
+           typename ViewTypeA::HostMirror const& h_A,
+           typename ViewTypeTau::HostMirror const& h_tau,
+           typename ViewTypeA::HostMirror& h_Q,
            typename ViewTypeA::HostMirror& h_R,
-           typename ViewTypeA::HostMirror&  // h_QR
+           typename ViewTypeA::HostMirror& h_QR
 ) {
   using ScalarA = typename ViewTypeA::value_type;
 
+  // Populate h_R
   for (int i(0); i < m; ++i) {
     for (int j(0); j < n; ++j) {
-      if constexpr (Kokkos::ArithTraits<ScalarA>::is_complex) {
-        h_R(i, j).real() = 0.;
-        h_R(i, j).imag() = 0.;
-      } else {
-        h_R(i, j) = 0.;
+      if ((i <= j) && (i < n)) {
+        h_R(i,j) = h_A(i,j);
+      }
+      else {
+        h_R(i,j) = Kokkos::ArithTraits<ScalarA>::zero();
       }
     }
   }
 
+  // Instantiate the identity matrix
   ViewTypeA I("I", m, m);
   typename ViewTypeA::HostMirror h_I = Kokkos::create_mirror_view(I);
+  Kokkos::deep_copy(h_I,Kokkos::ArithTraits<ScalarA>::zero());
   for (int i(0); i < m; ++i) {
-    for (int j(0); j < m; ++j) {
-      if constexpr (Kokkos::ArithTraits<ScalarA>::is_complex) {
-        if (i == j) {
-          h_I(i, j).real() = 1.;
-        } else {
-          h_I(i, j).real() = 0.;
-        }
-        h_I(i, j).imag() = 0.;
-      } else {
-        if (i == j) {
-          h_I(i, j) = 1.;
-        } else {
-          h_I(i, j) = 0.;
-        }
-      }
+    if constexpr (Kokkos::ArithTraits<ScalarA>::is_complex) {
+      h_I(i,i).real() = 1.;
+    } else {
+      h_I(i,i) = 1.;
     }
   }
+
+  // Populate h_Q
+  int minMN(std::min(m, n));
+  ViewTypeTau v("v", m);
+  typename ViewTypeTau::HostMirror h_v = Kokkos::create_mirror_view(v);
+
+  ViewTypeA Qk("Qk", m, m);
+  typename ViewTypeA::HostMirror h_Qk = Kokkos::create_mirror_view(Qk);
+
+  ViewTypeA auxM("auxM", m, m);
+  typename ViewTypeA::HostMirror h_auxM = Kokkos::create_mirror_view(auxM);
+
+  // Q = H(0) H(1) . . . H(min(M,N)-1), where for k=0,1,...,min(m,n)-1:
+  //   H(k) = I - Tau(k) * v * v**H, and
+  //   v is a vector of size m with:
+  //     v(0:k-1) = 0,
+  //     v(k)     = 1,
+  //     v(k+1:m-1) = A(k+1:m-1,k).
+  for (int k(0); k < minMN; ++k) {
+    Kokkos::deep_copy(h_v,Kokkos::ArithTraits<ScalarA>::zero());
+    h_v[k] = 1.;
+    for (int index(k+1); index < minMN; ++index) {
+      h_v[index] = h_A(index,k);
+    }
+
+    // Rank-1 update of a general matrix: A = A + alpha * x * y^{T,H}.
+    // void ger( const char                                   trans[]
+    //         , const typename AViewType::const_value_type & alpha
+    //         , const XViewType                            & x
+    //         , const YViewType                            & y
+    //         , const AViewType                            & A
+    //         );
+    Kokkos::deep_copy(h_Qk, h_I);
+    KokkosBlas::ger( "H"
+                   , -h_tau[k]
+                   , h_v
+                   , h_v
+                   , h_Qk
+                   );
+
+    // Dense matrix-matrix multiply: C = beta*C + alpha*op(A)*op(B).
+    // void gemm( const char                             transA[]
+    //          , const char                             transB[]
+    //          , typename AViewType::const_value_type & alpha
+    //          , const AViewType                      & A
+    //          , const BViewType                      & B
+    //          , typename CViewType::const_value_type & beta
+    //          , const CViewType                      & C
+    //          );
+    if (k == 0) {
+      Kokkos::deep_copy(h_Q, h_Qk);
+    }
+    else {
+      Kokkos::deep_copy(h_auxM, Kokkos::ArithTraits<ScalarA>::zero());
+      KokkosBlas::gemm( "N"
+                      , "N"
+                      , 1.
+                      , h_Q
+                      , h_Qk
+                      , 0.
+                      , h_auxM
+                      );
+      Kokkos::deep_copy(h_Q, h_auxM);
+    }
+  } // for k
+
+  Kokkos::deep_copy(h_QR, Kokkos::ArithTraits<ScalarA>::zero());
+  KokkosBlas::gemm( "N"
+                  , "N"
+                  , 1.
+                  , h_Q
+                  , h_R
+                  , 0.
+                  , h_QR
+                  );
+
+  // AquiEEP: test Q^H Q = I
 }
 
 template <class ViewTypeA, class ViewTypeTau, class Device>
@@ -88,7 +152,7 @@ void impl_test_geqrf(int m, int n) {
   using ViewTypeInfo    = Kokkos::View<int*, Kokkos::LayoutLeft, Device>;
   using execution_space = typename Device::execution_space;
   using ScalarA         = typename ViewTypeA::value_type;
-  // using ats             = Kokkos::ArithTraits<ScalarA>;
+  using ats             = Kokkos::ArithTraits<ScalarA>;
 
   execution_space space{};
 
@@ -97,13 +161,14 @@ void impl_test_geqrf(int m, int n) {
   int minMN(std::min(m, n));
 
   // Create device views
-  ViewTypeA A("A", m, n);
-  ViewTypeTau Tau("Tau", minMN);
-  ViewTypeInfo Info("Info", 1);
+  ViewTypeA    A    ("A", m, n);
+  ViewTypeA    Aorig("Aorig", m, n);
+  ViewTypeTau  Tau  ("Tau", minMN);
+  ViewTypeInfo Info ("Info", 1);
 
   // Create host mirrors of device views.
   typename ViewTypeA::HostMirror h_A       = Kokkos::create_mirror_view(A);
-  typename ViewTypeA::HostMirror h_Aorig   = Kokkos::create_mirror_view(A);
+  typename ViewTypeA::HostMirror h_Aorig   = Kokkos::create_mirror_view(Aorig);
   typename ViewTypeTau::HostMirror h_tau   = Kokkos::create_mirror_view(Tau);
   typename ViewTypeInfo::HostMirror h_info = Kokkos::create_mirror_view(Info);
 
@@ -124,7 +189,7 @@ void impl_test_geqrf(int m, int n) {
 
       for (int i(0); i < m; ++i) {
         for (int j(0); j < n; ++j) {
-          h_A(i, j).imag() = 0.;
+          h_A(i,j).imag() = 0.;
         }
       }
     } else {
@@ -151,10 +216,10 @@ void impl_test_geqrf(int m, int n) {
 
   Kokkos::deep_copy(h_Aorig, h_A);
 
-#if 1  // def HAVE_KOKKOSKERNELS_DEBUG
+#ifdef HAVE_KOKKOSKERNELS_DEBUG
   for (int i(0); i < m; ++i) {
     for (int j(0); j < n; ++j) {
-      std::cout << "A(" << i << "," << j << ") = " << h_A(i, j) << std::endl;
+      std::cout << "Aorig(" << i << "," << j << ") = " << h_A(i,j) << std::endl;
     }
   }
 #endif
@@ -180,18 +245,86 @@ void impl_test_geqrf(int m, int n) {
   Kokkos::deep_copy(h_A, A);
   Kokkos::deep_copy(h_tau, Tau);
 
-#if 1  // def HAVE_KOKKOSKERNELS_DEBUG
+#ifdef HAVE_KOKKOSKERNELS_DEBUG
   std::cout << "info[0] = " << h_info[0] << std::endl;
   for (int i(0); i < minMN; ++i) {
     for (int j(0); j < n; ++j) {
-      std::cout << "R(" << i << "," << j << ") = " << h_A(i, j) << std::endl;
+      std::cout << "Aoutput(" << i << "," << j << ") = " << std::setprecision(16) << h_A(i,j) << std::endl;
     }
   }
   for (int i(0); i < minMN; ++i) {
-    std::cout << "tau(" << i << ") = " << h_tau[i] << std::endl;
+    std::cout << "tau(" << i << ") = " << h_tau[i] << std::setprecision(16) << std::endl;
   }
 #endif
 
+  const typename Kokkos::ArithTraits<typename ViewTypeA::non_const_value_type>::mag_type absTol(1.e-8);
+
+  if ((m == 3) && (n == 3)) {
+    std::vector<std::vector<ScalarA>> refMatrix(m);
+    for (int i(0); i < m; ++i) {
+      refMatrix[i].resize(n,Kokkos::ArithTraits<ScalarA>::zero());
+    }
+
+    std::vector<ScalarA> refTau(m,Kokkos::ArithTraits<ScalarA>::zero());
+
+    if constexpr (Kokkos::ArithTraits<ScalarA>::is_complex) {
+      refMatrix[0][0].real() = -14.;
+      refMatrix[0][1].real() = -21.;
+      refMatrix[0][2].real() = 14.;
+
+      refMatrix[1][0].real() = 0.2307692307692308;
+      refMatrix[1][1].real() = -175.;
+      refMatrix[1][2].real() = 70.;
+
+      refMatrix[2][0].real() = -0.1538461538461539;
+      refMatrix[2][1].real() = 1./18.;
+      refMatrix[2][2].real() = -35.;
+
+      refTau[0].real() = 1.857142857142857;
+      refTau[1].real() = 1.993846153846154;
+      refTau[2].real() = 0.;
+    }
+    else {
+      refMatrix[0][0] = -14.;
+      refMatrix[0][1] = -21.;
+      refMatrix[0][2] = 14.;
+
+      refMatrix[1][0] = 0.2307692307692308;
+      refMatrix[1][1] = -175.;
+      refMatrix[1][2] = 70.;
+
+      refMatrix[2][0] = -0.1538461538461539;
+      refMatrix[2][1] = 1./18.;
+      refMatrix[2][2] = -35.;
+
+      refTau[0] = 1.857142857142857;
+      refTau[1] = 1.993846153846154;
+      refTau[2] = 0.;
+    }
+
+    {
+      bool test_flag_A = true;
+      for (int i(0); (i < m) && test_flag_A; ++i) {
+        for (int j(0); (j < n) && test_flag_A; ++j) {
+          if (ats::abs(h_A(i,j) - refMatrix[i][j]) > absTol) {
+            test_flag_A = false;
+          }
+        }
+      }
+      ASSERT_EQ(test_flag_A, true);
+    }
+
+    {
+      bool test_flag_tau = true;
+      for (int i(0); (i < m) && test_flag_tau; ++i) {
+        if (ats::abs(h_tau[i] - refTau[i]) > absTol) {
+          test_flag_tau = false;
+        }
+      }
+      ASSERT_EQ(test_flag_tau, true);
+    }
+  }
+
   ViewTypeA Q("Q", m, m);
   ViewTypeA R("R", m, n);
   ViewTypeA QR("QR", m, n);
@@ -202,65 +335,135 @@ void impl_test_geqrf(int m, int n) {
 
   getQR<ViewTypeA, ViewTypeTau>(m, n, h_A, h_tau, h_Q, h_R, h_QR);
 
-#if 1  // def HAVE_KOKKOSKERNELS_DEBUG
+#ifdef HAVE_KOKKOSKERNELS_DEBUG
   for (int i(0); i < m; ++i) {
     for (int j(0); j < m; ++j) {
-      std::cout << "Q(" << i << "," << j << ") = " << h_Q(i, j) << std::endl;
+      std::cout << "Q(" << i << "," << j << ") = " << h_Q(i,j) << std::endl;
     }
   }
   for (int i(0); i < m; ++i) {
     for (int j(0); j < n; ++j) {
-      std::cout << "R(" << i << "," << j << ") = " << h_R(i, j) << std::endl;
+      std::cout << "R(" << i << "," << j << ") = " << h_R(i,j) << std::endl;
     }
   }
   for (int i(0); i < m; ++i) {
     for (int j(0); j < n; ++j) {
-      std::cout << "QR(" << i << "," << j << ") = " << h_QR(i, j) << std::endl;
+      std::cout << "QR(" << i << "," << j << ") = " << h_QR(i,j) << std::endl;
     }
   }
 #endif
 
   if ((m == 3) && (n == 3)) {
-  }
+    std::vector<std::vector<ScalarA>> refQ(m);
+    for (int i(0); i < m; ++i) {
+      refQ[i].resize(n,Kokkos::ArithTraits<ScalarA>::zero());
+    }
 
-  // Dense matrix-matrix multiply: C = beta*C + alpha*op(A)*op(B).
-  // void gemm( const execution_space                & space
-  //          , const char                             transA[]
-  //          , const char                             transB[]
-  //          , typename AViewType::const_value_type & alpha
-  //          , const AViewType                      & A
-  //          , const BViewType                      & B
-  //          , typename CViewType::const_value_type & beta
-  //          , const CViewType                      & C
-  //          );
-
-  // Rank-1 update of a general matrix: A = A + alpha * x * y^{T,H}.
-  // void ger( const ExecutionSpace                       & space
-  //         , const char                                   trans[]
-  //         , const typename AViewType::const_value_type & alpha
-  //         , const XViewType                            & x
-  //         , const YViewType                            & y
-  //         , const AViewType                            & A
-  //         );
-
-  // Checking vs ref on CPU, this eps is about 10^-9
-  // typedef typename ats::mag_type mag_type;
-  // const mag_type eps = 3.0e7 * ats::epsilon();
-  bool test_flag = true;
-  for (int i = 0; i < n; i++) {
-#if 0
-    if (ats::abs(h_B(i) - h_X0(i)) > eps) {
-      test_flag = false;
-      printf(
-          "    Error %d, pivot %c, padding %c: result( %.15lf ) !="
-          "solution( %.15lf ) at (%d), error=%.15e, eps=%.15e\n",
-          N, mode[0], padding[0], ats::abs(h_B(i)), ats::abs(h_X0(i)), int(i),
-          ats::abs(h_B(i) - h_X0(i)), eps);
-      break;
+    std::vector<std::vector<ScalarA>> refR(m);
+    for (int i(0); i < m; ++i) {
+      refR[i].resize(n,Kokkos::ArithTraits<ScalarA>::zero());
     }
+
+#if 0
+    Q = [ -6/7     69/175   58/175
+          -3/7   -158/175   -6/175
+           2/7     -6/35    33/35 ]
+
+    R = [ -14   -21   14
+           0   -175   70
+           0      0  -35 ]
 #endif
+
+    if constexpr (Kokkos::ArithTraits<ScalarA>::is_complex) {
+      refQ[0][0].real() = -6./7.;
+      refQ[0][1].real() = 69./175.;
+      refQ[0][2].real() = 58./175.;
+
+      refQ[1][0].real() = -3./7.;
+      refQ[1][1].real() = -158./175.;
+      refQ[1][2].real() = -6./175.;
+
+      refQ[2][0].real() = 2./7.;
+      refQ[2][1].real() = -6./35.;
+      refQ[2][2].real() = 33./35.;
+
+      refR[0][0].real() = -14.;
+      refR[0][1].real() = -21.;
+      refR[0][2].real() = 14.;
+
+      refR[1][1].real() = -175.;
+      refR[1][2].real() = 70.;
+
+      refR[2][2].real() = -35.;
+    }
+    else {
+      refQ[0][0] = -6./7.;
+      refQ[0][1] = 69./175.;
+      refQ[0][2] = 58./175.;
+
+      refQ[1][0] = -3./7.;
+      refQ[1][1] = -158./175.;
+      refQ[1][2] = -6./175.;
+
+      refQ[2][0] = 2./7.;
+      refQ[2][1] = -6./35.;
+      refQ[2][2] = 33./35.;
+
+      refR[0][0] = -14.;
+      refR[0][1] = -21.;
+      refR[0][2] = 14.;
+
+      refR[1][1] = -175.;
+      refR[1][2] = 70.;
+
+      refR[2][2] = -35.;
+    }
+
+    {
+      bool test_flag_Q = true;
+      for (int i(0); (i < m) && test_flag_Q; ++i) {
+        for (int j(0); (j < n) && test_flag_Q; ++j) {
+          if (ats::abs(h_Q(i,j) - refQ[i][j]) > absTol) {
+            test_flag_Q = false;
+          }
+        }
+      }
+      ASSERT_EQ(test_flag_Q, true);
+    }
+
+    {
+      bool test_flag_R = true;
+      for (int i(0); (i < m) && test_flag_R; ++i) {
+        for (int j(0); (j < n) && test_flag_R; ++j) {
+          if (ats::abs(h_R(i,j) - refR[i][j]) > absTol) {
+            test_flag_R = false;
+          }
+        }
+      }
+      ASSERT_EQ(test_flag_R, true);
+    }
+  }
+
+  {
+    bool test_flag_QR = true;
+    for (int i(0); (i < m) && test_flag_QR; ++i) {
+      for (int j(0); (j < n) && test_flag_QR; ++j) {
+        if (ats::abs(h_QR(i,j) - h_Aorig(i,j)) > absTol) {
+          std::cout << "m = " << m
+                    << ", n = " << n
+                    << ", i = " << i
+                    << ", j = " << j
+                    << ", h_Aorig(i,j) = " << std::setprecision(16) << h_Aorig(i,j)
+                    << ", h_QR(i,j) = "    << std::setprecision(16) << h_QR(i,j)
+                    << ", |diff| = "       << std::setprecision(16) << ats::abs(h_QR(i,j) - h_Aorig(i,j))
+                    << ", absTol = "       << std::setprecision(16) << absTol
+                    << std::endl;
+          test_flag_QR = false;
+        }
+      }
+    }
+    ASSERT_EQ(test_flag_QR, true);
   }
-  ASSERT_EQ(test_flag, true);
 }
 
 }  // namespace Test
@@ -274,6 +477,9 @@ void test_geqrf() {
   using view_type_tau_ll = Kokkos::View<Scalar*, Kokkos::LayoutLeft, Device>;
 
   Test::impl_test_geqrf<view_type_a_ll, view_type_tau_ll, Device>(3, 3);
+  Test::impl_test_geqrf<view_type_a_ll, view_type_tau_ll, Device>(100, 100);
+  //Test::impl_test_geqrf<view_type_a_ll, view_type_tau_ll, Device>(100, 70); // AquiEEP
+  Test::impl_test_geqrf<view_type_a_ll, view_type_tau_ll, Device>(70, 100);
 #endif
 }