Skip to content

Commit 90f7bd6

Browse files
authored
Added new pinv() operator (#740)
1 parent d9053d6 commit 90f7bd6

File tree

17 files changed

+728
-44
lines changed

17 files changed

+728
-44
lines changed

docs_input/api/linalg/decomp/pinv.rst

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
.. _pinv_func:
2+
3+
pinv
4+
####
5+
6+
Compute the Moore-Penrose pseudo-inverse of a matrix.
7+
8+
.. doxygenfunction:: pinv(const OpA &a, float rcond = get_default_rcond<typename OpA::value_type>())
9+
10+
Examples
11+
~~~~~~~~
12+
13+
.. literalinclude:: ../../../../test/00_solver/Pinv.cu
14+
:language: cpp
15+
:start-after: example-begin pinv-test-1
16+
:end-before: example-end pinv-test-1
17+
:dedent:
18+
19+

docs_input/api/linalg/other/det.rst

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
.. _det_func:
2+
3+
det
4+
=====
5+
6+
Compute the determinant of a tensor.
7+
8+
.. doxygenfunction:: det(const OpA &a)
9+
10+
Examples
11+
~~~~~~~~
12+
13+
.. literalinclude:: ../../../../test/00_solver/Det.cu
14+
:language: cpp
15+
:start-after: example-begin det-test-1
16+
:end-before: example-end det-test-1
17+
:dedent:
18+

include/matx/core/tensor.h

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -658,9 +658,10 @@ class tensor_t : public detail::tensor_impl_t<T,RANK,Desc> {
658658
{
659659
MATX_NVTX_START("", matx::MATX_NVTX_LOG_API)
660660

661+
[[maybe_unused]] stride_type prod = std::accumulate(std::begin(shape), std::end(shape), 1, std::multiplies<stride_type>());
661662
// Ensure new shape's total size is not larger than the original
662663
MATX_ASSERT_STR(
663-
sizeof(M) * shape.TotalSize() <= storage_.Bytes(), matxInvalidSize,
664+
sizeof(M) * prod <= storage_.Bytes(), matxInvalidSize,
664665
"Total size of new tensor must not be larger than the original");
665666

666667
// This could be loosened up to make sure only the fastest changing dims
@@ -877,7 +878,7 @@ class tensor_t : public detail::tensor_impl_t<T,RANK,Desc> {
877878
{
878879
MATX_NVTX_START("", matx::MATX_NVTX_LOG_API)
879880

880-
static_assert(RANK >= 2, "Only tensors of rank 2 and higher can be permuted.");
881+
static_assert(RANK >= 1, "Only tensors of rank 1 and higher can be permuted.");
881882
cuda::std::array<shape_type, RANK> n;
882883
cuda::std::array<stride_type, RANK> s;
883884
[[maybe_unused]] bool done[RANK] = {0};

include/matx/operators/chol.h

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -104,8 +104,6 @@ namespace detail {
104104
}
105105
}
106106

107-
// Size is not relevant in eig() since there are multiple return values and it
108-
// is not allowed to be called in larger expressions
109107
constexpr __MATX_INLINE__ __MATX_HOST__ __MATX_DEVICE__ index_t Size(int dim) const
110108
{
111109
return a_.Size(dim);

include/matx/operators/det.h

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -96,8 +96,6 @@ namespace detail {
9696
}
9797
}
9898

99-
// Size is not relevant in det() since there are multiple return values and it
100-
// is not allowed to be called in larger expressions
10199
constexpr __MATX_INLINE__ __MATX_HOST__ __MATX_DEVICE__ index_t Size(int dim) const
102100
{
103101
return a_.Size(dim);
@@ -106,6 +104,13 @@ namespace detail {
106104
};
107105
}
108106

107+
/**
108+
* Computes the determinant by performing an LU factorization of the input,
109+
* and then calculating the product of diagonal entries of the U factor.
110+
*
111+
* For tensors of rank > 2, batching is performed.
112+
*
113+
*/
109114
template<typename OpA>
110115
__MATX_INLINE__ auto det(const OpA &a) {
111116
return detail::DetOp(a);

include/matx/operators/operators.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,7 @@
8080
#include "matx/operators/outer.h"
8181
#include "matx/operators/overlap.h"
8282
#include "matx/operators/percentile.h"
83+
#include "matx/operators/pinv.h"
8384
#include "matx/operators/permute.h"
8485
#include "matx/operators/planar.h"
8586
#include "matx/operators/polyval.h"

include/matx/operators/pinv.h

Lines changed: 144 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,144 @@
1+
////////////////////////////////////////////////////////////////////////////////
2+
// BSD 3-Clause License
3+
//
4+
// Copyright (c) 2021, NVIDIA Corporation
5+
// All rights reserved.
6+
//
7+
// Redistribution and use in source and binary forms, with or without
8+
// modification, are permitted provided that the following conditions are met:
9+
//
10+
// 1. Redistributions of source code must retain the above copyright notice, this
11+
// list of conditions and the following disclaimer.
12+
//
13+
// 2. Redistributions in binary form must reproduce the above copyright notice,
14+
// this list of conditions and the following disclaimer in the documentation
15+
// and/or other materials provided with the distribution.
16+
//
17+
// 3. Neither the name of the copyright holder nor the names of its
18+
// contributors may be used to endorse or promote products derived from
19+
// this software without specific prior written permission.
20+
//
21+
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22+
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23+
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
24+
// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
25+
// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26+
// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
27+
// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
28+
// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
29+
// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
30+
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31+
/////////////////////////////////////////////////////////////////////////////////
32+
33+
#pragma once
34+
35+
36+
#include "matx/core/type_utils.h"
37+
#include "matx/operators/base_operator.h"
38+
#include "matx/transforms/pinv.h"
39+
40+
namespace matx {
41+
namespace detail {
42+
template<typename OpA>
43+
class PinvOp : public BaseOp<PinvOp<OpA>>
44+
{
45+
private:
46+
OpA a_;
47+
float rcond_;
48+
cuda::std::array<index_t, OpA::Rank()> out_dims_;
49+
mutable detail::tensor_impl_t<typename remove_cvref_t<OpA>::value_type, OpA::Rank()> tmp_out_;
50+
mutable typename remove_cvref_t<OpA>::value_type *ptr;
51+
52+
public:
53+
using matxop = bool;
54+
using value_type = typename OpA::value_type;
55+
using matx_transform_op = bool;
56+
using pinv_xform_op = bool;
57+
58+
__MATX_INLINE__ std::string str() const { return "pinv()"; }
59+
__MATX_INLINE__ PinvOp(OpA a, float rcond) : a_(a), rcond_(rcond) {
60+
for (int r = 0; r < Rank(); r++) {
61+
if (r >= Rank() - 2) {
62+
out_dims_[r] = (r == Rank() - 1) ? a_.Size(Rank() - 2) : a_.Size(Rank() - 1);
63+
}
64+
else {
65+
out_dims_[r] = a_.Size(r);
66+
}
67+
}
68+
};
69+
70+
template <typename... Is>
71+
__MATX_INLINE__ __MATX_DEVICE__ __MATX_HOST__ decltype(auto) operator()(Is... indices) const
72+
{
73+
return tmp_out_(indices...);
74+
}
75+
76+
static __MATX_INLINE__ constexpr __MATX_HOST__ __MATX_DEVICE__ int32_t Rank()
77+
{
78+
return OpA::Rank();
79+
}
80+
81+
constexpr __MATX_INLINE__ __MATX_HOST__ __MATX_DEVICE__ index_t Size(int dim) const
82+
{
83+
return out_dims_.Size(dim);
84+
}
85+
86+
template <typename Out, typename Executor>
87+
void Exec(Out &&out, Executor &&ex) const{
88+
pinv_impl(cuda::std::get<0>(out), a_, ex, rcond_);
89+
}
90+
91+
template <typename ShapeType, typename Executor>
92+
__MATX_INLINE__ void InnerPreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept
93+
{
94+
if constexpr (is_matx_op<OpA>()) {
95+
a_.PreRun(std::forward<ShapeType>(shape), std::forward<Executor>(ex));
96+
}
97+
}
98+
99+
template <typename ShapeType, typename Executor>
100+
__MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept
101+
{
102+
InnerPreRun(std::forward<ShapeType>(shape), std::forward<Executor>(ex));
103+
104+
detail::AllocateTempTensor(tmp_out_, std::forward<Executor>(ex), out_dims_, &ptr);
105+
106+
Exec(cuda::std::make_tuple(tmp_out_), std::forward<Executor>(ex));
107+
}
108+
109+
template <typename ShapeType, typename Executor>
110+
__MATX_INLINE__ void PostRun(ShapeType &&shape, Executor &&ex) const noexcept
111+
{
112+
if constexpr (is_matx_op<OpA>()) {
113+
a_.PostRun(std::forward<ShapeType>(shape), std::forward<Executor>(ex));
114+
}
115+
}
116+
117+
};
118+
}
119+
120+
/**
121+
* Perform a generalized inverse of a matrix using its singular-value decomposition (SVD).
122+
* It automatically removes small singular values for stability.
123+
*
124+
* For tensors of rank > 2, batching is performed.
125+
*
126+
* @tparam OpA
127+
* Tensor or operator type of input A
128+
*
129+
* @param a
130+
* Input tensor or operator of shape `... x m x n`
131+
* @param rcond
132+
* Cutoff for small singular values. For stability, singular values
133+
* smaller than `rcond * largest_singular_value` are set to 0 for each matrix
134+
* in the batch. By default, `rcond` is approximately the machine epsilon of the tensor dtype.
135+
*
136+
* @return
137+
* An operator that gives a tensor of size `... x n x m` representing the pseudo-inverse of the input
138+
*/
139+
template<typename OpA>
140+
__MATX_INLINE__ auto pinv(const OpA &a, float rcond = get_default_rcond<typename OpA::value_type>()) {
141+
return detail::PinvOp(a, rcond);
142+
}
143+
144+
}

include/matx/operators/svd.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,7 @@ namespace detail {
6565
template <typename... Is>
6666
__MATX_INLINE__ __MATX_DEVICE__ __MATX_HOST__ decltype(auto) operator()(Is... indices) const = delete;
6767

68+
// TODO: Handle SVDMode::NONE case better to not require U & VT
6869
template <typename Out, typename Executor>
6970
void Exec(Out &&out, Executor &&ex) const {
7071
static_assert(cuda::std::tuple_size_v<remove_cvref_t<Out>> == 4, "Must use mtie with 3 outputs on svd(). ie: (mtie(U, S, VT) = svd(A))");
@@ -99,6 +100,10 @@ namespace detail {
99100
/**
100101
* Perform a singular value decomposition (SVD) using cuSolver or a LAPACK host
101102
* library.
103+
*
104+
* The singular values within each vector are sorted in descending order.
105+
*
106+
* For tensors of Rank > 2, batching is performed.
102107
*
103108
* @tparam OpA
104109
* Operator input type

include/matx/operators/trace.h

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -110,8 +110,6 @@ namespace detail {
110110
}
111111

112112
/**
113-
* Computes the trace of a tensor
114-
*
115113
* Computes the trace of a square matrix by summing the diagonal
116114
*
117115
* @tparam InputOperator

include/matx/transforms/det.h

Lines changed: 7 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -51,10 +51,8 @@ namespace matx {
5151
/**
5252
* Compute the determinant of a matrix
5353
*
54-
* Computes the terminant of a matrix by first computing the LU composition,
55-
* then reduces the product of the diagonal elements of U. The input and output
56-
* parameters may be the same tensor. In that case, the input is destroyed and
57-
* the output is stored in-place.
54+
* Computes the determinant of a matrix by first computing the LU decomposition,
55+
* then reduces the product of the diagonal elements of U.
5856
*
5957
* @tparam T1
6058
* Data type of matrix A
@@ -80,22 +78,16 @@ void det_impl(OutputTensor &out, const InputTensor &a,
8078
constexpr int RANK = InputTensor::Rank();
8179
using value_type = typename OutputTensor::value_type;
8280
using piv_value_type = std::conditional_t<is_cuda_executor_v<Executor>, int64_t, lapack_int_t>;
83-
84-
auto a_new = OpToTensor(a, exec);
85-
86-
if(!a_new.isSameView(a)) {
87-
(a_new = a).run(exec);
88-
}
8981

9082
// Get parameters required by these tensors
9183
cuda::std::array<index_t, RANK - 1> s;
9284

9385
// Set batching dimensions of piv
9486
for (int i = 0; i < RANK - 2; i++) {
95-
s[i] = a_new.Size(i);
87+
s[i] = a.Size(i);
9688
}
9789

98-
index_t piv_len = cuda::std::min(a_new.Size(RANK - 1), a_new.Size(RANK - 2));
90+
index_t piv_len = cuda::std::min(a.Size(RANK - 1), a.Size(RANK - 2));
9991
s[RANK - 2] = piv_len;
10092

10193
tensor_t<piv_value_type, RANK-1> piv;
@@ -104,13 +96,13 @@ void det_impl(OutputTensor &out, const InputTensor &a,
10496
if constexpr (is_cuda_executor_v<Executor>) {
10597
const auto stream = exec.getStream();
10698
make_tensor(piv, s, MATX_ASYNC_DEVICE_MEMORY, stream);
107-
make_tensor(ac, a_new.Shape(), MATX_ASYNC_DEVICE_MEMORY, stream);
99+
make_tensor(ac, a.Shape(), MATX_ASYNC_DEVICE_MEMORY, stream);
108100
} else {
109101
make_tensor(piv, s, MATX_HOST_MALLOC_MEMORY);
110-
make_tensor(ac, a_new.Shape(), MATX_HOST_MALLOC_MEMORY);
102+
make_tensor(ac, a.Shape(), MATX_HOST_MALLOC_MEMORY);
111103
}
112104

113-
lu_impl(ac, piv, a_new, exec);
105+
lu_impl(ac, piv, a, exec);
114106

115107
// Determinant sign adjustment based on piv permutation
116108
// Create indices corresponding to no permutation to compare against

0 commit comments

Comments
 (0)