MPS with cuQuantum #2168

Open

wants to merge 30 commits into main from mps-cutensor

Commits (30)
563ae6e  initial layout (MozammilQ, Jun 5, 2024)
280f868  refactor code (MozammilQ, Jun 5, 2024)
ae44c69  refactor code (MozammilQ, Jun 6, 2024)
5b48265  refactor code (MozammilQ, Jun 6, 2024)
517a554  refactor code (MozammilQ, Jun 6, 2024)
7e40588  refactor code (MozammilQ, Jun 6, 2024)
ebf9ca0  refactor code (MozammilQ, Jun 6, 2024)
a422690  refactor code (MozammilQ, Jun 6, 2024)
80b59d5  refactor code (MozammilQ, Jun 6, 2024)
52f1ed4  refactor code (MozammilQ, Jun 7, 2024)
ed43e71  refactor code (MozammilQ, Jun 9, 2024)
649a5d7  refactor code (MozammilQ, Jun 9, 2024)
83e4b5e  Merge branch 'main' into mps-cutensor (doichanj, Jun 10, 2024)
c33571f  refactor code (MozammilQ, Jun 11, 2024)
abc5552  Merge branch 'main' into mps-cutensor (doichanj, Jun 14, 2024)
f0205e3  refactor code (MozammilQ, Jun 14, 2024)
629f65f  refactor code (MozammilQ, Jun 15, 2024)
644a822  added release note (MozammilQ, Jun 16, 2024)
e6f2288  refactor code (MozammilQ, Jun 17, 2024)
42f983e  Merge branch 'Qiskit:main' into mps-cutensor (MozammilQ, Jun 17, 2024)
c24b9e2  refactor code (MozammilQ, Jun 18, 2024)
34e9502  refactor code (MozammilQ, Jun 18, 2024)
00f88e9  refactor code; included test (MozammilQ, Jun 18, 2024)
454f8c0  lint (MozammilQ, Jun 18, 2024)
985c7f2  added suggestion (MozammilQ, Jun 18, 2024)
7ffab7d  Merge branch 'main' into mps-cutensor (doichanj, Jul 4, 2024)
6b0b41d  Merge branch 'main' into mps-cutensor (MozammilQ, Aug 30, 2024)
34a5e75  fixed a typo (MozammilQ, Aug 31, 2024)
a1ae308  refactor code (MozammilQ, Sep 10, 2024)
859e946  Merge branch 'Qiskit:main' into mps-cutensor (MozammilQ, Oct 4, 2024)
40 changes: 40 additions & 0 deletions releasenotes/notes/mps-svd-with-cuquantum-c0392854d1f373e0.yaml
@@ -0,0 +1,40 @@
---
features:
- |
This PR adds the ability to run matrix product state simulation on NVIDIA GPUs.
To be precise, it offloads the singular value decomposition (SVD) required by
the matrix product state method to NVIDIA GPUs with the help of cuQuantum.

Users configure the matrix product state backend as usual, and can now also
set the device to GPU.

Example:

.. code-block:: python

from qiskit_aer import AerSimulator
from qiskit.circuit import QuantumCircuit
from qiskit.compiler import transpile

num_qubits = 10
shots = 5

qc = QuantumCircuit(num_qubits)
qc.h(0)

for control, target in zip(range(num_qubits-1), range(1, num_qubits)):
qc.cx(control, target)

qc.measure_all()

sim = AerSimulator(method="matrix_product_state", device="GPU")
qc_t = transpile(qc, backend=sim)
job = sim.run(qc_t, shots=shots)

counts = job.result().get_counts()
print(counts)

13 changes: 13 additions & 0 deletions src/simulators/matrix_product_state/matrix_product_state.hpp
@@ -360,6 +360,19 @@ void State::set_config(const Config &config) {

// Set LAPACK SVD
MPS::set_mps_lapack_svd(config.mps_lapack);

// Set device for SVD
MPS::set_mps_svd_device(config.device);

// Get CUDA device, if GPU offloading enabled
if (config.device.compare("GPU") == 0) {
#ifdef AER_THRUST_CUDA
cudaDeviceProp prop;
int deviceId{-1};
HANDLE_CUDA_ERROR(cudaGetDevice(&deviceId));
HANDLE_CUDA_ERROR(cudaGetDeviceProperties(&prop, deviceId));
#endif // AER_THRUST_CUDA
}
}

void State::add_metadata(ExperimentResult &result) const {
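For reference, the `HANDLE_CUDA_ERROR` macro used above is not shown in this diff; it is defined elsewhere in Aer. A minimal sketch of the kind of error-check macro it is assumed to be (the name `CHECK_CUDA` here is hypothetical):

```cpp
// Hypothetical sketch; Aer's actual HANDLE_CUDA_ERROR may differ.
#include <cuda_runtime.h>
#include <stdexcept>
#include <string>

#define CHECK_CUDA(call)                                                    \
  do {                                                                      \
    cudaError_t err_ = (call);                                              \
    if (err_ != cudaSuccess) {                                              \
      throw std::runtime_error(std::string("CUDA error: ") +                \
                               cudaGetErrorString(err_));                   \
    }                                                                       \
  } while (0)
```

Querying the CUDA device with checked calls at configuration time makes a missing or unusable GPU fail fast in `set_config` rather than deep inside the simulation.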
src/simulators/matrix_product_state/matrix_product_state_internal.cpp
@@ -19,6 +19,7 @@
#include "stdlib.h"
#include "string.h"
#include <iostream>
#include <string>
#include <utility>

#include "framework/linalg/almost_equal.hpp"
@@ -45,6 +46,9 @@ double MPS::json_chop_threshold_ = 1E-8;
std::stringstream MPS::logging_str_;
bool MPS::mps_log_data_ = 0;
bool MPS::mps_lapack_ = false;
#ifdef AER_THRUST_CUDA
std::string MPS::mps_svd_device_;
#endif // AER_THRUST_CUDA

//------------------------------------------------------------------------
// local function declarations
@@ -663,8 +667,14 @@ void MPS::common_apply_2_qubit_gate(

MPS_Tensor left_gamma, right_gamma;
rvector_t lambda;
#ifdef AER_THRUST_CUDA
double discarded_value = MPS_Tensor::Decompose(
temp, left_gamma, lambda, right_gamma, MPS::mps_lapack_,
MPS::mps_svd_device_, cuda_stream, cutensor_handle);
#else
double discarded_value = MPS_Tensor::Decompose(temp, left_gamma, lambda,
right_gamma, MPS::mps_lapack_);
#endif // AER_THRUST_CUDA

if (discarded_value > json_chop_threshold_)
MPS::print_to_log("discarded_value=", discarded_value, ", ");
@@ -1803,7 +1813,18 @@ void MPS::initialize_from_matrix(uint_t num_qubits, const cmatrix_t &mat) {
// step 2 - SVD
S.clear();
S.resize(std::min(reshaped_matrix.GetRows(), reshaped_matrix.GetColumns()));

#ifdef AER_THRUST_CUDA
if (MPS::mps_svd_device_.compare("GPU") == 0) {
cutensor_csvd_wrapper(reshaped_matrix, U, S, V, cuda_stream,
cutensor_handle);
} else {
csvd_wrapper(reshaped_matrix, U, S, V, MPS::mps_lapack_);
}
#else
csvd_wrapper(reshaped_matrix, U, S, V, MPS::mps_lapack_);
#endif // AER_THRUST_CUDA

reduce_zeros(U, S, V, MPS_Tensor::get_max_bond_dimension(),
MPS_Tensor::get_truncation_threshold(), MPS::mps_lapack_);

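The same compile-time plus runtime SVD dispatch now appears twice in this file (in `common_apply_2_qubit_gate` and `initialize_from_matrix`). A hedged sketch of a helper that could centralize it; `svd_dispatch` is hypothetical, while `csvd_wrapper` and `cutensor_csvd_wrapper` are the wrappers this PR calls:

```cpp
// Hypothetical helper (not part of this PR) centralizing the repeated
// GPU/CPU SVD dispatch shown above.
#ifdef AER_THRUST_CUDA
static void svd_dispatch(cmatrix_t &A, cmatrix_t &U, rvector_t &S,
                         cmatrix_t &V, bool mps_lapack,
                         const std::string &mps_svd_device,
                         cudaStream_t &cuda_stream,
                         cutensornetHandle_t &cutensor_handle) {
  if (mps_svd_device.compare("GPU") == 0) {
    // Offload the SVD to the GPU through cuQuantum's cuTensorNet API.
    cutensor_csvd_wrapper(A, U, S, V, cuda_stream, cutensor_handle);
  } else {
    // Fall back to the existing CPU path (LAPACK or Aer's native SVD).
    csvd_wrapper(A, U, S, V, mps_lapack);
  }
}
#endif // AER_THRUST_CUDA
```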
src/simulators/matrix_product_state/matrix_product_state_internal.hpp
@@ -15,12 +15,12 @@
#ifndef _aer_matrix_product_state_hpp_
#define _aer_matrix_product_state_hpp_

#include "framework/json.hpp"
#include "framework/operations.hpp"
#include "framework/utils.hpp"
#include "matrix_product_state_tensor.hpp"
#include <cstdarg>
#include <string>

namespace AER {
namespace MatrixProductState {
@@ -81,7 +81,14 @@ enum class MPS_swap_direction { SWAP_LEFT, SWAP_RIGHT };

class MPS {
public:
MPS(uint_t num_qubits = 0) : num_qubits_(num_qubits) {
#ifdef AER_THRUST_CUDA
if (mps_svd_device_.compare("GPU") == 0) {
cudaStreamCreate(&cuda_stream);
cutensornetCreate(&cutensor_handle);
}
#endif // AER_THRUST_CUDA
}
~MPS() {}

//--------------------------------------------------------------------------
@@ -321,6 +328,9 @@
}

static void set_mps_lapack_svd(bool mps_lapack) { mps_lapack_ = mps_lapack; }
static void set_mps_svd_device(std::string mps_svd_device) {
mps_svd_device_ = mps_svd_device;
}

static uint_t get_omp_threads() { return omp_threads_; }
static uint_t get_omp_threshold() { return omp_threshold_; }
@@ -544,6 +554,11 @@
std::vector<MPS_Tensor> q_reg_;
std::vector<rvector_t> lambda_reg_;

#ifdef AER_THRUST_CUDA
cudaStream_t cuda_stream;
cutensornetHandle_t cutensor_handle;
#endif // AER_THRUST_CUDA

struct ordering {
// order_ stores the current ordering of the qubits,
// location_ stores the location of each qubit in the vector. It is derived
@@ -570,6 +585,7 @@
static bool mps_log_data_;
static MPS_swap_direction mps_swap_direction_;
static bool mps_lapack_;
static std::string mps_svd_device_;
};

inline std::ostream &operator<<(std::ostream &out, const rvector_t &vec) {
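One observation on the constructor above: it creates a CUDA stream and a cuTensorNet handle when the device is GPU, but the destructor shown (`~MPS() {}`) does not release them. A minimal sketch of the matching teardown, assuming the same members and guards (a suggestion, not code from this PR):

```cpp
// Hypothetical teardown matching the resources created in MPS();
// cudaStreamDestroy and cutensornetDestroy are the documented counterparts
// of cudaStreamCreate and cutensornetCreate.
~MPS() {
#ifdef AER_THRUST_CUDA
  if (mps_svd_device_.compare("GPU") == 0) {
    cutensornetDestroy(cutensor_handle);
    cudaStreamDestroy(cuda_stream);
  }
#endif // AER_THRUST_CUDA
}
```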
src/simulators/matrix_product_state/matrix_product_state_tensor.hpp
@@ -157,12 +157,22 @@
static MPS_Tensor contract(const MPS_Tensor &left_gamma,
const rvector_t &lambda,
const MPS_Tensor &right_gamma, bool mul_by_lambda);
#ifdef AER_THRUST_CUDA
static double Decompose(MPS_Tensor &temp, MPS_Tensor &left_gamma,
rvector_t &lambda, MPS_Tensor &right_gamma,
bool mps_lapack, std::string mps_svd_device,
cudaStream_t &cuda_stream,
cutensornetHandle_t &cutensor_handle);
#else
static double Decompose(MPS_Tensor &temp, MPS_Tensor &left_gamma,
rvector_t &lambda, MPS_Tensor &right_gamma,
bool mps_lapack);
#endif // AER_THRUST_CUDA

static void
reshape_for_3_qubits_before_SVD(const std::vector<cmatrix_t> &data,
MPS_Tensor &reshaped_tensor);

static void contract_2_dimensions(const MPS_Tensor &left_gamma,
const MPS_Tensor &right_gamma,
uint_t omp_threads, cmatrix_t &result);
@@ -591,15 +601,33 @@ void MPS_Tensor::contract_2_dimensions(const MPS_Tensor &left_gamma,
// rvector_t &lambda - tensors for the result.
// Returns: the discarded value from the SVD truncation.
//---------------------------------------------------------------
#ifdef AER_THRUST_CUDA
double MPS_Tensor::Decompose(MPS_Tensor &temp, MPS_Tensor &left_gamma,
rvector_t &lambda, MPS_Tensor &right_gamma,
bool mps_lapack, std::string mps_svd_device,
cudaStream_t &cuda_stream,
cutensornetHandle_t &cutensor_handle)
#else
double MPS_Tensor::Decompose(MPS_Tensor &temp, MPS_Tensor &left_gamma,
rvector_t &lambda, MPS_Tensor &right_gamma,
bool mps_lapack)
#endif // AER_THRUST_CUDA
{
cmatrix_t C;
C = reshape_before_SVD(temp.data_);
cmatrix_t U, V;
rvector_t S(std::min(C.GetRows(), C.GetColumns()));

#ifdef AER_THRUST_CUDA
if (mps_svd_device.compare("GPU") == 0) {
cutensor_csvd_wrapper(C, U, S, V, cuda_stream, cutensor_handle);
} else {
csvd_wrapper(C, U, S, V, mps_lapack);
}
#else
csvd_wrapper(C, U, S, V, mps_lapack);
#endif // AER_THRUST_CUDA

double discarded_value = 0.0;
discarded_value = reduce_zeros(U, S, V, max_bond_dimension_,
truncation_threshold_, mps_lapack);
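After the SVD, `reduce_zeros` truncates small singular values and the discarded weight becomes `Decompose`'s return value. An illustrative, self-contained sketch of such a truncation step (this is not Aer's `reduce_zeros`; the discarded weight is modeled here as the sum of squares of the dropped singular values):

```cpp
#include <algorithm>
#include <cstddef>
#include <vector>

// Illustrative truncation: keep at most max_bond singular values, drop any
// trailing values below threshold, and return the discarded weight.
// Assumes s is sorted in descending order, as an SVD returns it.
double truncate_singular_values(std::vector<double> &s, double threshold,
                                std::size_t max_bond) {
  std::size_t keep = std::min(s.size(), max_bond);
  while (keep > 1 && s[keep - 1] < threshold)
    --keep;
  double discarded = 0.0;
  for (std::size_t i = keep; i < s.size(); ++i)
    discarded += s[i] * s[i];  // weight of the truncated singular values
  s.resize(keep);
  return discarded;
}
```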