Skip to content

Backwards nim compat, more examples, and check update #22

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 34 commits into from
Oct 14, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
34 commits
Select commit. Hold Shift + click to select a range.
56c2e91
initial commit - all libraries will compile, except for cudnn. Actual…
lilkeet Aug 26, 2024
fb71271
compiles, functionality untested. last commit used some files from 8.…
lilkeet Sep 13, 2024
51a9d2f
random example working
lilkeet Sep 13, 2024
949b526
removed pagerank example since nvgraph is no longer in cuda
lilkeet Sep 14, 2024
0cf6d60
sparse example works
lilkeet Sep 14, 2024
eb76e29
fft example works
lilkeet Sep 14, 2024
64820b7
refinement of postprocessor and c2nim directives
lilkeet Sep 14, 2024
50d32ef
more discardable procs
lilkeet Sep 14, 2024
300307b
restructure to have both versions 8.0 and 12.5
lilkeet Sep 14, 2024
b912b83
restructure in a way that nimble likes, adapt nimble file to support …
lilkeet Sep 15, 2024
f90e29c
implement documentation generation (index file wont build for 12.5, b…
lilkeet Sep 15, 2024
0cbd7ac
update task description to match implementation
lilkeet Sep 15, 2024
91879be
Update README.md
lilkeet Sep 15, 2024
a162c07
Merge branch 'SciNim:master' into master
lilkeet Oct 11, 2024
3e81788
remove compiled examples (https://github.com/SciNim/nimcuda/pull/17#i…
lilkeet Oct 11, 2024
c746731
better gitignore to avoid more compiled example uploads
lilkeet Oct 11, 2024
08d5055
add back pagerank example (only available for cuda 8.0)
lilkeet Oct 11, 2024
489ee11
ignore docs
lilkeet Oct 11, 2024
99cd166
deleted nvgraph from 12.5, was left over from 8.0. (nvgraph no longer…
lilkeet Oct 12, 2024
baa490e
fix pagerank example task
lilkeet Oct 12, 2024
aaba104
new cublas example (issue #19)
lilkeet Oct 12, 2024
ebb6f73
remove path type usage on windows
lilkeet Oct 12, 2024
e1dc840
new cusolverDn example (issue #19)
lilkeet Oct 12, 2024
e6a69f9
more consistent interface for version exclusive tasks
lilkeet Oct 12, 2024
f923021
new cusolverSp example (issue #19)
lilkeet Oct 12, 2024
a97a1ea
check template -> func, for better stack tracing
lilkeet Oct 12, 2024
38f0bef
update required nim version
lilkeet Oct 13, 2024
aec143c
1.6.x bugfix, cant inc csize_t
lilkeet Oct 13, 2024
e7377d2
nim 1.6.x bugfix, type inference was not as good
lilkeet Oct 13, 2024
883a117
another type inference bugfix
lilkeet Oct 13, 2024
fd428d2
another type inference bugfix
lilkeet Oct 13, 2024
a49f19e
update required nim version
lilkeet Oct 13, 2024
2295bfc
nim 1.4.x compat, didnt have std/enumutils
lilkeet Oct 13, 2024
b616638
replaced csize with csize_t in outdated 8.0 files
lilkeet Oct 13, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 13 additions & 5 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,6 +1,14 @@
nimcache
examples/fft
examples/pagerank
examples/sparse
examples/random
.vscode

# ignore all files except nim source files in the examples dir
examples/cuda12_5/*
!examples/cuda12_5/*.nim

# ignore all files except nim source files in the examples dir
examples/cuda8_0/*
!examples/cuda8_0/*.nim

#ignore docs
htmldocs

.vscode
74 changes: 74 additions & 0 deletions examples/cuda12_5/blas.nim
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
# Link against the cuBLAS and CUDA runtime libraries

import
std / [strformat],
../../src/nimcuda/cuda12_5/[cuda_runtime_api, cublas_api,
driver_types, check]



proc main() =
  ## Demonstrates a single-precision matrix multiply (SGEMM) with cuBLAS:
  ## computes C = alpha * A * B + beta * C for 2x2 column-major matrices
  ## and prints the result.
  var handle: cublasHandle_t

  # Initialize the cuBLAS library context
  check cublasCreate_v2(addr handle)

  # Matrix dimensions
  const
    m = 2 # Rows of A and C
    n = 2 # Columns of B and C
    k = 2 # Columns of A and rows of B

  # Byte size of `count` cfloat elements, in the type the CUDA runtime expects
  template floatBytes(count: int): culong = culong(count * sizeof(cfloat))

  # Host matrices (column-major order, as cuBLAS expects)
  var h_A: array[m*k, cfloat] = [cfloat 1.0, 2.0,
                                 3.0, 4.0]
  var h_B: array[k*n, cfloat] = [cfloat 5.0, 6.0,
                                 7.0, 8.0]
  var h_C: array[m*n, cfloat] = [cfloat 0.0, 0.0,
                                 0.0, 0.0]

  # Scalars for the operation (alpha scales A*B, beta scales the initial C)
  var alpha: cfloat = 1.0
  var beta: cfloat = 0.0

  # Device buffers
  var d_A, d_B, d_C: pointer
  check cudaMalloc(addr d_A, floatBytes(m*k))
  check cudaMalloc(addr d_B, floatBytes(k*n))
  check cudaMalloc(addr d_C, floatBytes(m*n))

  # Upload the input matrices to the device
  check cudaMemcpy(d_A, addr h_A[0], floatBytes(m*k), cudaMemcpyHostToDevice)
  check cudaMemcpy(d_B, addr h_B[0], floatBytes(k*n), cudaMemcpyHostToDevice)

  # C = alpha * A * B + beta * C (no transposition of A or B)
  check cublasSgemm_v2(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, addr alpha,
    cast[ptr cfloat](d_A), m, cast[ptr cfloat](d_B), k, addr beta,
    cast[ptr cfloat](d_C), m)

  # Download the result back to the host
  check cudaMemcpy(addr h_C[0], d_C, floatBytes(m*n), cudaMemcpyDeviceToHost)

  # Display the result row by row (h_C is column-major: element (i,j) is
  # at h_C[i + j*m])
  echo "Result matrix C:"
  for i in 0..<m:
    var rowStr = ""
    for j in 0..<n:
      rowStr.add fmt"{h_C[i + j*m]:^6.1f} "
    echo rowStr

  # Release device memory and the cuBLAS context
  check cudaFree(d_A)
  check cudaFree(d_B)
  check cudaFree(d_C)
  check cublasDestroy_v2(handle)

main()

96 changes: 96 additions & 0 deletions examples/cuda12_5/denseLinearSystem.nim
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@

import
std / [strformat],
../../src/nimcuda/cuda12_5/[driver_types, cusolver_common,
cusolverDn, cuda_runtime_api, check, cublas_api]

proc main() =
  ## Solves the dense 3x3 linear system A*x = b with cuSOLVER: LU
  ## factorisation (Sgetrf) followed by a triangular solve (Sgetrs).
  ## The data below is chosen so the exact solution is x = [1, 2, 3].
  var handle: cusolverDnHandle_t

  # Initialize cuSOLVER library. Uses `check` (which raises on failure) for
  # consistency with the other examples instead of a manual status test.
  check cusolverDnCreate(addr handle)
  defer: check cusolverDnDestroy(handle)

  # Matrix dimensions
  const n = 3 # Number of equations and variables

  # Host matrix A and right-hand side vector b (column-major order)
  var h_A: array[0..(n*n)-1, cfloat] = [
    cfloat 3.0, 1.0, 1.0, # First column
    2.0, 2.0, 1.0, # Second column
    1.0, 1.0, 1.0 # Third column
  ]

  var h_b: array[0..n-1, cfloat] = [cfloat 10, 8, 6] # Right-hand side vector

  # Device pointers
  var d_A, d_b: pointer
  var devIpiv: pointer # Pivot array
  var devInfo: pointer # Info output

  # Allocate device memory. `defer` guarantees each buffer is released on
  # every exit path (the previous version leaked them on the early returns).
  check cudaMalloc(addr d_A, culong(n*n*sizeof(cfloat)))
  defer: check cudaFree(d_A)
  check cudaMalloc(addr d_b, culong(n*sizeof(cfloat)))
  defer: check cudaFree(d_b)
  check cudaMalloc(addr devIpiv, culong(n*sizeof(cint)))
  defer: check cudaFree(devIpiv)
  check cudaMalloc(addr devInfo, culong(sizeof(cint)))
  defer: check cudaFree(devInfo)

  # Copy host data to device
  check cudaMemcpy(d_A, addr h_A[0], culong(n*n*sizeof(cfloat)),
    cudaMemcpyHostToDevice)
  check cudaMemcpy(d_b, addr h_b[0], culong(n*sizeof(cfloat)),
    cudaMemcpyHostToDevice)

  # Query the workspace size (in cfloat elements) for the LU decomposition
  var lwork: cint
  check cusolverDnSgetrf_bufferSize(handle, n, n, cast[ptr cfloat](d_A), n,
    addr lwork)

  # Allocate workspace
  var d_Workspace: pointer
  check cudaMalloc(addr d_Workspace, culong(lwork*sizeof(cfloat)))
  defer: check cudaFree(d_Workspace)

  # Perform LU decomposition in place on d_A; pivot indices go to devIpiv
  check cusolverDnSgetrf(handle, n, n, cast[ptr cfloat](d_A), n,
    cast[ptr cfloat](d_Workspace), cast[ptr cint](devIpiv),
    cast[ptr cint](devInfo))

  # Check devInfo after getrf (non-zero means the factorisation failed)
  var h_info: cint
  check cudaMemcpy(addr h_info, devInfo, culong(sizeof(cint)),
    cudaMemcpyDeviceToHost)
  if h_info != 0:
    echo "LU decomposition failed, info = ", h_info
    return

  # Solve the system A*x = b; the solution overwrites d_b
  check cusolverDnSgetrs(handle, CUBLAS_OP_N, n, 1, cast[ptr cfloat](d_A), n,
    cast[ptr cint](devIpiv), cast[ptr cfloat](d_b), n, cast[ptr cint](devInfo))

  # Check devInfo after getrs
  check cudaMemcpy(addr h_info, devInfo, culong(sizeof(cint)),
    cudaMemcpyDeviceToHost)
  if h_info != 0:
    echo "Solving the linear system failed, info = ", h_info
    return

  # Copy result back to host
  check cudaMemcpy(addr h_b[0], d_b, culong(n*sizeof(cfloat)),
    cudaMemcpyDeviceToHost)

  # Display the result (expected: x = [1, 2, 3])
  echo "Solution vector x:"
  for i in 0..<n:
    echo fmt" x[{i}] = {h_b[i]:^6.4f}"

main()
Binary file removed examples/cuda12_5/fft
Binary file not shown.
Binary file removed examples/cuda12_5/random
Binary file not shown.
Binary file removed examples/cuda12_5/sparse
Binary file not shown.
101 changes: 101 additions & 0 deletions examples/cuda12_5/sparseLinearSystem.nim
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@

# Link against the cuSOLVER, cuSPARSE, and CUDA runtime libraries
{.passL: "-lcusolver -lcusparse -lcudart".}

import
std / [strformat],
../../src/nimcuda/cuda12_5/[cuda_runtime_api, driver_types, cusolverSp,
check, cusparse]


proc main() =
  ## Solves the sparse linear system A*x = b with cuSOLVER's sparse QR
  ## solver (csrlsvqr), where A is stored in CSR format. The data is chosen
  ## so the exact solution is x = [1, 2, 3].
  var handle: cusolverSpHandle_t
  var descrA: cusparseMatDescr_t

  # Initialize cuSOLVER Sparse library
  check cusolverSpCreate(addr handle)

  # Create matrix descriptor
  check cusparseCreateMatDescr(addr descrA)

  # Matrix dimensions and number of non-zero elements.
  # BUGFIX: the matrix below has exactly 5 non-zeros; the previous arrays
  # declared nnz = 7 with duplicate column indices in row 2 (cols [1,2,1,2],
  # vals [7,8,7,8]), which is invalid CSR and made the effective third row
  # [0 14 16] instead of [0 7 8].
  const m = 3 # Number of rows
  const n = 3 # Number of columns
  const nnz = 5 # Number of non-zero elements

  # Host representation of the sparse matrix A in CSR format
  # A = [ 10 0 0
  # 3 9 0
  # 0 7 8 ]

  # Row pointers: row i occupies entries h_csrRowPtrA[i]..<h_csrRowPtrA[i+1]
  var h_csrRowPtrA: array[0..m, cint] = [cint 0, 1, 3, 5]

  # Column indices (ascending within each row, no duplicates)
  var h_csrColIndA: array[0..nnz-1, cint] = [cint 0, 0, 1, 1, 2]

  # Non-zero values
  var h_csrValA: array[0..nnz-1, cfloat] = [cfloat 10.0, 3.0, 9.0, 7.0, 8.0]

  # Right-hand side vector b (chosen so that x = [1, 2, 3])
  var h_b: array[0..m-1, cfloat] = [cfloat 10.0, 21.0, 38.0]

  # Solution vector x
  var h_x: array[0..m-1, cfloat] = [cfloat 0.0, 0.0, 0.0]

  # Device pointers
  var d_csrRowPtrA, d_csrColIndA, d_csrValA, d_b, d_x: pointer

  # Allocate device memory
  check cudaMalloc(addr d_csrRowPtrA, culong((m+1)*sizeof(cint)))
  check cudaMalloc(addr d_csrColIndA, culong(nnz*sizeof(cint)))
  check cudaMalloc(addr d_csrValA, culong(nnz*sizeof(cfloat)))
  check cudaMalloc(addr d_b, culong(m*sizeof(cfloat)))
  check cudaMalloc(addr d_x, culong(n*sizeof(cfloat)))

  # Copy host data to device
  check cudaMemcpy(d_csrRowPtrA, addr h_csrRowPtrA[0],
    culong((m+1)*sizeof(cint)), cudaMemcpyHostToDevice)
  check cudaMemcpy(d_csrColIndA, addr h_csrColIndA[0], culong(nnz*sizeof(cint)),
    cudaMemcpyHostToDevice)
  check cudaMemcpy(d_csrValA, addr h_csrValA[0], culong(nnz*sizeof(cfloat)),
    cudaMemcpyHostToDevice)
  check cudaMemcpy(d_b, addr h_b[0], culong(m*sizeof(cfloat)),
    cudaMemcpyHostToDevice)

  # Tolerance for the solver and reorder parameter
  const tol: cfloat = 1e-6
  const reorder: cint = 0 # No reordering

  # Position of the first zero pivot; stays negative when A is non-singular
  var singularity: cint

  # Solve the sparse linear system A*x = b via QR factorisation
  check cusolverSpScsrlsvQr(handle, m, nnz, descrA, cast[ptr cfloat](d_csrValA),
    cast[ptr cint](d_csrRowPtrA), cast[ptr cint](d_csrColIndA),
    cast[ptr cfloat](d_b), tol, reorder, cast[ptr cfloat](d_x),
    addr singularity)

  if singularity >= 0:
    echo "A is singular at row ", singularity
    return

  # Copy result back to host
  check cudaMemcpy(addr h_x[0], d_x, culong(n*sizeof(cfloat)),
    cudaMemcpyDeviceToHost)

  # Display the result (expected: x = [1, 2, 3])
  echo "Solution vector x:"
  for i in 0..<n:
    echo fmt" x[{i}] = {h_x[i]:^6.4f}"

  # Clean up resources
  check cudaFree(d_csrRowPtrA)
  check cudaFree(d_csrColIndA)
  check cudaFree(d_csrValA)
  check cudaFree(d_b)
  check cudaFree(d_x)
  check cusparseDestroyMatDescr(descrA)
  check cusolverSpDestroy(handle)

main()
Loading