Skip to content

Backwards nim compat, more examples, and check update #22

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 34 commits into from
Oct 14, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
34 commits
Select commit. Hold Shift + click to select a range.
56c2e91
initial commit - all libraries will compile, except for cudnn. Actual…
lilkeet Aug 26, 2024
fb71271
compiles, functionality untested. last commit used some files from 8.…
lilkeet Sep 13, 2024
51a9d2f
random example working
lilkeet Sep 13, 2024
949b526
removed pagerank example since nvgraph is no longer in cuda
lilkeet Sep 14, 2024
0cf6d60
sparse example works
lilkeet Sep 14, 2024
eb76e29
fft example works
lilkeet Sep 14, 2024
64820b7
refinement of postprocessor and c2nim directives
lilkeet Sep 14, 2024
50d32ef
more discardable procs
lilkeet Sep 14, 2024
300307b
restructure to have both versions 8.0 and 12.5
lilkeet Sep 14, 2024
b912b83
restructure in a way that nimble likes, adapt nimble file to support …
lilkeet Sep 15, 2024
f90e29c
implement documentation generation (index file wont build for 12.5, b…
lilkeet Sep 15, 2024
0cbd7ac
update task description to match implementation
lilkeet Sep 15, 2024
91879be
Update README.md
lilkeet Sep 15, 2024
a162c07
Merge branch 'SciNim:master' into master
lilkeet Oct 11, 2024
3e81788
remove compiled examples (https://github.com/SciNim/nimcuda/pull/17#i…
lilkeet Oct 11, 2024
c746731
better gitignore to avoid more compiled example uploads
lilkeet Oct 11, 2024
08d5055
add back pagerank example (only available for cuda 8.0)
lilkeet Oct 11, 2024
489ee11
ignore docs
lilkeet Oct 11, 2024
99cd166
deleted nvgraph from 12.5, was left over from 8.0. (nvgraph no longer…
lilkeet Oct 12, 2024
baa490e
fix pagerank example task
lilkeet Oct 12, 2024
aaba104
new cublas example (issue #19)
lilkeet Oct 12, 2024
ebb6f73
remove path type usage on windows
lilkeet Oct 12, 2024
e1dc840
new cusolverDn example (issue #19)
lilkeet Oct 12, 2024
e6a69f9
more consistent interface for version exclusive tasks
lilkeet Oct 12, 2024
f923021
new cusolverSp example (issue #19)
lilkeet Oct 12, 2024
a97a1ea
check template -> func, for better stack tracing
lilkeet Oct 12, 2024
38f0bef
update required nim version
lilkeet Oct 13, 2024
aec143c
1.6.x bugfix, cant inc csize_t
lilkeet Oct 13, 2024
e7377d2
nim 1.6.x bugfix, type inference was not as good
lilkeet Oct 13, 2024
883a117
another type inference bugfix
lilkeet Oct 13, 2024
fd428d2
another type inference bugfix
lilkeet Oct 13, 2024
a49f19e
update required nim version
lilkeet Oct 13, 2024
2295bfc
nim 1.4.x compat, didnt have std/enumutils
lilkeet Oct 13, 2024
b616638
replaced csize with csize_t in outdated 8.0 files
lilkeet Oct 13, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 13 additions & 5 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,6 +1,14 @@
nimcache
examples/fft
examples/pagerank
examples/sparse
examples/random
.vscode

# ignore all files except nim source files in the examples dir
examples/cuda12_5/*
!examples/cuda12_5/*.nim

# ignore all files except nim source files in the examples dir
examples/cuda8_0/*
!examples/cuda8_0/*.nim

#ignore docs
htmldocs

.vscode
74 changes: 74 additions & 0 deletions examples/cuda12_5/blas.nim
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
# Link against the cuBLAS and CUDA runtime libraries

import
std / [strformat],
../../src/nimcuda/cuda12_5/[cuda_runtime_api, cublas_api,
driver_types, check]



proc main() =
  ## Demonstrates a single-precision matrix multiply (SGEMM) with cuBLAS:
  ## computes C = alpha * A * B + beta * C for 2x2 column-major matrices
  ## and prints the result.
  var handle: cublasHandle_t

  # Initialize the cuBLAS library context
  check cublasCreate_v2(addr handle)

  # Matrix dimensions
  const
    m = 2 # Rows of A and C
    n = 2 # Columns of B and C
    k = 2 # Columns of A and rows of B

  # Byte size of `count` cfloat elements, in the type the CUDA runtime expects
  template floatBytes(count: int): culong = culong(count * sizeof(cfloat))

  # Host matrices (column-major order, as cuBLAS expects)
  var h_A: array[m*k, cfloat] = [cfloat 1.0, 2.0,
                                 3.0, 4.0]
  var h_B: array[k*n, cfloat] = [cfloat 5.0, 6.0,
                                 7.0, 8.0]
  var h_C: array[m*n, cfloat] = [cfloat 0.0, 0.0,
                                 0.0, 0.0]

  # Scalars for the operation (alpha scales A*B, beta scales the initial C)
  var alpha: cfloat = 1.0
  var beta: cfloat = 0.0

  # Device buffers
  var d_A, d_B, d_C: pointer
  check cudaMalloc(addr d_A, floatBytes(m*k))
  check cudaMalloc(addr d_B, floatBytes(k*n))
  check cudaMalloc(addr d_C, floatBytes(m*n))

  # Upload the input matrices to the device
  check cudaMemcpy(d_A, addr h_A[0], floatBytes(m*k), cudaMemcpyHostToDevice)
  check cudaMemcpy(d_B, addr h_B[0], floatBytes(k*n), cudaMemcpyHostToDevice)

  # C = alpha * A * B + beta * C (no transposition of A or B)
  check cublasSgemm_v2(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, addr alpha,
    cast[ptr cfloat](d_A), m, cast[ptr cfloat](d_B), k, addr beta,
    cast[ptr cfloat](d_C), m)

  # Download the result back to the host
  check cudaMemcpy(addr h_C[0], d_C, floatBytes(m*n), cudaMemcpyDeviceToHost)

  # Display the result row by row (h_C is column-major: element (i,j) is
  # at h_C[i + j*m])
  echo "Result matrix C:"
  for i in 0..<m:
    var rowStr = ""
    for j in 0..<n:
      rowStr.add fmt"{h_C[i + j*m]:^6.1f} "
    echo rowStr

  # Release device memory and the cuBLAS context
  check cudaFree(d_A)
  check cudaFree(d_B)
  check cudaFree(d_C)
  check cublasDestroy_v2(handle)

main()

96 changes: 96 additions & 0 deletions examples/cuda12_5/denseLinearSystem.nim
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@

import
std / [strformat],
../../src/nimcuda/cuda12_5/[driver_types, cusolver_common,
cusolverDn, cuda_runtime_api, check, cublas_api]

proc main() =
  ## Solves the dense 3x3 linear system A*x = b with cuSOLVER: LU
  ## factorisation (Sgetrf) followed by a triangular solve (Sgetrs).
  ## The data below is chosen so the exact solution is x = [1, 2, 3].
  var handle: cusolverDnHandle_t

  # Initialize cuSOLVER library. Uses `check` (which raises on failure) for
  # consistency with the other examples instead of a manual status test.
  check cusolverDnCreate(addr handle)
  defer: check cusolverDnDestroy(handle)

  # Matrix dimensions
  const n = 3 # Number of equations and variables

  # Host matrix A and right-hand side vector b (column-major order)
  var h_A: array[0..(n*n)-1, cfloat] = [
    cfloat 3.0, 1.0, 1.0, # First column
    2.0, 2.0, 1.0, # Second column
    1.0, 1.0, 1.0 # Third column
  ]

  var h_b: array[0..n-1, cfloat] = [cfloat 10, 8, 6] # Right-hand side vector

  # Device pointers
  var d_A, d_b: pointer
  var devIpiv: pointer # Pivot array
  var devInfo: pointer # Info output

  # Allocate device memory. `defer` guarantees each buffer is released on
  # every exit path (the previous version leaked them on the early returns).
  check cudaMalloc(addr d_A, culong(n*n*sizeof(cfloat)))
  defer: check cudaFree(d_A)
  check cudaMalloc(addr d_b, culong(n*sizeof(cfloat)))
  defer: check cudaFree(d_b)
  check cudaMalloc(addr devIpiv, culong(n*sizeof(cint)))
  defer: check cudaFree(devIpiv)
  check cudaMalloc(addr devInfo, culong(sizeof(cint)))
  defer: check cudaFree(devInfo)

  # Copy host data to device
  check cudaMemcpy(d_A, addr h_A[0], culong(n*n*sizeof(cfloat)),
    cudaMemcpyHostToDevice)
  check cudaMemcpy(d_b, addr h_b[0], culong(n*sizeof(cfloat)),
    cudaMemcpyHostToDevice)

  # Query the workspace size (in cfloat elements) for the LU decomposition
  var lwork: cint
  check cusolverDnSgetrf_bufferSize(handle, n, n, cast[ptr cfloat](d_A), n,
    addr lwork)

  # Allocate workspace
  var d_Workspace: pointer
  check cudaMalloc(addr d_Workspace, culong(lwork*sizeof(cfloat)))
  defer: check cudaFree(d_Workspace)

  # Perform LU decomposition in place on d_A; pivot indices go to devIpiv
  check cusolverDnSgetrf(handle, n, n, cast[ptr cfloat](d_A), n,
    cast[ptr cfloat](d_Workspace), cast[ptr cint](devIpiv),
    cast[ptr cint](devInfo))

  # Check devInfo after getrf (non-zero means the factorisation failed)
  var h_info: cint
  check cudaMemcpy(addr h_info, devInfo, culong(sizeof(cint)),
    cudaMemcpyDeviceToHost)
  if h_info != 0:
    echo "LU decomposition failed, info = ", h_info
    return

  # Solve the system A*x = b; the solution overwrites d_b
  check cusolverDnSgetrs(handle, CUBLAS_OP_N, n, 1, cast[ptr cfloat](d_A), n,
    cast[ptr cint](devIpiv), cast[ptr cfloat](d_b), n, cast[ptr cint](devInfo))

  # Check devInfo after getrs
  check cudaMemcpy(addr h_info, devInfo, culong(sizeof(cint)),
    cudaMemcpyDeviceToHost)
  if h_info != 0:
    echo "Solving the linear system failed, info = ", h_info
    return

  # Copy result back to host
  check cudaMemcpy(addr h_b[0], d_b, culong(n*sizeof(cfloat)),
    cudaMemcpyDeviceToHost)

  # Display the result (expected: x = [1, 2, 3])
  echo "Solution vector x:"
  for i in 0..<n:
    echo fmt" x[{i}] = {h_b[i]:^6.4f}"

main()
Binary file removed examples/cuda12_5/fft
Binary file not shown.
Binary file removed examples/cuda12_5/random
Binary file not shown.
Binary file removed examples/cuda12_5/sparse
Binary file not shown.
101 changes: 101 additions & 0 deletions examples/cuda12_5/sparseLinearSystem.nim
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@

# Link against the cuSOLVER, cuSPARSE, and CUDA runtime libraries
{.passL: "-lcusolver -lcusparse -lcudart".}

import
std / [strformat],
../../src/nimcuda/cuda12_5/[cuda_runtime_api, driver_types, cusolverSp,
check, cusparse]


proc main() =
  ## Solves the sparse linear system A*x = b with cuSOLVER's sparse QR
  ## solver (csrlsvqr), where A is stored in CSR format. The data is chosen
  ## so the exact solution is x = [1, 2, 3].
  var handle: cusolverSpHandle_t
  var descrA: cusparseMatDescr_t

  # Initialize cuSOLVER Sparse library
  check cusolverSpCreate(addr handle)

  # Create matrix descriptor
  check cusparseCreateMatDescr(addr descrA)

  # Matrix dimensions and number of non-zero elements.
  # BUGFIX: the matrix below has exactly 5 non-zeros; the previous arrays
  # declared nnz = 7 with duplicate column indices in row 2 (cols [1,2,1,2],
  # vals [7,8,7,8]), which is invalid CSR and made the effective third row
  # [0 14 16] instead of [0 7 8].
  const m = 3 # Number of rows
  const n = 3 # Number of columns
  const nnz = 5 # Number of non-zero elements

  # Host representation of the sparse matrix A in CSR format
  # A = [ 10 0 0
  # 3 9 0
  # 0 7 8 ]

  # Row pointers: row i occupies entries h_csrRowPtrA[i]..<h_csrRowPtrA[i+1]
  var h_csrRowPtrA: array[0..m, cint] = [cint 0, 1, 3, 5]

  # Column indices (ascending within each row, no duplicates)
  var h_csrColIndA: array[0..nnz-1, cint] = [cint 0, 0, 1, 1, 2]

  # Non-zero values
  var h_csrValA: array[0..nnz-1, cfloat] = [cfloat 10.0, 3.0, 9.0, 7.0, 8.0]

  # Right-hand side vector b (chosen so that x = [1, 2, 3])
  var h_b: array[0..m-1, cfloat] = [cfloat 10.0, 21.0, 38.0]

  # Solution vector x
  var h_x: array[0..m-1, cfloat] = [cfloat 0.0, 0.0, 0.0]

  # Device pointers
  var d_csrRowPtrA, d_csrColIndA, d_csrValA, d_b, d_x: pointer

  # Allocate device memory
  check cudaMalloc(addr d_csrRowPtrA, culong((m+1)*sizeof(cint)))
  check cudaMalloc(addr d_csrColIndA, culong(nnz*sizeof(cint)))
  check cudaMalloc(addr d_csrValA, culong(nnz*sizeof(cfloat)))
  check cudaMalloc(addr d_b, culong(m*sizeof(cfloat)))
  check cudaMalloc(addr d_x, culong(n*sizeof(cfloat)))

  # Copy host data to device
  check cudaMemcpy(d_csrRowPtrA, addr h_csrRowPtrA[0],
    culong((m+1)*sizeof(cint)), cudaMemcpyHostToDevice)
  check cudaMemcpy(d_csrColIndA, addr h_csrColIndA[0], culong(nnz*sizeof(cint)),
    cudaMemcpyHostToDevice)
  check cudaMemcpy(d_csrValA, addr h_csrValA[0], culong(nnz*sizeof(cfloat)),
    cudaMemcpyHostToDevice)
  check cudaMemcpy(d_b, addr h_b[0], culong(m*sizeof(cfloat)),
    cudaMemcpyHostToDevice)

  # Tolerance for the solver and reorder parameter
  const tol: cfloat = 1e-6
  const reorder: cint = 0 # No reordering

  # Position of the first zero pivot; stays negative when A is non-singular
  var singularity: cint

  # Solve the sparse linear system A*x = b via QR factorisation
  check cusolverSpScsrlsvQr(handle, m, nnz, descrA, cast[ptr cfloat](d_csrValA),
    cast[ptr cint](d_csrRowPtrA), cast[ptr cint](d_csrColIndA),
    cast[ptr cfloat](d_b), tol, reorder, cast[ptr cfloat](d_x),
    addr singularity)

  if singularity >= 0:
    echo "A is singular at row ", singularity
    return

  # Copy result back to host
  check cudaMemcpy(addr h_x[0], d_x, culong(n*sizeof(cfloat)),
    cudaMemcpyDeviceToHost)

  # Display the result (expected: x = [1, 2, 3])
  echo "Solution vector x:"
  for i in 0..<n:
    echo fmt" x[{i}] = {h_x[i]:^6.4f}"

  # Clean up resources
  check cudaFree(d_csrRowPtrA)
  check cudaFree(d_csrColIndA)
  check cudaFree(d_csrValA)
  check cudaFree(d_b)
  check cudaFree(d_x)
  check cusparseDestroyMatDescr(descrA)
  check cusolverSpDestroy(handle)

main()
Loading