HPC handling
This wiki was started during the NVIDIA Hackathon in collaboration with HZB in March 2022.
To program a Python function with CUDA, we can use PyCUDA to define kernels in C-like syntax. For example, we can define the scalar multiplication of a vector as follows:
import pycuda.autoinit
import pycuda.driver as drv
import numpy as np
from pycuda import gpuarray
from pycuda.compiler import SourceModule

# Kernel: each thread scales one element of the input vector.
ker = SourceModule("""
__global__ void scalar_multiply_kernel(float *outvec, float scalar, float *vec)
{
    int i = threadIdx.x;
    outvec[i] = scalar * vec[i];
}
""")
scalar_multiply_gpu = ker.get_function("scalar_multiply_kernel")

# Copy a random test vector to the GPU and allocate the output array.
testvec = np.random.randn(512).astype(np.float32)
testvec_gpu = gpuarray.to_gpu(testvec)
outvec_gpu = gpuarray.empty_like(testvec_gpu)

# Launch one block of 512 threads, one thread per vector element.
scalar_multiply_gpu(outvec_gpu, np.float32(2), testvec_gpu, block=(512, 1, 1), grid=(1, 1, 1))
print("Does our kernel work correctly? : {}".format(np.allclose(outvec_gpu.get(), 2 * testvec)))
#!/usr/bin/env python
import pycuda.autoinit
import pycuda.driver as drv
from pycuda.compiler import SourceModule
from pycuda import gpuarray
from math import sqrt
import numpy as np

# Query device information (BLOCK_SIZE is computed here, but the launch below uses a fixed block of 512 threads).
MAX_THREADS_PER_BLOCK = drv.Device(0).get_attribute(drv.device_attribute.MAX_THREADS_PER_BLOCK)
BLOCK_SIZE = int(sqrt(MAX_THREADS_PER_BLOCK))
print(MAX_THREADS_PER_BLOCK, BLOCK_SIZE)

# Input: ndim points with nvdim coordinates each, flattened to a row-major 1D array for the kernel.
X = np.array([[0, 1, 0], [5, 5, 5], [6, 0, 6], [4, 3, 5]], dtype=np.float32, order='C')
XRM = np.ravel(X)
print(X[0])
print(XRM)
ndim = np.shape(X)[0]
nvdim = np.shape(X)[1]
print(ndim, nvdim)

# Flattened distance matrices for the GPU result and the CPU reference.
dm = np.zeros((ndim, ndim), dtype=np.float32, order='C')
dmRM = np.ravel(dm)
dmRM_CPU = np.zeros((ndim, ndim), dtype=np.float32, order='C')
dmRM_CPU = np.ravel(dmRM_CPU)
print(dm)
print(dmRM)
print(dmRM_CPU)
# Distance matrix, CPU reference version (fills both triangles so it matches the full matrix computed on the GPU).
for i in range(ndim):
    for j in range(i + 1, ndim):
        dist = 0
        for k in range(nvdim):
            ab = XRM[i * nvdim + k] - XRM[j * nvdim + k]
            dist += ab * ab
        dmRM_CPU[i * ndim + j] = np.sqrt(dist)
        dmRM_CPU[j * ndim + i] = dmRM_CPU[i * ndim + j]  # symmetric entry
print(dmRM_CPU)
kerneltemplate = SourceModule("""
#include <math.h>
// Each thread owns one point (index idx, i.e. one row of the distance matrix).
// For every reference row r, the block stages x[r] in shared memory and every
// thread accumulates its squared Euclidean distance to that row.
__global__ void
gpuDM (float *dm, float *x, int n_dim, int n_vdim){
    extern __shared__ float a[];
    float ab, dist;
    int tidx;
    int idx = threadIdx.x + blockDim.x*blockIdx.x;
    for (int r = 0; r < n_dim; r++) {
        dist = 0;
        // Cooperatively load row r of x into shared memory, strided over the block.
        for (int i = 0; i <= n_vdim/blockDim.x; i++){
            tidx = i*blockDim.x + threadIdx.x;
            if (tidx < n_vdim)
                a[tidx] = x[r*n_vdim + tidx];
        }
        __syncthreads();
        // Squared distance between point idx and the staged row r.
        for (int i = 0; i < n_vdim && idx < n_dim; i++){
            ab = a[i] - x[idx*n_vdim + i];
            dist += ab*ab;
        }
        if (idx < n_dim){
            dm[idx*n_dim + r] = sqrtf(dist);
        }
        __syncthreads();  // finish with a[] before the next row is loaded
    }
}
""")
dm_gpu = kerneltemplate.get_function("gpuDM")
XRM_gpu = gpuarray.to_gpu(XRM)
dmRM_gpu = gpuarray.empty(shape=dmRM.shape, dtype=np.float32)
# Dynamic shared memory size in bytes; the kernel only needs one row
# (nvdim float32 values), so this allocates more than strictly necessary.
smem_size = ndim * nvdim * 4
dm_gpu(dmRM_gpu, XRM_gpu, np.int32(ndim), np.int32(nvdim), block=(512, 1, 1), grid=(1, 1, 1), shared=smem_size)
print("Does our kernel work correctly? : {}".format(np.allclose(dmRM_gpu.get(), dmRM_CPU)))
To talk to a supercomputer we use SLURM. As a test, request an interactive GPU session on your SLURM system (partition names such as all-gpu are site specific):
srun -N 1 --ntasks=4 --mem=10G -p all-gpu --gres=gpu:1 --time=24:00:00 --pty bash
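For non-interactive work, the same resources can be requested in a batch script and submitted with sbatch. A minimal sketch, assuming the same all-gpu partition and a hypothetical script name distance_matrix.py (adjust both to your site):
#!/bin/bash
#SBATCH --nodes=1
#SBATCH --ntasks=4
#SBATCH --mem=10G
#SBATCH --partition=all-gpu
#SBATCH --gres=gpu:1
#SBATCH --time=24:00:00

# Run the PyCUDA example on the allocated GPU node (distance_matrix.py is a placeholder).
python distance_matrix.py
Submit it with sbatch job.sh, where job.sh is the file containing the lines above.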