
HPC handling

Welcome to the HPC-Wiki

This wiki was started during the Nvidia Hackathon in collaboration with HZB in March 2022.

CUDA concepts

PyCUDA

Scalar multiplication

To program a Python function with CUDA, we can use PyCUDA to define kernels in a C-like manner. For example, we can define the scalar multiplication of a vector via

import pycuda.autoinit
import pycuda.driver as drv
import numpy as np
from pycuda import gpuarray
from pycuda.compiler import SourceModule

ker = SourceModule("""
__global__ void scalar_multiply_kernel(float *outvec, float scalar, float *vec)
{
    // one thread per vector element: threadIdx.x indexes into the single block
    int i = threadIdx.x;
    outvec[i] = scalar*vec[i];
}
""")
scalar_multiply_gpu = ker.get_function("scalar_multiply_kernel")

# 512-element test vector: one CUDA thread per element
testvec = np.random.randn(512).astype(np.float32)
testvec_gpu = gpuarray.to_gpu(testvec)
outvec_gpu = gpuarray.empty_like(testvec_gpu)

# launch one block of 512 threads; scalar arguments must be passed as NumPy scalars
scalar_multiply_gpu(outvec_gpu, np.float32(2), testvec_gpu, block=(512,1,1), grid=(1,1,1))

print( "Does our kernel work correctly? : {}".format(np.allclose(outvec_gpu.get() , 2*testvec) ))

Distance function

#!/usr/bin/env python

import pycuda.autoinit
import pycuda.driver as drv
from pycuda.compiler import SourceModule
from pycuda import gpuarray
from math import sqrt
import numpy as np

# get device information: the maximum number of threads per block on device 0
MAX_THREADS_PER_BLOCK = drv.Device(0).get_attribute(drv.device_attribute.MAX_THREADS_PER_BLOCK)
# side length of a square thread block with that many threads (not used by the 1-D launch below)
BLOCK_SIZE = int(sqrt(MAX_THREADS_PER_BLOCK))
print(MAX_THREADS_PER_BLOCK, BLOCK_SIZE)

# four 3-dimensional points, stored row-major (order='C') and flattened for the kernel
X = np.array([[0, 1, 0], [5, 5, 5], [6, 0, 6], [4, 3, 5]], dtype=np.float32, order='C')
XRM = np.ravel(X)
print(X[0])
print(XRM)
ndim = np.shape(X)[0]    # number of points
nvdim = np.shape(X)[1]   # number of coordinates per point
print(ndim, nvdim)

# flattened distance matrices: one filled on the GPU, one on the CPU as a reference
dm = np.zeros((ndim, ndim), dtype=np.float32, order='C')
dmRM = np.ravel(dm)
dmRM_CPU = np.zeros((ndim, ndim), dtype=np.float32, order='C')
dmRM_CPU = np.ravel(dmRM_CPU)
print(dm)
print(dmRM)
print(dmRM_CPU)

# distance matrix, CPU reference version
# (fills both triangles so it matches the full matrix written by the GPU kernel)
for i in range(ndim):
    for j in range(i+1, ndim):
        dist = 0
        for k in range(nvdim):
            ab = XRM[i*nvdim + k] - XRM[j*nvdim + k]
            dist += ab*ab
        dmRM_CPU[i*ndim + j] = np.sqrt(dist)
        dmRM_CPU[j*ndim + i] = dmRM_CPU[i*ndim + j]   # the distance matrix is symmetric
print(dmRM_CPU)

kerneltemplate = SourceModule("""
#include <math.h>
__global__ void
gpuDM (float *dm, float *x, int n_dim, int n_vdim){

    extern __shared__ float a[];   // one row of x, staged in shared memory for the whole block
    float ab, dist;
    int tidx;

    // each thread handles one point, i.e. one row of the distance matrix
    int idx = threadIdx.x + blockDim.x*blockIdx.x;

    for (int r=0; r < n_dim; r++) {
        dist = 0;
        // cooperatively load row r of x into shared memory
        for (int i=0; i<=n_vdim/blockDim.x; i++){
            tidx = i*blockDim.x+threadIdx.x;
            if (tidx < n_vdim)
                a[tidx] = x[r*n_vdim + tidx];
        }
        __syncthreads();

        // squared distance between point idx and point r
        for(int i=0; i < n_vdim && idx < n_dim; i++){
            ab = a[i] - x[idx*n_vdim + i];
            dist += ab*ab;
        }

        if ( idx < n_dim ){
            dm[idx*n_dim+r] = sqrtf(dist);
        }
        __syncthreads();   // wait before the next row overwrites a[]
    }
}
""")

dm_gpu = kerneltemplate.get_function("gpuDM")
XRM_gpu = gpuarray.to_gpu(XRM)
dmRM_gpu = gpuarray.empty(shape=dmRM.shape, dtype=np.float32)
# shared memory holds one row of X: nvdim floats of 4 bytes each
smem_size = nvdim*4
dm_gpu(dmRM_gpu, XRM_gpu, np.int32(ndim), np.int32(nvdim), block=(512,1,1), grid=(1,1,1), shared=smem_size)

print("Does our kernel work correctly? : {}".format(np.allclose(dmRM_gpu.get(), dmRM_CPU)))

cuNumerics
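
cuNumeric (from NVIDIA's Legate project) is intended as a drop-in replacement for NumPy that transparently distributes array operations across the available GPUs. A minimal sketch, assuming cuNumeric is installed and the script is started through the legate launcher; the array contents are illustrative:

import cunumeric as np   # assumed drop-in replacement for the numpy module

# plain NumPy-style code; Legate decides where (and on how many GPUs) it runs
x = np.arange(12, dtype=np.float32).reshape(4, 3)   # four 3-dimensional points
diff = x[:, None, :] - x[None, :, :]                # broadcasted pairwise differences
dm = np.sqrt((diff * diff).sum(axis=-1))            # 4x4 distance matrix, as in the PyCUDA example
print(dm)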

Talking to a supercomputer

SLURM

To talk to a supercomputer we use SLURM. As a test, please run the following on your SLURM system; it requests an interactive shell on one node with 4 tasks, 10 GB of memory, and one GPU on the all-gpu partition for up to 24 hours:

srun -N 1 --ntasks=4 --mem=10G -p all-gpu --gres=gpu:1 --time=24:00:00 --pty bash
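
For non-interactive work, the same resources can be requested in a batch script. A minimal sketch, reusing the all-gpu partition from the command above; the job name, the script name run_gpu.sh, and the Python file it calls are illustrative assumptions:

#!/bin/bash
#SBATCH --job-name=pycuda-test      # assumed job name
#SBATCH --nodes=1
#SBATCH --ntasks=4
#SBATCH --mem=10G
#SBATCH --partition=all-gpu
#SBATCH --gres=gpu:1
#SBATCH --time=24:00:00

# run the distance-matrix example from this wiki inside the allocation
srun python distance_function.py   # assumed file name

Submit the script with sbatch run_gpu.sh and check its status with squeue -u $USER.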