Commit f426525

started separating the gpu backend operations
1 parent 95d0f05 commit f426525

12 files changed, +128 -139 lines changed


gpu_chemistry/src/gpuKernelEvaluator/Make/options

Lines changed: 2 additions & 1 deletion
@@ -7,4 +7,5 @@ EXE_INC = \
 #LIB_LIBS = -lcudart_static -lcudart
 LIB_LIBS += -L$(CUDA_LIBS) -lcudart
 
-include ../../hipcc
+#include ../../hipcc
+include ../../nvcc
gpu_chemistry/src/gpuKernelEvaluator/gpuKernelEvaluator/gpuKernelEvaluator.cu

Lines changed: 8 additions & 49 deletions
@@ -4,7 +4,7 @@
 
 #include "cuda_host_dev.H"
 
-#include "error_handling.H"
+#include "for_each_index.H"
 #include "host_device_vectors.H"
 #include <thrust/execution_policy.h>
 #include <thrust/extrema.h> //min_element
@@ -30,36 +30,10 @@ GpuKernelEvaluator::GpuKernelEvaluator(
     , solver_(make_gpuODESolver(system_, odeInputs))
     , inputs_(odeInputs)
     , memory_(nCells, nSpecie) {
-    /*
-    int num;
-    CHECK_CUDA_ERROR(cudaGetDeviceCount(&num)); // number of CUDA
-    devices
-
-    int dev = (nCells % num);
-    //cudaDeviceProp::canMapHostMemory prop;
-    //CHECK_CUDA_ERROR(cudaChooseDevice(&dev, &prop));
-
-
-    CHECK_CUDA_ERROR(cudaSetDevice(dev));
-    std::cout << "Using device: " << dev << std::endl;
-    */
-
-    /*
-    for (int i = 0; i < num; i++) {
-        // Query the device properties.
-        cudaDeviceProp prop;
-        cudaGetDeviceProperties(&prop, i);
-        std::cout << "Device id: " << i << std::endl;
-        std::cout << "Device name: " << prop.name << std::endl;
-    }
-    */
+
 }
 
-__global__ void cuda_kernel(gLabel nCells, singleCellSolver op) {
 
-    int celli = blockIdx.x * blockDim.x + threadIdx.x;
-    if (celli < nCells) { op(celli); }
-}
 /*
 static inline auto parseTimes(const char* label,
                               const std::vector<gpuBuffer>& b) {
@@ -115,33 +89,18 @@ GpuKernelEvaluator::computeYNew(
     singleCellSolver op(
         deltaT, nSpecie_, ddeltaTChem, dYvf, buffer_span, solver_);
 
+    for_each_index(op, nCells);
+
+    /*
     gLabel NTHREADS = 32;
     gLabel NBLOCKS = (nCells + NTHREADS - 1) / NTHREADS;
     cuda_kernel<<<NBLOCKS, NTHREADS>>>(nCells, op);
 
     CHECK_LAST_CUDA_ERROR();
-    CHECK_CUDA_ERROR(cudaDeviceSynchronize());
-
-    ////
-    /*
-    auto bhost = toStdVector(buffers);
-
-    parseTimes("adaptive", bhost);
-    parseTimes("Jacobian", bhost);
-    parseTimes("step1", bhost);
-    parseTimes("step2", bhost);
-    parseTimes("step3", bhost);
-
-    */
-
-    ////
-
-    /*
-    thrust::for_each(thrust::device,
-                     thrust::make_counting_iterator(0),
-                     thrust::make_counting_iterator(nCells),
-                     op);
+    gpuErrorCheck(cudaDeviceSynchronize());
     */
+
     return std::make_pair(toStdVector(dYvf_arr),
                           toStdVector(ddeltaTChem_arr));
 }
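
Note: the call site no longer hard-codes a CUDA launch; the same functor now goes through the generic helper introduced in for_each_index.H. A minimal sketch of the equivalence, for any callable with operator()(gLabel):

    // Old, backend-specific dispatch: grid math and kernel at the call site.
    // gLabel NTHREADS = 32;
    // gLabel NBLOCKS  = (nCells + NTHREADS - 1) / NTHREADS;
    // cuda_kernel<<<NBLOCKS, NTHREADS>>>(nCells, op);

    // New, backend-neutral dispatch: launch details live in for_each_index.H.
    for_each_index(op, nCells);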

gpu_chemistry/src/gpuKernelEvaluator/gpuKernelEvaluator/gpuMemoryResource.cu

Lines changed: 8 additions & 22 deletions
@@ -1,8 +1,6 @@
 #include "gpuMemoryResource.H"
-#include <thrust/device_malloc_allocator.h>
-
-using labelAllocator = thrust::device_malloc_allocator<gLabel>;
-using scalarAllocator = thrust::device_malloc_allocator<gScalar>;
+#include "device_allocate.H"
+#include "device_free.H"
 
 namespace FoamGpu {
 
@@ -15,39 +13,27 @@ gpuMemoryResource::~gpuMemoryResource() { this->deallocate(); }
 
 void gpuMemoryResource::allocate() {
 
-    labelAllocator lAllocator;
-    scalarAllocator sAllocator;
-
     for (gLabel i = 0; i < N_LABEL_ARRAYS; ++i) {
-        labelData_[i] =
-            make_raw_pointer(lAllocator.allocate(labelArrayLength()));
+        labelData_[i] = device_allocate<gLabel>(labelArrayLength());
     }
     for (gLabel i = 0; i < N_SCALAR_ARRAYS; ++i) {
-        scalarData_[i] =
-            make_raw_pointer(sAllocator.allocate(scalarArrayLength()));
+        scalarData_[i] = device_allocate<gScalar>(scalarArrayLength());
     }
     for (gLabel i = 0; i < N_TWOD_SCALAR_ARRAYS; ++i) {
-        twodScalarData_[i] =
-            make_raw_pointer(sAllocator.allocate(twodScalarArrayLength()));
+        twodScalarData_[i] = device_allocate<gScalar>(twodScalarArrayLength());
    }
 }
 
 void gpuMemoryResource::deallocate() {
 
-    labelAllocator lAllocator;
-    scalarAllocator sAllocator;
-
     for (gLabel i = 0; i < N_LABEL_ARRAYS; ++i) {
-        auto ptr = make_device_pointer(labelData_[i]);
-        lAllocator.deallocate(ptr, labelArrayLength());
+        device_free(labelData_[i]);
     }
     for (gLabel i = 0; i < N_SCALAR_ARRAYS; ++i) {
-        auto ptr = make_device_pointer(scalarData_[i]);
-        sAllocator.deallocate(ptr, scalarArrayLength());
+        device_free(scalarData_[i]);
     }
     for (gLabel i = 0; i < N_TWOD_SCALAR_ARRAYS; ++i) {
-        auto ptr = make_device_pointer(twodScalarData_[i]);
-        sAllocator.deallocate(ptr, twodScalarArrayLength());
+        device_free(twodScalarData_[i]);
     }
 }

gpu_chemistry/src/gpuKernelEvaluator/gpuKernelEvaluator/thermosAndReactions.H

Lines changed: 0 additions & 1 deletion
@@ -32,7 +32,6 @@ private:
     gpuReaction* reactions_;
 
 
-    void allocate();
     void deallocate();
 
 
gpu_chemistry/src/gpuKernelEvaluator/gpuKernelEvaluator/thermosAndReactions.cu

Lines changed: 15 additions & 20 deletions
@@ -1,20 +1,27 @@
+#include "thermosAndReactions.H"
 
 #include "error_handling.H"
-#include "thermosAndReactions.H"
+#include "device_allocate.H"
+#include "device_free.H"
+#include "host_device_transfers.H"
 
 namespace FoamGpu {
 
 template <class T>
 static inline T* allocateAndTransfer(const std::vector<T>& t) {
-    T* ptr;
-    const auto size = t.size();
-    const auto bytesize = size * sizeof(T);
+
+    T* ptr = device_allocate<T>(t.size());
+    const auto bytesize = t.size() * sizeof(T);
 
-    CHECK_CUDA_ERROR(cudaMalloc((void**)&ptr, bytesize));
-    CHECK_CUDA_ERROR(
+    gpuErrorCheck(
         cudaMemcpy(ptr, t.data(), bytesize, cudaMemcpyHostToDevice));
 
     return ptr;
+    /*
+    T* ptr = device_allocate<T>(t.size());
+    host_to_device(t.begin(), t.end(), ptr);
+    return ptr;
+    */
 }
 
 thermosAndReactions::thermosAndReactions
@@ -33,23 +40,11 @@ thermosAndReactions::~thermosAndReactions()
     this->deallocate();
 }
 
-void thermosAndReactions::allocate()
-{
-    CHECK_CUDA_ERROR
-    (
-        cudaMalloc((void**)&thermos_,nThermos_*sizeof(gpuThermo))
-    );
-
-    CHECK_CUDA_ERROR
-    (
-        cudaMalloc((void**)&reactions_,nReactions_*sizeof(gpuReaction))
-    );
 
-}
 void thermosAndReactions::deallocate()
 {
-    CHECK_CUDA_ERROR(cudaFree(thermos_));
-    CHECK_CUDA_ERROR(cudaFree(reactions_));
+    device_free(thermos_);
+    device_free(reactions_);
 }
 
 
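Note: allocateAndTransfer is the upload path for both constant tables. A hypothetical usage sketch (buildThermos is an illustrative stand-in for however the host vector is produced; the cudaMemcpy is only valid because gpuThermo is copied bitwise):

    std::vector<gpuThermo> h_thermos = buildThermos();      // hypothetical helper
    gpuThermo* d_thermos = allocateAndTransfer(h_thermos);  // device-side copy
    // ... kernels read d_thermos ...
    device_free(d_thermos);                                 // pairs with device_allocate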
gpu_chemistry/unittest/testHelpers/test_utilities.H

Lines changed: 7 additions & 7 deletions
@@ -53,15 +53,15 @@ static inline gScalar eval(T t)
 {
 
     gScalar *d_result;
-    CHECK_CUDA_ERROR(cudaMalloc(&d_result, sizeof(gScalar)));
+    gpuErrorCheck(cudaMalloc(&d_result, sizeof(gScalar)));
     on_device<<<1,1>>>(t, d_result);
-    CHECK_LAST_CUDA_ERROR();
-    cudaDeviceSynchronize();
+    gpuErrorCheck(cudaGetLastError());
+    gpuErrorCheck(cudaDeviceSynchronize());
     gScalar h_result;
-    CHECK_CUDA_ERROR(cudaMemcpy(&h_result, d_result, sizeof(gScalar), cudaMemcpyDeviceToHost));
-    cudaDeviceSynchronize();
-    CHECK_CUDA_ERROR(cudaFree(d_result));
-    cudaDeviceSynchronize();
+    gpuErrorCheck(cudaMemcpy(&h_result, d_result, sizeof(gScalar), cudaMemcpyDeviceToHost));
+    gpuErrorCheck(cudaDeviceSynchronize());
+    gpuErrorCheck(cudaFree(d_result));
+    gpuErrorCheck(cudaDeviceSynchronize());
     return h_result;
 
 }
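
Note: assuming on_device is a <<<1,1>>> kernel that evaluates t() on the device and writes the result through its second argument, eval lets host-side tests run device code. A sketch (the device lambda needs nvcc's --extended-lambda):

    // Evaluate a device-side expression on the GPU and check it on the host.
    auto expr = [] __device__ () { return gScalar(21.0) * gScalar(2.0); };
    gScalar r = eval(expr);   // runs on the GPU, result copied back
    assert(r == gScalar(42.0));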

gpu_utils/common/check_ptr.H

Lines changed: 0 additions & 15 deletions
@@ -1,21 +1,6 @@
 #pragma once
-//#include <string_view>
-//#include <string.h>
 #include <stdio.h>
-//#include "cuda_host_dev.H"
 
 
 #define check_ptr(val, name) if (!val) {printf("null ptr %s", name); assert(0);}
 
-/*
-template<class T>
-static inline CUDA_HOSTDEV void check_ptr(T ptr, std::string_view name)
-{
-    //Note string view may not be null terminated and this is dangerous
-    if (!ptr)
-    {
-        printf("Bad alloc for: %s \n", name.data());
-    }
-
-}
-*/

gpu_utils/common/device_allocate.H

Lines changed: 12 additions & 0 deletions
@@ -0,0 +1,12 @@
+#pragma once
+
+#include "error_handling.H"
+
+template<class T>
+static inline T* device_allocate(size_t length){
+
+    T* ptr;
+    const auto bytesize = length * sizeof(T);
+    gpuErrorCheck(cudaMalloc((void**)&ptr, bytesize));
+    return ptr;
+}
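
Note: paired with device_free below, this gives a minimal typed malloc/free for device memory. Usage sketch:

    gScalar* buf = device_allocate<gScalar>(n);  // n * sizeof(gScalar) bytes on the device
    // ... kernels read/write buf ...
    device_free(buf);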

gpu_utils/common/device_free.H

Lines changed: 8 additions & 0 deletions
@@ -0,0 +1,8 @@
+#pragma once
+
+#include "error_handling.H"
+
+template<class T>
+static inline void device_free(T* ptr){
+    gpuErrorCheck(cudaFree(ptr));
+}

gpu_utils/common/error_handling.H

Lines changed: 11 additions & 24 deletions
@@ -9,35 +9,22 @@
 #include <assert.h>
 #include "cuda_runtime.h"
 
+#define gpuErrorCheck(call)                                                  \
+do{                                                                          \
+    cudaError_t gpuErr = call;                                               \
+    if(cudaSuccess != gpuErr){                                               \
+        printf("GPU Error - %s:%d: '%s'\n", __FILE__, __LINE__, cudaGetErrorString(gpuErr)); \
+        exit(1);                                                             \
+    }                                                                        \
+}while(0)
+
+
 
-#define CHECK_CUDA_ERROR(val) check((val), #val, __FILE__, __LINE__)
-template <typename T>
-static CUDA_HOSTDEV void check(T err, const char* const func, const char* const file,
-                               const int line, bool abort=true)
-{
-    if (err != cudaSuccess)
-    {
-        printf("CUDA Runtime error at: %s %s %s %d\n", cudaGetErrorString(err), file, func, line);
-        if (abort) assert(0);
-    }
-}
-
-#define CHECK_LAST_CUDA_ERROR() checkLast(__FILE__, __LINE__)
-static CUDA_HOSTDEV void checkLast(const char* const file, const int line, bool abort=true)
-{
-    cudaError_t err{cudaGetLastError()};
-    if (err != cudaSuccess)
-    {
-        printf("CUDA Runtime error at: %s %s %d\n", cudaGetErrorString(err), file, line);
-        if (abort) assert(0);
-    }
-}
 
 
 #else
 
-#define CHECK_CUDA_ERROR(val)
-#define CHECK_LAST_CUDA_ERROR()
+#define gpuErrorCheck(val)
 
 
 
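Note: unlike the old CHECK_CUDA_ERROR, which used assert (a no-op in NDEBUG builds), gpuErrorCheck prints file and line and exits unconditionally. It wraps any call returning cudaError_t; kernel launches return nothing, so they are checked afterwards via cudaGetLastError. Typical use:

    gpuErrorCheck(cudaMalloc((void**)&ptr, bytesize));  // checks the return code

    some_kernel<<<blocks, threads>>>(/*args*/);         // launches return no code
    gpuErrorCheck(cudaGetLastError());                  // catches launch errors
    gpuErrorCheck(cudaDeviceSynchronize());             // catches async errors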
gpu_utils/common/for_each_index.H

Lines changed: 35 additions & 0 deletions
@@ -0,0 +1,35 @@
+#pragma once
+
+#include "gpu_constants.H"
+#include "error_handling.H"
+
+namespace detail{
+
+
+
+template<class UnaryOperation>
+__global__ void cuda_backend(gLabel n, UnaryOperation op) {
+
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i < n) { op(i); }
+}
+
+}
+
+///
+///@brief Evaluates op(i) for all i in the range [0, n) in parallel.
+///
+///@param op A unary operation taking a gLabel index as a parameter.
+///@param n The maximum i index (non-inclusive).
+///
+template<class UnaryOperation>
+static inline void for_each_index(UnaryOperation op, gLabel n){
+
+    gLabel NTHREADS = 32;
+    gLabel NBLOCKS = (n + NTHREADS - 1) / NTHREADS;
+    detail::cuda_backend<<<NBLOCKS, NTHREADS>>>(n, op);
+
+    gpuErrorCheck(cudaGetLastError());
+    gpuErrorCheck(cudaDeviceSynchronize());
+
+}
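
Note: any callable with operator()(gLabel) works, including extended device lambdas. A SAXPY-style usage sketch, assuming x and y are device pointers and nvcc is invoked with --extended-lambda:

    // y[i] = a * x[i] + y[i] for every i in [0, n)
    const gScalar a = 2.0;
    for_each_index(
        [=] __device__ (gLabel i) { y[i] = a * x[i] + y[i]; },
        n);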
gpu_utils/common/host_device_transfers.H

Lines changed: 22 additions & 0 deletions

@@ -0,0 +1,22 @@
+#pragma once
+
+#include "error_handling.H"
+#include "thrust/copy.h"
+
+template<class InputIter, class OutputIter>
+static inline void host_to_device(InputIter h_begin, InputIter h_end, OutputIter d_begin){
+
+    auto length = std::distance(h_begin, h_end);
+    using T = typename std::iterator_traits<InputIter>::value_type;
+    using T2 = typename std::iterator_traits<OutputIter>::value_type;
+
+    static_assert(std::is_same_v<T, T2>, "Mismatching types in host_to_device");
+
+    auto bytesize = length * sizeof(T);
+    gpuErrorCheck(
+        cudaMemcpy(d_begin, &(*h_begin), bytesize, cudaMemcpyHostToDevice));
+
+
+}
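
Note: the host range must be contiguous (the cudaMemcpy starts from &(*h_begin)), which holds for std::vector. Combined with the other new helpers, this supports the commented-out alternative in allocateAndTransfer above; a round-trip sketch:

    std::vector<gScalar> h(100, 1.5);                 // host data
    gScalar* d = device_allocate<gScalar>(h.size());  // device buffer
    host_to_device(h.begin(), h.end(), d);            // checked cudaMemcpy
    // ... kernels consume d ...
    device_free(d);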
