Skip to content

Commit 8a52dba

Browse files
Fix bugs to switch on O3 flag for GPU code
1 parent 7c2d4c8 commit 8a52dba

File tree

8 files changed

+94
-25
lines changed

8 files changed

+94
-25
lines changed

CMakeLists.txt

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,6 @@ if(USE_GPU)
9595
else()
9696
target_compile_options(ddcMD-gpu PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-gencode arch=compute_70,code=sm_70 -gencode=arch=compute_60,code=sm_60>)
9797
endif()
98-
# GPU code is debug only -TODO
9998
target_compile_options(ddcMD-gpu PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-g>)
10099
#link_directories("/usr/tce/packages/cuda/cuda-10.1.243/lib64")
101100
#target_compile_options(ddcMD-gpu PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-gencode arch=compute_70,code=sm_70>)

arch/sierra.mk

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,7 @@ FFLAGS_DEBUG = -g -DLINUX_DEBUG -pg -cpp -w
6767
FFLAGS_PROF = -O3 -g
6868
FFLAGS_OMP = $(FFLAGS_OPT)
6969

70-
NVCCFLAGS_OPT = $(NVCCFLAGS_BASE) -g
70+
NVCCFLAGS_OPT = $(NVCCFLAGS_BASE) -O3
7171
NVCCFLAGS_DEBUG = $(NVCCFLAGS_BASE) -g -G
7272
NVCCFLAGS_PROF = $(NVCCFLAGS_BASE) -g -pg
7373

arch/summit.mk

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,7 @@ FFLAGS_DEBUG = -g -DLINUX_DEBUG -pg -cpp -w
6969
FFLAGS_PROF = -O3 -g
7070
FFLAGS_OMP = $(FFLAGS_OPT)
7171

72-
NVCCFLAGS_OPT = $(NVCCFLAGS_BASE) -g
72+
NVCCFLAGS_OPT = $(NVCCFLAGS_BASE) -O3
7373
NVCCFLAGS_DEBUG = $(NVCCFLAGS_BASE) -g -G
7474
NVCCFLAGS_PROF = $(NVCCFLAGS_BASE) -g -pg
7575

arch/toss3.mk

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ ifeq ($(USE_GPU), 1)
5050

5151
NVCC = $(CUDAHOME)/bin/nvcc -std=c++11
5252

53-
NVCCFLAGS_OPT = $(NVCCFLAGS_BASE) -g
53+
NVCCFLAGS_OPT = $(NVCCFLAGS_BASE) -O3
5454
NVCCFLAGS_DEBUG = $(NVCCFLAGS_BASE) -g -G
5555
NVCCFLAGS_PROF = $(NVCCFLAGS_BASE) -g -pg
5656

src/gpuMemUtils.cu

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -159,6 +159,7 @@ void allocGPUState(COLLECTION *collection, int nIon)
159159
gnlist->binCountsg = NULL;
160160
gnlist->binCountsg2= NULL;
161161
gnlist->binHeadsg = NULL;
162+
gnlist->binHeadsgSave = gnlist->binHeadsg;
162163
gnlist->partial_sumsg = NULL;
163164
nvtxRangePop();
164165

@@ -208,15 +209,16 @@ void allocGPUnbins(GPUNLIST *gnlist, const int nBinsTot, const int blocksize)
208209
if(gnlist->binCountsg2 != NULL){
209210
CUDA_SAFE_CALL(cudaFree(gnlist->binCountsg2);)
210211
}
211-
if(gnlist->binHeadsg != NULL){
212-
CUDA_SAFE_CALL(cudaFree(gnlist->binHeadsg);)
212+
if(gnlist->binHeadsgSave != NULL){
213+
CUDA_SAFE_CALL(cudaFree(gnlist->binHeadsgSave);)
213214
}
214215

215216
// allocate 2X memory for bins/partial sums
216217
int nBinsTot2=nBinsTot*2;
217218
gpu_allocator(gnlist->binCountsg, nBinsTot2);
218219
gpu_allocator(gnlist->binCountsg2, nBinsTot2);
219220
gpu_allocator(gnlist->binHeadsg, nBinsTot2);
221+
gnlist->binHeadsgSave = gnlist->binHeadsg;
220222

221223
accParms->totbins=nBinsTot2;
222224

@@ -314,6 +316,7 @@ void deallocGPU(SYSTEM *sys, int n)
314316
CUDA_SAFE_CALL(cudaFree((gnlist->binCountsg)));
315317
cudaFree((gnlist->binCountsg2));
316318
cudaFree((gnlist->binHeadsg));
319+
gnlist->binHeadsg=NULL;
317320
cudaFree((gnlist->listIdsg));
318321
CUDA_SAFE_CALL(cudaFree((gnlist->partial_sumsg)));
319322

src/gpu_allocator.hpp

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,49 @@
44
#include <cuda_runtime_api.h>
55
#include <cuda.h>
66

7+
#include <execinfo.h>
8+
#include <string.h>
9+
#include <errno.h>
10+
#include <unistd.h>
11+
#include <stdlib.h>
12+
13+
/*
 * Write all `len` bytes of `buf` to file descriptor `fd`.
 *
 * Loops over short writes until everything has been written. A write()
 * interrupted by a signal (EINTR) is retried; any other error ends the
 * loop silently, since this is best-effort diagnostic output.
 */
static void full_write(int fd, const char *buf, size_t len)
{
        while (len > 0) {
                ssize_t ret = write(fd, buf, len);

                if (ret == -1) {
                        /* EINTR: nothing was written, so retry with the same
                         * buf/len. Must not fall through: adding (size_t)-1
                         * would corrupt both the pointer and the length. */
                        if (errno == EINTR)
                                continue;
                        break;
                }

                buf += (size_t) ret;
                len -= (size_t) ret;
        }
}
25+
26+
inline void print_backtrace(void)
27+
{
28+
static const char start[] = "BACKTRACE ------------\n";
29+
static const char end[] = "----------------------\n";
30+
31+
void *bt[1024];
32+
int bt_size;
33+
char **bt_syms;
34+
int i;
35+
36+
bt_size = backtrace(bt, 1024);
37+
bt_syms = backtrace_symbols(bt, bt_size);
38+
full_write(STDERR_FILENO, start, strlen(start));
39+
for (i = 1; i < bt_size; i++) {
40+
size_t len = strlen(bt_syms[i]);
41+
full_write(STDERR_FILENO, bt_syms[i], len);
42+
full_write(STDERR_FILENO, "\n", 1);
43+
}
44+
full_write(STDERR_FILENO, end, strlen(end));
45+
free(bt_syms);
46+
}
47+
748
template<typename T> void gpu_allocator(T &ptrref, long long int n) {
49+
//print_backtrace();
850
CUDA_SAFE_CALL( cudaMalloc((void **) &ptrref, sizeof(*ptrref) * n);)
951
}
1052

src/gpunlist.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ typedef struct gpunlist_st {
2525
double *scratch, *scratch1, *scratch2, *scratch3, *scratch4; //scratch space
2626
double *partialCOMx, *partialCOMy, *partialCOMz;
2727
double *virCorx, *virCory, *virCorz;
28-
int *r_backg, *r_backbg, *binHeadsg, *binCountsg, *binCountsg2, *listIdsg, *partial_sumsg;
28+
int *r_backg, *r_backbg, *binHeadsg, *binHeadsgSave, *binCountsg, *binCountsg2, *listIdsg, *partial_sumsg;
2929
double *minsg, *lensg;
3030
int *nbinsg, *nbrIds, *nbrIdsx, *nbrIdsy, *nbrIdsz;
3131
int numNbrs, nBinsTot;

src/pairProcessGPU.cu

Lines changed: 43 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
#include <thrust/execution_policy.h>
44
#include <thrust/pair.h>
55
#include <nvToolsExt.h>
6+
#include "ddcMalloc.h"
67
#include "pairProcessGPU.h"
78
#include "nlistGPU.h"
89
#include "cudaUtils.h"
@@ -32,7 +33,7 @@
3233
#define TPP_EVAL 1
3334
#define TPP_EVALp 1
3435
#define CHECK_1(x) //first layer of Debugging, summary of binning data
35-
#define CHECK_2(x) //second layer of debugging, print all binning data
36+
#define CHECK_2(x) //second layer of debugging, print all binning data
3637
#define CHECK_3(x) //not used rn
3738
#define CHECK(x) //final layer of debugging, print all particle data
3839
#define CHECKB (x)
@@ -1153,7 +1154,7 @@ void binParticlesGPU(SYSTEM * sys, double rcut)
11531154
double *minsg, *lensg;
11541155
minsg = gnlist->minsg;
11551156
lensg = gnlist->lensg;
1156-
1157+
11571158
//determine number of bins necessary based on cutoff
11581159
CHECK_1(
11591160
printf("box len %f %f %f \n", lens[0], lens[1], lens[2]);
@@ -1166,7 +1167,6 @@ void binParticlesGPU(SYSTEM * sys, double rcut)
11661167
//binL[0] = lens[0] / nbins[0];
11671168
//binL[1] = lens[1] / nbins[1];
11681169
//binL[2] = lens[2] / nbins[2];
1169-
11701170
int nBinsTot = nbins[0] * nbins[1] * nbins[2];
11711171
gnlist->nBinsTot = nBinsTot;
11721172
//int tBins = nbins[0]*nbins[1]*nbins[2];
@@ -1267,44 +1267,63 @@ void binParticlesGPU(SYSTEM * sys, double rcut)
12671267
//27 if 1d
12681268
// 9 if 2d
12691269
// 3 if 1d
1270-
int* x_nbr;
1271-
int* y_nbr;
1272-
int* z_nbr;
1270+
int* x_nbr=NULL;
1271+
int* y_nbr=NULL;
1272+
int* z_nbr=NULL;
12731273
int num_x_nbr = 3;
12741274
int num_y_nbr = 3;
12751275
int num_z_nbr = 3;
12761276
if (nbins[0] > 1)
12771277
{
1278-
int x_nbr1 [3] = {-1, 0, 1};
1279-
x_nbr = x_nbr1;
1278+
//int x_nbr1 [3] = {-1, 0, 1};
1279+
//x_nbr = x_nbr1;
1280+
x_nbr=(int *)ddcMalloc(num_x_nbr*sizeof(int));
1281+
x_nbr[0]=-1;
1282+
x_nbr[1]=0;
1283+
x_nbr[2]=1;
12801284
}
12811285
else
12821286
{
1283-
int x_nbr1 [1] = {0};
1284-
x_nbr = x_nbr1;
1287+
//int x_nbr1 [1] = {0};
1288+
//x_nbr = x_nbr1;
12851289
num_x_nbr = 1;
1290+
x_nbr=(int *)ddcMalloc(num_x_nbr*sizeof(int));
1291+
x_nbr[0]=0;
12861292
}
12871293
if (nbins[1] > 1)
12881294
{
1289-
int y_nbr1 [3] = {-1, 0, 1}; //{-nbins[0],0,nbins[0]};
1290-
y_nbr = y_nbr1;
1295+
//int y_nbr1 [3] = {-1, 0, 1}; //{-nbins[0],0,nbins[0]};
1296+
//y_nbr = y_nbr1;
1297+
y_nbr=(int *)ddcMalloc(num_y_nbr*sizeof(int));
1298+
y_nbr[0]=-1;
1299+
y_nbr[1]=0;
1300+
y_nbr[2]=1;
1301+
12911302
}
12921303
else
12931304
{
1294-
int y_nbr1 [1] = {0};
1295-
y_nbr = y_nbr1;
1305+
//int y_nbr1 [1] = {0};
1306+
//y_nbr = y_nbr1;
12961307
num_y_nbr = 1;
1308+
y_nbr=(int *)ddcMalloc(num_y_nbr*sizeof(int));
1309+
y_nbr[0]=0;
12971310
}
12981311
if (nbins[2] > 1)
12991312
{
1300-
int z_nbr1 [3] = {-1, 0, 1}; //{-nbins[0]*nbins[1],0,nbins[0]*nbins[1]};
1301-
z_nbr = z_nbr1;
1313+
//int z_nbr1 [3] = {-1, 0, 1}; //{-nbins[0]*nbins[1],0,nbins[0]*nbins[1]};
1314+
//z_nbr = z_nbr1;
1315+
z_nbr=(int *)ddcMalloc(num_z_nbr*sizeof(int));
1316+
z_nbr[0]=-1;
1317+
z_nbr[1]=0;
1318+
z_nbr[2]=1;
13021319
}
13031320
else
13041321
{
1305-
int z_nbr1 [1] = {0};
1306-
z_nbr = z_nbr1;
1322+
//int z_nbr1 [1] = {0};
1323+
//z_nbr = z_nbr1;
13071324
num_z_nbr = 1;
1325+
z_nbr=(int *)ddcMalloc(num_z_nbr*sizeof(int));
1326+
z_nbr[0]=0;
13081327
}
13091328

13101329
CHECK_1(
@@ -1351,6 +1370,12 @@ void binParticlesGPU(SYSTEM * sys, double rcut)
13511370
gpu_memcpy_host2device(gnlist->nbrIdsy, y_nbr, num_y_nbr);
13521371
gpu_memcpy_host2device(gnlist->nbrIdsz, z_nbr, num_z_nbr);
13531372
gpu_memcpy_host2device(gnlist->numNbrsxyz, num_nbrsxyz, 3);
1373+
1374+
1375+
// Free the memory of pointers
1376+
ddcFree(x_nbr);
1377+
ddcFree(y_nbr);
1378+
ddcFree(z_nbr);
13541379
}
13551380

13561381
//

0 commit comments

Comments
 (0)