Skip to content

Commit

Permalink
Dynamic threadsinblock
Browse files Browse the repository at this point in the history
  • Loading branch information
Kepins committed Nov 8, 2024
1 parent 104e67d commit 50b3578
Show file tree
Hide file tree
Showing 20 changed files with 52 additions and 38 deletions.
4 changes: 2 additions & 2 deletions cudampilib/apps/app/appkernel.cu
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ __global__ void appkernel(unsigned char *devPtr) {
devPtr[my_index] = devPtr[my_index] / 2;
}

extern "C" void launchkernelinstream(void *devPtr, cudaStream_t stream) {
extern "C" void launchkernelinstream(void *devPtr, int batchSize, cudaStream_t stream) {

dim3 blocksingrid(2);
dim3 threadsinblock(1024);
Expand All @@ -35,4 +35,4 @@ extern "C" void launchkernelinstream(void *devPtr, cudaStream_t stream) {
}
}

extern "C" void launchkernel(void *devPtr) { launchkernelinstream(devPtr, 0); }
extern "C" void launchkernel(void *devPtr, int batchSize) { launchkernelinstream(devPtr, batchSize, 0); }
2 changes: 2 additions & 0 deletions cudampilib/apps/collatz/app-streams-collatz.c
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OU
#define ENABLE_OUTPUT_LOGS
#include "utility.h"

struct __cudampi__arguments_type __cudampi__arguments;

long long VECTORSIZE = COLLATZ_VECTORSIZE;

double *vectora;
Expand Down
8 changes: 4 additions & 4 deletions cudampilib/apps/collatz/appkernelcollatz.cu
Original file line number Diff line number Diff line change
Expand Up @@ -57,17 +57,17 @@ __global__ void appkernel(void *devPtr)
devPtrc[my_index] = counter;
}

extern "C" void launchkernelinstream(void *devPtr, cudaStream_t stream)
extern "C" void launchkernelinstream(void *devPtr, int batchSize, cudaStream_t stream)
{
dim3 blocksingrid(COLLATZ_BLOCKS_IN_GRID);
dim3 threadsinblock(COLLATZ_THREADS_IN_BLOCK);
dim3 threadsinblock(batchSize / COLLATZ_BLOCKS_IN_GRID);

log_message(LOG_DEBUG, "Launichng GPU Kernel with %i blocks in grid and %i threads in block.", COLLATZ_BLOCKS_IN_GRID, COLLATZ_THREADS_IN_BLOCK);
log_message(LOG_DEBUG, "Launichng GPU Kernel with %i blocks in grid and %i threads in block.", COLLATZ_BLOCKS_IN_GRID, batchSize / COLLATZ_BLOCKS_IN_GRID);
appkernel<<<blocksingrid, threadsinblock, 0, stream>>>(devPtr);

if (cudaSuccess != cudaGetLastError()) {
log_message(LOG_ERROR, "Error during kernel launch in stream");
}
}

extern "C" void launchkernel(void *devPtr) { launchkernelinstream(devPtr, 0); }
extern "C" void launchkernel(void *devPtr, int batchSize) { launchkernelinstream(devPtr, batchSize, 0); }
1 change: 0 additions & 1 deletion cudampilib/apps/collatz/collatz_defines.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,5 @@

#define COLLATZ_VECTORSIZE 200000000
#define COLLATZ_BLOCKS_IN_GRID 100
#define COLLATZ_THREADS_IN_BLOCK (COLLATZ_BATCH_SIZE / COLLATZ_BLOCKS_IN_GRID)

#endif // COLLATZ_DEFINES_H
4 changes: 2 additions & 2 deletions cudampilib/apps/collatz/cpukernelcollatz.c
Original file line number Diff line number Diff line change
Expand Up @@ -56,9 +56,9 @@ void appkernel(void *devPtr, int num_elements, int num_threads)
}
}

extern void launchcpukernel(void *devPtr, int num_threads)
extern void launchcpukernel(void *devPtr, int batchSize, int num_threads)
{
int num_elements = COLLATZ_BATCH_SIZE;
int num_elements = batchSize;
log_message(LOG_DEBUG, "Launichng CPU Kernel with %i elements and %i threads.", num_elements, num_threads);
appkernel(devPtr, num_elements, num_threads);
}
2 changes: 2 additions & 0 deletions cudampilib/apps/patternsearch/app-streams-patternsearch.c
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OU
#define ENABLE_OUTPUT_LOGS
#include "utility.h"

struct __cudampi__arguments_type __cudampi__arguments;

long long VECTORSIZE = PATTERNSEARCH_VECTORSIZE;

char *vectora;
Expand Down
8 changes: 4 additions & 4 deletions cudampilib/apps/patternsearch/appkernelpatternsearch.cu
Original file line number Diff line number Diff line change
Expand Up @@ -45,17 +45,17 @@ __global__ void appkernel(void *devPtr) {
}
}

extern "C" void launchkernelinstream(void *devPtr, cudaStream_t stream) {
extern "C" void launchkernelinstream(void *devPtr, int batchSize, cudaStream_t stream) {

dim3 blocksingrid(PATTERNSEARCH_BLOCKS_IN_GRID);
dim3 threadsinblock(PATTERNSEARCH_THREADS_IN_BLOCK);
dim3 threadsinblock(batchSize / PATTERNSEARCH_BLOCKS_IN_GRID);

log_message(LOG_DEBUG, "Launichng GPU Kernel with %i blocks in grid and %i threads in block.", PATTERNSEARCH_BLOCKS_IN_GRID, PATTERNSEARCH_THREADS_IN_BLOCK);
log_message(LOG_DEBUG, "Launichng GPU Kernel with %i blocks in grid and %i threads in block.", PATTERNSEARCH_BLOCKS_IN_GRID, batchSize / PATTERNSEARCH_BLOCKS_IN_GRID);
appkernel<<<blocksingrid, threadsinblock, 0, stream>>>(devPtr);

if (cudaSuccess != cudaGetLastError()) {
log_message(LOG_ERROR, "Error during kernel launch in stream");
}
}

extern "C" void launchkernel(void *devPtr) { launchkernelinstream(devPtr, 0); }
extern "C" void launchkernel(void *devPtr, int batchSize) { launchkernelinstream(devPtr, batchSize, 0); }
4 changes: 2 additions & 2 deletions cudampilib/apps/patternsearch/cpukernelpatternsearch.c
Original file line number Diff line number Diff line change
Expand Up @@ -49,9 +49,9 @@ void appkernel(void *devPtr, int num_elements, int num_threads)
}
}

extern void launchcpukernel(void *devPtr,int num_threads)
extern void launchcpukernel(void *devPtr, int batchSize, int num_threads)
{
int num_elements = PATTERNSEARCH_BATCH_SIZE;
int num_elements = batchSize;
log_message(LOG_DEBUG, "Launichng CPU Kernel with %i elements and %i threads.", num_elements, num_threads);
appkernel(devPtr, num_elements, num_threads);
}
1 change: 0 additions & 1 deletion cudampilib/apps/patternsearch/patternsearch_defines.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@

#define PATTERNSEARCH_VECTORSIZE 400000000
#define PATTERNSEARCH_BLOCKS_IN_GRID 100
#define PATTERNSEARCH_THREADS_IN_BLOCK (PATTERNSEARCH_BATCH_SIZE / PATTERNSEARCH_BLOCKS_IN_GRID)
#define PATTERNLENGTH 400
#define PATTERNCOUNT 400

Expand Down
2 changes: 2 additions & 0 deletions cudampilib/apps/vecadd/app-streams-vecadd.c
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OU
#define ENABLE_OUTPUT_LOGS
#include "utility.h"

struct __cudampi__arguments_type __cudampi__arguments;

long long VECTORSIZE = VECADD_VECTOR_SIZE;

double *vectora;
Expand Down
8 changes: 4 additions & 4 deletions cudampilib/apps/vecadd/appkernelvecadd.cu
Original file line number Diff line number Diff line change
Expand Up @@ -33,17 +33,17 @@ __global__ void appkernel(void *devPtr)
devPtrc[my_index] = devPtra[my_index] / 2 + devPtrb[my_index] / 3;
}

extern "C" void launchkernelinstream(void *devPtr, cudaStream_t stream)
extern "C" void launchkernelinstream(void *devPtr, int batchSize, cudaStream_t stream)
{
dim3 blocksingrid(VECADD_BLOCKS_IN_GRID);
dim3 threadsinblock(VECADD_THREADS_IN_BLOCK);
dim3 threadsinblock(batchSize / VECADD_BLOCKS_IN_GRID);

log_message(LOG_DEBUG, "Launichng GPU Kernel with %i blocks in grid and %i threads in block.", VECADD_BLOCKS_IN_GRID, VECADD_THREADS_IN_BLOCK);
log_message(LOG_DEBUG, "Launichng GPU Kernel with %i blocks in grid and %i threads in block.", VECADD_BLOCKS_IN_GRID, batchSize / VECADD_BLOCKS_IN_GRID);
appkernel<<<blocksingrid, threadsinblock, 0, stream>>>(devPtr);

if (cudaSuccess != cudaGetLastError()) {
log_message(LOG_ERROR, "Error during kernel launch in stream");
}
}

extern "C" void launchkernel(void *devPtr) { launchkernelinstream(devPtr, 0); }
extern "C" void launchkernel(void *devPtr, int batchSize) { launchkernelinstream(devPtr, batchSize, 0); }
4 changes: 2 additions & 2 deletions cudampilib/apps/vecadd/cpukernelvecadd.c
Original file line number Diff line number Diff line change
Expand Up @@ -32,9 +32,9 @@ void appkernel(void *devPtr, int num_elements, int num_threads)
}
}

extern void launchcpukernel(void *devPtr, int num_threads)
extern void launchcpukernel(void *devPtr, int batchSize, int num_threads)
{
int num_elements = VECADD_BATCH_SIZE;
int num_elements = batchSize;
log_message(LOG_DEBUG, "Launichng CPU Kernel with %i elements and %i threads.", num_elements, num_threads);
appkernel(devPtr, num_elements, num_threads);
}
3 changes: 1 addition & 2 deletions cudampilib/apps/vecadd/vecadd_defines.h
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
#ifndef VECADD_DEFINES_H
#define VECADD_DEFINES_H

#define VECADD_BLOCKS_IN_GRID 100
#define VECADD_THREADS_IN_BLOCK (VECADD_BATCH_SIZE / VECADD_BLOCKS_IN_GRID)
#define VECADD_VECTOR_SIZE 80000000
#define VECADD_BLOCKS_IN_GRID 100

#endif // VECADD_DEFINES_H
3 changes: 3 additions & 0 deletions cudampilib/apps/vecmaxdiv/app-streams-vecmaxdiv.c
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,9 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OU
#define ENABLE_OUTPUT_LOGS
#include "utility.h"


struct __cudampi__arguments_type __cudampi__arguments;

long long VECTORSIZE = VECMAXDIV_VECTORSIZE;

double *vectora;
Expand Down
8 changes: 4 additions & 4 deletions cudampilib/apps/vecmaxdiv/appkernelvecmaxdiv.cu
Original file line number Diff line number Diff line change
Expand Up @@ -55,17 +55,17 @@ __global__ void appkernel(void *devPtr) {
devPtrc[my_index] = result;
}

extern "C" void launchkernelinstream(void *devPtr, cudaStream_t stream) {
extern "C" void launchkernelinstream(void *devPtr, int batchSize, cudaStream_t stream) {

dim3 blocksingrid(VECMAXDIV_BLOCKS_IN_GRID);
dim3 threadsinblock(VECMAXDIV_THREADS_IN_BLOCK);
dim3 threadsinblock(batchSize / VECMAXDIV_BLOCKS_IN_GRID);

log_message(LOG_DEBUG, "Launichng GPU Kernel with %i blocks in grid and %i threads in block.", VECMAXDIV_BLOCKS_IN_GRID, VECMAXDIV_THREADS_IN_BLOCK);
log_message(LOG_DEBUG, "Launichng GPU Kernel with %i blocks in grid and %i threads in block.", VECMAXDIV_BLOCKS_IN_GRID, batchSize / VECMAXDIV_BLOCKS_IN_GRID);
appkernel<<<blocksingrid, threadsinblock, 0, stream>>>(devPtr);

if (cudaSuccess != cudaGetLastError()) {
log_message(LOG_ERROR, "Error during kernel launch in stream");
}
}

extern "C" void launchkernel(void *devPtr) { launchkernelinstream(devPtr, 0); }
extern "C" void launchkernel(void *devPtr, int batchSize) { launchkernelinstream(devPtr, batchSize, 0); }
4 changes: 2 additions & 2 deletions cudampilib/apps/vecmaxdiv/cpukernelvecmaxdiv.c
Original file line number Diff line number Diff line change
Expand Up @@ -59,9 +59,9 @@ for (long my_index = 0 ; my_index < num_elements; my_index++)
}
}

extern void launchcpukernel(void *devPtr, int num_threads)
extern void launchcpukernel(void *devPtr, int batchSize, int num_threads)
{
int num_elements = VECMAXDIV_BATCH_SIZE;
int num_elements = batchSize;
log_message(LOG_DEBUG, "Launichng CPU Kernel with %i elements and %i threads.", num_elements, num_threads);
appkernel(devPtr, num_elements, num_threads);
}
Expand Down
1 change: 0 additions & 1 deletion cudampilib/apps/vecmaxdiv/vecmaxdiv_defines.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,5 @@

#define VECMAXDIV_VECTORSIZE 200000000
#define VECMAXDIV_BLOCKS_IN_GRID 100
#define VECMAXDIV_THREADS_IN_BLOCK (VECMAXDIV_BATCH_SIZE / VECMAXDIV_BLOCKS_IN_GRID)

#endif // VECMAXDIV_DEFINES_H
9 changes: 7 additions & 2 deletions cudampilib/cudampilib.c
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,8 @@ float __cudampi__globalpowerlimit;

int powermeasurecounter[__CUDAMPI_MAX_THREAD_COUNT] = {0};

int __cudampi__batch_size;
extern struct __cudampi__arguments_type __cudampi__arguments;

static char doc[] = "Cudampi program";
static char args_doc[] = "";
Expand Down Expand Up @@ -504,6 +506,9 @@ void __cudampi__initializeMPI(int argc, char **argv) {
exit(-1);
}

__cudampi__batch_size = __cudampi__arguments.batch_size;
MPI_Bcast(&__cudampi__batch_size, 1, MPI_INT, 0, MPI_COMM_WORLD);

MPI_Allgather(&__cudampi__localGpuDeviceCount, 1, MPI_INT, __cudampi__GPUcountspernode, 1, MPI_INT, MPI_COMM_WORLD);

// Master does not use local free threads for computations
Expand Down Expand Up @@ -1077,12 +1082,12 @@ cudaError_t __cudampi__cpuMemcpyAsync(void *dst, const void *src, size_t count,
}
}

void launchkernelinstream(void *devPtr, cudaStream_t stream);
void launchkernelinstream(void *devPtr, int batchSize, cudaStream_t stream);

void __cudampi__cudaKernelInStream(void *devPtr, cudaStream_t stream) {

if (__cudampi_isLocalGpu) { // run locally
launchkernelinstream(devPtr, stream);
launchkernelinstream(devPtr, __cudampi__batch_size, stream);
} else { // launch remotely

size_t ssize = sizeof(void *) + sizeof(unsigned long);
Expand Down
12 changes: 8 additions & 4 deletions cudampilib/cudampislave.c
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,8 @@ int __cudampi_totaldevicecount = 0; // how many GPUs in total (on all considered
int __cudampi__localGpuDeviceCount = 1;
int __cudampi__localFreeThreadCount = 0;

int __cudampi__batch_size;

unsigned long cpuStreamsValid[CPU_STREAMS_SUPPORTED];

typedef struct task_queue_entry {
Expand All @@ -69,8 +71,8 @@ omp_lock_t cpuEnergyLock;
int isInitialCpuEnergyMeasured = 0;

void launchkernel(void *devPtr);
void launchkernelinstream(void *devPtr, cudaStream_t stream);
void launchcpukernel(void *devPtr, int thread_count);
void launchkernelinstream(void *devPtr, int batchSize, cudaStream_t stream);
void launchcpukernel(void *devPtr, int batchSize, int thread_count);

typedef struct {
unsigned char* buffer;
Expand Down Expand Up @@ -289,7 +291,7 @@ void logGpuMemcpyError(cudaError_t e, int tag) {

void cpuLaunchKernelTask(void* arg) {
// kernel just takes void*
launchcpukernel(arg, __cudampi__localFreeThreadCount - 1);
launchcpukernel(arg, __cudampi__batch_size ,__cudampi__localFreeThreadCount - 1);
}

void allocateCpuTaskInStream(void (*task_func)(void *), void *arg, unsigned long stream)
Expand Down Expand Up @@ -490,6 +492,8 @@ int main(int argc, char **argv) {

MPI_Allgather(&__cudampi__localFreeThreadCount, 1, MPI_INT, __cudampi__freeThreadsPerNode, 1, MPI_INT, MPI_COMM_WORLD);

MPI_Bcast(&__cudampi__batch_size, 1, MPI_INT, 0, MPI_COMM_WORLD);

MPI_Bcast(&__cudampi_totaldevicecount, 1, MPI_INT, 0, MPI_COMM_WORLD);

__cudampi_targetMPIrankfordevice = (int *)malloc(__cudampi_totaldevicecount * sizeof(int));
Expand Down Expand Up @@ -999,7 +1003,7 @@ int main(int argc, char **argv) {
void *devPtr = *((void **)rdata);
cudaStream_t stream = *((cudaStream_t *)(rdata + sizeof(void *)));

launchkernelinstream(devPtr, stream);
launchkernelinstream(devPtr, __cudampi__batch_size, stream);
}

if (status.MPI_TAG == __cudampi__CUDAMPISTREAMCREATEREQ) {
Expand Down
2 changes: 1 addition & 1 deletion cudampilib/include/cudampilib.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ struct __cudampi__arguments_type
int batch_size;
int powercap; // 0 means disabled
long long problem_size;
}__cudampi__arguments;
};

void __cudampi__setglobalpowerlimit(float powerlimit);
int __cudampi__selectdevicesforpowerlimit_greedy();
Expand Down

0 comments on commit 50b3578

Please sign in to comment.