diff --git a/cudampilib/apps/app/appkernel.cu b/cudampilib/apps/app/appkernel.cu
index 89ac9bb..ee6f7a8 100644
--- a/cudampilib/apps/app/appkernel.cu
+++ b/cudampilib/apps/app/appkernel.cu
@@ -23,7 +23,7 @@ __global__ void appkernel(unsigned char *devPtr) {
   devPtr[my_index] = devPtr[my_index] / 2;
 }
 
-extern "C" void launchkernelinstream(void *devPtr, cudaStream_t stream) {
+extern "C" void launchkernelinstream(void *devPtr, int batchSize, cudaStream_t stream) {
   dim3 blocksingrid(2);
   dim3 threadsinblock(1024);
 
@@ -35,4 +35,4 @@ extern "C" void launchkernelinstream(void *devPtr, cudaStream_t stream) {
   }
 }
 
-extern "C" void launchkernel(void *devPtr) { launchkernelinstream(devPtr, 0); }
+extern "C" void launchkernel(void *devPtr, int batchSize) { launchkernelinstream(devPtr, batchSize, 0); }
diff --git a/cudampilib/apps/collatz/app-streams-collatz.c b/cudampilib/apps/collatz/app-streams-collatz.c
index 7392c9c..c2bc1b0 100644
--- a/cudampilib/apps/collatz/app-streams-collatz.c
+++ b/cudampilib/apps/collatz/app-streams-collatz.c
@@ -23,12 +23,14 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OU
 #define ENABLE_OUTPUT_LOGS
 #include "utility.h"
 
-long long VECTORSIZE = COLLATZ_VECTORSIZE;
+struct __cudampi__arguments_type __cudampi__arguments;
+
+long long VECTORSIZE;
 
 double *vectora;
 double *vectorc;
 
-int batchsize = COLLATZ_BATCH_SIZE;
+int batchsize;
 
 long long globalcounter = 0;
 
@@ -44,19 +46,11 @@ int main(int argc, char **argv)
 
   __cudampi__initializeMPI(argc, argv);
 
-  int alldevicescount = 0;
-
-  if (argc > 1)
-  {
-    streamcount = atoi(argv[1]);
-  }
+  streamcount = __cudampi__arguments.number_of_streams;
+  batchsize = __cudampi__arguments.batch_size;
+  VECTORSIZE = __cudampi__arguments.problem_size;
 
-  if (argc > 2)
-  {
-    powerlimit = atof(argv[2]);
-    log_message(LOG_INFO, "\nSetting power limit=%f\n", powerlimit);
-    __cudampi__setglobalpowerlimit(powerlimit);
-  }
+  int alldevicescount = 0;
 
   __cudampi__getDeviceCount(&alldevicescount);
diff --git a/cudampilib/apps/collatz/appkernelcollatz.cu b/cudampilib/apps/collatz/appkernelcollatz.cu
index 0c9299e..3653f82 100644
--- a/cudampilib/apps/collatz/appkernelcollatz.cu
+++ b/cudampilib/apps/collatz/appkernelcollatz.cu
@@ -57,12 +57,12 @@ __global__ void appkernel(void *devPtr)
   devPtrc[my_index] = counter;
 }
 
-extern "C" void launchkernelinstream(void *devPtr, cudaStream_t stream)
+extern "C" void launchkernelinstream(void *devPtr, int batchSize, cudaStream_t stream)
 {
   dim3 blocksingrid(COLLATZ_BLOCKS_IN_GRID);
-  dim3 threadsinblock(COLLATZ_THREADS_IN_BLOCK);
+  dim3 threadsinblock(batchSize / COLLATZ_BLOCKS_IN_GRID);
 
-  log_message(LOG_DEBUG, "Launichng GPU Kernel with %i blocks in grid and %i threads in block.", COLLATZ_BLOCKS_IN_GRID, COLLATZ_THREADS_IN_BLOCK);
+  log_message(LOG_DEBUG, "Launching GPU Kernel with %i blocks in grid and %i threads in block.", COLLATZ_BLOCKS_IN_GRID, batchSize / COLLATZ_BLOCKS_IN_GRID);
   appkernel<<<blocksingrid, threadsinblock, 0, stream>>>(devPtr);
   if (cudaSuccess != cudaGetLastError()) {
@@ -70,4 +70,4 @@ extern "C" void launchkernelinstream(void *devPtr, cudaStream_t stream)
   }
 }
 
-extern "C" void launchkernel(void *devPtr) { launchkernelinstream(devPtr, 0); }
+extern "C" void launchkernel(void *devPtr, int batchSize) { launchkernelinstream(devPtr, batchSize, 0); }
diff --git a/cudampilib/apps/collatz/collatz_defines.h b/cudampilib/apps/collatz/collatz_defines.h
index dedf998..d0cbef3 100644
--- a/cudampilib/apps/collatz/collatz_defines.h
+++ b/cudampilib/apps/collatz/collatz_defines.h
@@ -1,9 +1,6 @@
 #ifndef COLLATZ_DEFINES_H
 #define COLLATZ_DEFINES_H
 
-#define COLLATZ_VECTORSIZE 200000000
-#define COLLATZ_BATCH_SIZE 50000
 #define COLLATZ_BLOCKS_IN_GRID 100
-#define COLLATZ_THREADS_IN_BLOCK (COLLATZ_BATCH_SIZE / COLLATZ_BLOCKS_IN_GRID)
 
 #endif // COLLATZ_DEFINES_H
\ No newline at end of file
diff --git a/cudampilib/apps/collatz/cpukernelcollatz.c b/cudampilib/apps/collatz/cpukernelcollatz.c
index 6ff572b..dca1848 100644
--- a/cudampilib/apps/collatz/cpukernelcollatz.c
+++ b/cudampilib/apps/collatz/cpukernelcollatz.c
@@ -56,9 +56,9 @@ void appkernel(void *devPtr, int num_elements, int num_threads)
   }
 }
 
-extern void launchcpukernel(void *devPtr, int num_threads)
+extern void launchcpukernel(void *devPtr, int batchSize, int num_threads)
 {
-  int num_elements = COLLATZ_BATCH_SIZE;
+  int num_elements = batchSize;
   log_message(LOG_DEBUG, "Launichng CPU Kernel with %i elements and %i threads.", num_elements, num_threads);
   appkernel(devPtr, num_elements, num_threads);
 }
\ No newline at end of file
diff --git a/cudampilib/apps/patternsearch/app-streams-patternsearch.c b/cudampilib/apps/patternsearch/app-streams-patternsearch.c
index 0372dbd..419d77c 100644
--- a/cudampilib/apps/patternsearch/app-streams-patternsearch.c
+++ b/cudampilib/apps/patternsearch/app-streams-patternsearch.c
@@ -23,12 +23,14 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OU
 #define ENABLE_OUTPUT_LOGS
 #include "utility.h"
 
-long long VECTORSIZE = PATTERNSEARCH_VECTORSIZE;
+struct __cudampi__arguments_type __cudampi__arguments;
+
+long long VECTORSIZE;
 
 char *vectora;
 char *vectorc;
 
-int batchsize = PATTERNSEARCH_BATCH_SIZE;
+int batchsize;
 
 long long globalcounter = 0;
 
@@ -45,19 +47,11 @@ int main(int argc, char **argv)
 
   __cudampi__initializeMPI(argc, argv);
 
-  int alldevicescount = 0;
-
-  if (argc > 1)
-  {
-    streamcount = atoi(argv[1]);
-  }
+  streamcount = __cudampi__arguments.number_of_streams;
+  batchsize = __cudampi__arguments.batch_size;
+  VECTORSIZE = __cudampi__arguments.problem_size;
 
-  if (argc > 2)
-  {
-    powerlimit = atof(argv[2]);
-    log_message(LOG_ERROR, "\nSetting power limit=%f\n", powerlimit);
-    __cudampi__setglobalpowerlimit(powerlimit);
-  }
+  int alldevicescount = 0;
 
   __cudampi__getDeviceCount(&alldevicescount);
diff --git a/cudampilib/apps/patternsearch/appkernelpatternsearch.cu b/cudampilib/apps/patternsearch/appkernelpatternsearch.cu
index 8ded0cf..5ff6ff0 100644
--- a/cudampilib/apps/patternsearch/appkernelpatternsearch.cu
+++ b/cudampilib/apps/patternsearch/appkernelpatternsearch.cu
@@ -45,12 +45,12 @@ __global__ void appkernel(void *devPtr) {
   }
 }
 
-extern "C" void launchkernelinstream(void *devPtr, cudaStream_t stream) {
+extern "C" void launchkernelinstream(void *devPtr, int batchSize, cudaStream_t stream) {
   dim3 blocksingrid(PATTERNSEARCH_BLOCKS_IN_GRID);
-  dim3 threadsinblock(PATTERNSEARCH_THREADS_IN_BLOCK);
+  dim3 threadsinblock(batchSize / PATTERNSEARCH_BLOCKS_IN_GRID);
 
-  log_message(LOG_DEBUG, "Launichng GPU Kernel with %i blocks in grid and %i threads in block.", PATTERNSEARCH_BLOCKS_IN_GRID, PATTERNSEARCH_THREADS_IN_BLOCK);
+  log_message(LOG_DEBUG, "Launching GPU Kernel with %i blocks in grid and %i threads in block.", PATTERNSEARCH_BLOCKS_IN_GRID, batchSize / PATTERNSEARCH_BLOCKS_IN_GRID);
   appkernel<<<blocksingrid, threadsinblock, 0, stream>>>(devPtr);
   if (cudaSuccess != cudaGetLastError()) {
@@ -58,4 +58,4 @@ extern "C" void launchkernelinstream(void *devPtr, cudaStream_t stream) {
   }
 }
 
-extern "C" void launchkernel(void *devPtr) { launchkernelinstream(devPtr, 0); }
+extern "C" void launchkernel(void *devPtr, int batchSize) { launchkernelinstream(devPtr, batchSize, 0); }
diff --git a/cudampilib/apps/patternsearch/cpukernelpatternsearch.c b/cudampilib/apps/patternsearch/cpukernelpatternsearch.c
index eadcbf7..899e087 100644
--- a/cudampilib/apps/patternsearch/cpukernelpatternsearch.c
+++ b/cudampilib/apps/patternsearch/cpukernelpatternsearch.c
@@ -49,9 +49,9 @@ void appkernel(void *devPtr, int num_elements, int num_threads)
   }
 }
 
-extern void launchcpukernel(void *devPtr,int num_threads)
+extern void launchcpukernel(void *devPtr, int batchSize, int num_threads)
 {
-  int num_elements = PATTERNSEARCH_BATCH_SIZE;
+  int num_elements = batchSize;
   log_message(LOG_DEBUG, "Launichng CPU Kernel with %i elements and %i threads.", num_elements, num_threads);
   appkernel(devPtr, num_elements, num_threads);
 }
\ No newline at end of file
diff --git a/cudampilib/apps/patternsearch/patternsearch_defines.h b/cudampilib/apps/patternsearch/patternsearch_defines.h
index be6b39b..253610a 100644
--- a/cudampilib/apps/patternsearch/patternsearch_defines.h
+++ b/cudampilib/apps/patternsearch/patternsearch_defines.h
@@ -1,10 +1,7 @@
 #ifndef PATTERNSEARCH_DEFINES_H
 #define PATTERNSEARCH_DEFINES_H
 
-#define PATTERNSEARCH_VECTORSIZE 400000000
-#define PATTERNSEARCH_BATCH_SIZE 50000
 #define PATTERNSEARCH_BLOCKS_IN_GRID 100
-#define PATTERNSEARCH_THREADS_IN_BLOCK (PATTERNSEARCH_BATCH_SIZE / PATTERNSEARCH_BLOCKS_IN_GRID)
 
 #define PATTERNLENGTH 400
 #define PATTERNCOUNT 400
diff --git a/cudampilib/apps/vecadd/app-streams-vecadd.c b/cudampilib/apps/vecadd/app-streams-vecadd.c
index ed59b85..076e083 100644
--- a/cudampilib/apps/vecadd/app-streams-vecadd.c
+++ b/cudampilib/apps/vecadd/app-streams-vecadd.c
@@ -22,13 +22,15 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OU
 #define ENABLE_OUTPUT_LOGS
 #include "utility.h"
 
-long long VECTORSIZE = VECADD_VECTOR_SIZE;
+struct __cudampi__arguments_type __cudampi__arguments;
+
+long long VECTORSIZE;
 
 double *vectora;
 double *vectorb;
 double *vectorc;
 
-int batchsize = VECADD_BATCH_SIZE;
+int batchsize;
 
 long long globalcounter = 0;
 
@@ -44,19 +46,11 @@ int main(int argc, char **argv)
 
   __cudampi__initializeMPI(argc, argv);
 
-  int alldevicescount = 0;
-
-  if (argc > 1)
-  {
-    streamcount = atoi(argv[1]);
-  }
+  streamcount = __cudampi__arguments.number_of_streams;
+  batchsize = __cudampi__arguments.batch_size;
+  VECTORSIZE = __cudampi__arguments.problem_size;
 
-  if (argc > 2)
-  {
-    powerlimit = atof(argv[2]);
-    log_message(LOG_INFO, "\nSetting power limit=%f\n", powerlimit);
-    __cudampi__setglobalpowerlimit(powerlimit);
-  }
+  int alldevicescount = 0;
 
   __cudampi__getDeviceCount(&alldevicescount);
diff --git a/cudampilib/apps/vecadd/appkernelvecadd.cu b/cudampilib/apps/vecadd/appkernelvecadd.cu
index 3fe4219..9b256f0 100644
--- a/cudampilib/apps/vecadd/appkernelvecadd.cu
+++ b/cudampilib/apps/vecadd/appkernelvecadd.cu
@@ -33,12 +33,12 @@ __global__ void appkernel(void *devPtr)
   devPtrc[my_index] = devPtra[my_index] / 2 + devPtrb[my_index] / 3;
 }
 
-extern "C" void launchkernelinstream(void *devPtr, cudaStream_t stream)
+extern "C" void launchkernelinstream(void *devPtr, int batchSize, cudaStream_t stream)
 {
   dim3 blocksingrid(VECADD_BLOCKS_IN_GRID);
-  dim3 threadsinblock(VECADD_THREADS_IN_BLOCK);
+  dim3 threadsinblock(batchSize / VECADD_BLOCKS_IN_GRID);
 
-  log_message(LOG_DEBUG, "Launichng GPU Kernel with %i blocks in grid and %i threads in block.", VECADD_BLOCKS_IN_GRID, VECADD_THREADS_IN_BLOCK);
+  log_message(LOG_DEBUG, "Launching GPU Kernel with %i blocks in grid and %i threads in block.", VECADD_BLOCKS_IN_GRID, batchSize / VECADD_BLOCKS_IN_GRID);
   appkernel<<<blocksingrid, threadsinblock, 0, stream>>>(devPtr);
   if (cudaSuccess != cudaGetLastError()) {
@@ -46,4 +46,4 @@ extern "C" void launchkernelinstream(void *devPtr, cudaStream_t stream)
   }
 }
 
-extern "C" void launchkernel(void *devPtr) { launchkernelinstream(devPtr, 0); }
+extern "C" void launchkernel(void *devPtr, int batchSize) { launchkernelinstream(devPtr, batchSize, 0); }
diff --git a/cudampilib/apps/vecadd/cpukernelvecadd.c b/cudampilib/apps/vecadd/cpukernelvecadd.c
index 506ca81..84177b2 100644
--- a/cudampilib/apps/vecadd/cpukernelvecadd.c
+++ b/cudampilib/apps/vecadd/cpukernelvecadd.c
@@ -32,9 +32,9 @@ void appkernel(void *devPtr, int num_elements, int num_threads)
   }
 }
 
-extern void launchcpukernel(void *devPtr, int num_threads)
+extern void launchcpukernel(void *devPtr, int batchSize, int num_threads)
 {
-  int num_elements = VECADD_BATCH_SIZE;
+  int num_elements = batchSize;
   log_message(LOG_DEBUG, "Launichng CPU Kernel with %i elements and %i threads.", num_elements, num_threads);
   appkernel(devPtr, num_elements, num_threads);
 }
diff --git a/cudampilib/apps/vecadd/vecadd_defines.h b/cudampilib/apps/vecadd/vecadd_defines.h
index fcd71e2..9bc3499 100644
--- a/cudampilib/apps/vecadd/vecadd_defines.h
+++ b/cudampilib/apps/vecadd/vecadd_defines.h
@@ -1,9 +1,6 @@
 #ifndef VECADD_DEFINES_H
 #define VECADD_DEFINES_H
 
-#define VECADD_BATCH_SIZE 100000
 #define VECADD_BLOCKS_IN_GRID 100
-#define VECADD_THREADS_IN_BLOCK (VECADD_BATCH_SIZE / VECADD_BLOCKS_IN_GRID)
-#define VECADD_VECTOR_SIZE 80000000
 
 #endif // VECADD_DEFINES_H
\ No newline at end of file
diff --git a/cudampilib/apps/vecmaxdiv/app-streams-vecmaxdiv.c b/cudampilib/apps/vecmaxdiv/app-streams-vecmaxdiv.c
index 7f359d2..a541e0a 100644
--- a/cudampilib/apps/vecmaxdiv/app-streams-vecmaxdiv.c
+++ b/cudampilib/apps/vecmaxdiv/app-streams-vecmaxdiv.c
@@ -23,13 +23,16 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OU
 #define ENABLE_OUTPUT_LOGS
 #include "utility.h"
 
-long long VECTORSIZE = VECMAXDIV_VECTORSIZE;
+
+struct __cudampi__arguments_type __cudampi__arguments;
+
+long long VECTORSIZE;
 
 double *vectora;
 double *vectorb;
 double *vectorc;
 
-int batchsize = VECMAXDIV_BATCH_SIZE;
+int batchsize;
 
 long long globalcounter = 0;
 
@@ -45,19 +48,12 @@ int main(int argc, char **argv)
 
   __cudampi__initializeMPI(argc, argv);
 
-  int alldevicescount = 0;
+  streamcount = __cudampi__arguments.number_of_streams;
+  batchsize = __cudampi__arguments.batch_size;
+  VECTORSIZE = __cudampi__arguments.problem_size;
 
-  if (argc > 1)
-  {
-    streamcount = atoi(argv[1]);
-  }
+  int alldevicescount = 0;
 
-  if (argc > 2)
-  {
-    powerlimit = atof(argv[2]);
-    log_message(LOG_INFO,"\nSetting power limit=%f\n", powerlimit);
-    __cudampi__setglobalpowerlimit(powerlimit);
-  }
   __cudampi__getDeviceCount(&alldevicescount);
 
   cudaHostAlloc((void **)&vectora, sizeof(double) * VECTORSIZE, cudaHostAllocDefault);
diff --git a/cudampilib/apps/vecmaxdiv/appkernelvecmaxdiv.cu b/cudampilib/apps/vecmaxdiv/appkernelvecmaxdiv.cu
index 13ff6e7..02e7d9b 100644
--- a/cudampilib/apps/vecmaxdiv/appkernelvecmaxdiv.cu
+++ b/cudampilib/apps/vecmaxdiv/appkernelvecmaxdiv.cu
@@ -55,12 +55,12 @@ __global__ void appkernel(void *devPtr) {
   devPtrc[my_index] = result;
 }
 
-extern "C" void launchkernelinstream(void *devPtr, cudaStream_t stream) {
+extern "C" void launchkernelinstream(void *devPtr, int batchSize, cudaStream_t stream) {
   dim3 blocksingrid(VECMAXDIV_BLOCKS_IN_GRID);
-  dim3 threadsinblock(VECMAXDIV_THREADS_IN_BLOCK);
+  dim3 threadsinblock(batchSize / VECMAXDIV_BLOCKS_IN_GRID);
 
-  log_message(LOG_DEBUG, "Launichng GPU Kernel with %i blocks in grid and %i threads in block.", VECMAXDIV_BLOCKS_IN_GRID, VECMAXDIV_THREADS_IN_BLOCK);
+  log_message(LOG_DEBUG, "Launching GPU Kernel with %i blocks in grid and %i threads in block.", VECMAXDIV_BLOCKS_IN_GRID, batchSize / VECMAXDIV_BLOCKS_IN_GRID);
   appkernel<<<blocksingrid, threadsinblock, 0, stream>>>(devPtr);
   if (cudaSuccess != cudaGetLastError()) {
@@ -68,4 +68,4 @@ extern "C" void launchkernelinstream(void *devPtr, cudaStream_t stream) {
   }
 }
 
-extern "C" void launchkernel(void *devPtr) { launchkernelinstream(devPtr, 0); }
+extern "C" void launchkernel(void *devPtr, int batchSize) { launchkernelinstream(devPtr, batchSize, 0); }
diff --git a/cudampilib/apps/vecmaxdiv/cpukernelvecmaxdiv.c b/cudampilib/apps/vecmaxdiv/cpukernelvecmaxdiv.c
index e7798c1..f524473 100644
--- a/cudampilib/apps/vecmaxdiv/cpukernelvecmaxdiv.c
+++ b/cudampilib/apps/vecmaxdiv/cpukernelvecmaxdiv.c
@@ -59,9 +59,9 @@ for (long my_index = 0 ; my_index < num_elements; my_index++)
   }
 }
 
-extern void launchcpukernel(void *devPtr, int num_threads)
+extern void launchcpukernel(void *devPtr, int batchSize, int num_threads)
 {
-  int num_elements = VECMAXDIV_BATCH_SIZE;
+  int num_elements = batchSize;
   log_message(LOG_DEBUG, "Launichng CPU Kernel with %i elements and %i threads.", num_elements, num_threads);
   appkernel(devPtr, num_elements, num_threads);
 }
diff --git a/cudampilib/apps/vecmaxdiv/vecmaxdiv_defines.h b/cudampilib/apps/vecmaxdiv/vecmaxdiv_defines.h
index 99d5ab3..a3f0a92 100644
--- a/cudampilib/apps/vecmaxdiv/vecmaxdiv_defines.h
+++ b/cudampilib/apps/vecmaxdiv/vecmaxdiv_defines.h
@@ -1,9 +1,6 @@
 #ifndef VECMAXDIV_DEFINES_H
 #define VECMAXDIV_DEFINES_H
 
-#define VECMAXDIV_VECTORSIZE 200000000
-#define VECMAXDIV_BATCH_SIZE 50000
 #define VECMAXDIV_BLOCKS_IN_GRID 100
-#define VECMAXDIV_THREADS_IN_BLOCK (VECMAXDIV_BATCH_SIZE / VECMAXDIV_BLOCKS_IN_GRID)
 
 #endif // VECMAXDIV_DEFINES_H
\ No newline at end of file
diff --git a/cudampilib/cudampilib.c b/cudampilib/cudampilib.c
index 6c16c53..deb4ddf 100644
--- a/cudampilib/cudampilib.c
+++ b/cudampilib/cudampilib.c
@@ -9,6 +9,7 @@ The above copyright notice and this permission notice shall be included in all c
 THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */
+#include <argp.h>
 #include
 #include
 #include
@@ -30,7 +31,6 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OU
 
 #define __cudampi_isLocalGpu __cudampi__currentDevice < __cudampi__GPUcountspernode[0]
 #define __cudampi__currentMemcpyQueue &(__cudampi__memcpy_queues[omp_get_thread_num()])
-
 int *__cudampi__GPUcountspernode;
 int *__cudampi__CPUcountspernode;
 int *__cudampi__freeThreadsPerNode;
@@ -71,6 +71,48 @@ float __cudampi__globalpowerlimit;
 
 int powermeasurecounter[__CUDAMPI_MAX_THREAD_COUNT] = {0};
 
+int __cudampi__batch_size;
+int __cudampi__cpu_enabled;
+extern struct __cudampi__arguments_type __cudampi__arguments;
+
+static char doc[] = "Cudampi program";
+static char args_doc[] = "";
+static struct argp_option options[] = {
+    { "cpu-enabled", 'c', "ENABLED", 0, "Enable CPU processing (1 to enable, 0 to disable)" },
+    { "number-of-streams", 'n', "NUM", 0, "Set the number of streams" },
+    { "batch-size", 'b', "SIZE", 0, "Set the batch size" },
+    { "powercap", 'p', "WATTS", 0, "Set the power cap (0 to disable)" },
+    { "problem-size", 's', "SIZE", 0, "Set the problem size" },
+    { 0 }
+};
+static error_t parse_opt(int key, char *arg, struct argp_state *state)
+{
+  struct __cudampi__arguments_type *arguments = state->input;
+
+  switch (key)
+  {
+  case 'c':
+    arguments->cpu_enabled = atoi(arg);
+    break;
+  case 'n':
+    arguments->number_of_streams = atoi(arg);
+    break;
+  case 'b':
+    arguments->batch_size = atoi(arg);
+    break;
+  case 'p':
+    arguments->powercap = atoi(arg);
+    break;
+  case 's':
+    arguments->problem_size = atoll(arg);
+    break;
+  default:
+    return ARGP_ERR_UNKNOWN;
+  }
+  return 0;
+}
+static struct argp argp = { options, parse_opt, args_doc, doc };
+
 // Counter that holds a unique tag for asynchronously exchanged messages
 // it increments by 2 (D_MSG_TAG) to accomodate data message and status
 int asyncMsgCounter = MIN_ASYNC_MSG_TAG;
@@ -179,6 +221,7 @@ void __cudampi__setglobalpowerlimit(float powerlimit) {
   __cudampi__globalpowerlimit = powerlimit;
 }
 
+
 float __cudampi__gettotalpowerofselecteddevices() { // gets total power of currently enabled devices
   int i;
   float power = 0;
@@ -413,6 +456,29 @@ void __cudampi__initializeMPI(int argc, char **argv) {
     exit(-1);
   }
 
+  /* Default values */
+  __cudampi__arguments.cpu_enabled = 1;
+  __cudampi__arguments.number_of_streams = 1;
+  __cudampi__arguments.batch_size = 50000;
+  __cudampi__arguments.powercap = 0;
+  __cudampi__arguments.problem_size = 200000000;
+
+  /* Parse our arguments; every option seen by parse_opt will be reflected in arguments. */
+  argp_parse(&argp, argc, argv, 0, 0, &__cudampi__arguments);
+
+  /* Print parsed arguments using log_message with LOG_INFO level */
+  log_message(LOG_INFO, "CPU Enabled       : %d", __cudampi__arguments.cpu_enabled);
+  log_message(LOG_INFO, "Number of Streams : %d", __cudampi__arguments.number_of_streams);
+  log_message(LOG_INFO, "Batch Size        : %d", __cudampi__arguments.batch_size);
+  log_message(LOG_INFO, "Power Cap         : %d", __cudampi__arguments.powercap);
+  log_message(LOG_INFO, "Problem Size      : %lld", __cudampi__arguments.problem_size);
+
+  if (__cudampi__arguments.powercap > 0) {
+    log_message(LOG_INFO, "\nSetting power limit=%d\n", __cudampi__arguments.powercap);
+    __cudampi__setglobalpowerlimit(__cudampi__arguments.powercap);
+  }
+
+
 // fetch information about the rank and number of processes
 MPI_Comm_size(MPI_COMM_WORLD, &__cudampi__MPIproccount);
@@ -441,6 +507,12 @@ void __cudampi__initializeMPI(int argc, char **argv) {
     exit(-1);
   }
 
+  __cudampi__batch_size = __cudampi__arguments.batch_size;
+  __cudampi__cpu_enabled = __cudampi__arguments.cpu_enabled;
+  MPI_Bcast(&__cudampi__batch_size, 1, MPI_INT, 0, MPI_COMM_WORLD);
+  MPI_Bcast(&__cudampi__cpu_enabled, 1, MPI_INT, 0, MPI_COMM_WORLD);
+
+
 MPI_Allgather(&__cudampi__localGpuDeviceCount, 1, MPI_INT, __cudampi__GPUcountspernode, 1, MPI_INT, MPI_COMM_WORLD);
 
 // Master does not use local free threads for computations
@@ -1007,12 +1079,12 @@ cudaError_t __cudampi__cpuMemcpyAsync(void *dst, const void *src, size_t count,
   }
 }
 
-void launchkernelinstream(void *devPtr, cudaStream_t stream);
+void launchkernelinstream(void *devPtr, int batchSize, cudaStream_t stream);
 
 void __cudampi__cudaKernelInStream(void *devPtr, cudaStream_t stream) {
   if (__cudampi_isLocalGpu) { // run locally
-    launchkernelinstream(devPtr, stream);
+    launchkernelinstream(devPtr, __cudampi__batch_size, stream);
   } else { // launch remotely
     size_t ssize = sizeof(void *) + sizeof(unsigned long);
diff --git a/cudampilib/cudampislave.c b/cudampilib/cudampislave.c
index a825caa..6053615 100644
--- a/cudampilib/cudampislave.c
+++ b/cudampilib/cudampislave.c
@@ -44,6 +44,9 @@ int __cudampi_totaldevicecount = 0; // how many GPUs in total (on all considered
 int __cudampi__localGpuDeviceCount = 1;
 int __cudampi__localFreeThreadCount = 0;
 
+int __cudampi__batch_size;
+int __cudampi__cpu_enabled;
+
 unsigned long cpuStreamsValid[CPU_STREAMS_SUPPORTED];
 
 typedef struct task_queue_entry {
@@ -69,8 +72,8 @@ omp_lock_t cpuEnergyLock;
 int isInitialCpuEnergyMeasured = 0;
 
 void launchkernel(void *devPtr);
-void launchkernelinstream(void *devPtr, cudaStream_t stream);
-void launchcpukernel(void *devPtr, int thread_count);
+void launchkernelinstream(void *devPtr, int batchSize, cudaStream_t stream);
+void launchcpukernel(void *devPtr, int batchSize, int thread_count);
 
 typedef struct {
   unsigned char* buffer;
@@ -289,7 +292,7 @@ void logGpuMemcpyError(cudaError_t e, int tag) {
 
 void cpuLaunchKernelTask(void* arg) {
   // kernel just takes void*
-  launchcpukernel(arg, __cudampi__localFreeThreadCount - 1);
+  launchcpukernel(arg, __cudampi__batch_size, __cudampi__localFreeThreadCount - 1);
 }
 
 void allocateCpuTaskInStream(void (*task_func)(void *), void *arg, unsigned long stream)
@@ -481,9 +484,17 @@ int main(int argc, char **argv) {
     exit(-1); // we could exit in a nicer way! TBD
   }
 
-  if (cudaSuccess != __cudampi__getCpuFreeThreads(&__cudampi__localFreeThreadCount)) {
-    log_message(LOG_ERROR, "Error invoking __cudampi__getCpuFreeThreads()");
-    exit(-1);
+  MPI_Bcast(&__cudampi__batch_size, 1, MPI_INT, 0, MPI_COMM_WORLD);
+  MPI_Bcast(&__cudampi__cpu_enabled, 1, MPI_INT, 0, MPI_COMM_WORLD);
+
+  if (__cudampi__cpu_enabled) {
+    if (cudaSuccess != __cudampi__getCpuFreeThreads(&__cudampi__localFreeThreadCount)) {
+      log_message(LOG_ERROR, "Error invoking __cudampi__getCpuFreeThreads()");
+      exit(-1);
+    }
+  }
+  else {
+    __cudampi__localFreeThreadCount = 0;
   }
 
   MPI_Allgather(&__cudampi__localGpuDeviceCount, 1, MPI_INT, __cudampi__GPUcountspernode, 1, MPI_INT, MPI_COMM_WORLD);
@@ -999,7 +1010,7 @@ int main(int argc, char **argv) {
         void *devPtr = *((void **)rdata);
         cudaStream_t stream = *((cudaStream_t *)(rdata + sizeof(void *)));
 
-        launchkernelinstream(devPtr, stream);
+        launchkernelinstream(devPtr, __cudampi__batch_size, stream);
       }
 
       if (status.MPI_TAG == __cudampi__CUDAMPISTREAMCREATEREQ) {
diff --git a/cudampilib/include/cudampilib.h b/cudampilib/include/cudampilib.h
index 9c9e4a1..260b53a 100644
--- a/cudampilib/include/cudampilib.h
+++ b/cudampilib/include/cudampilib.h
@@ -17,6 +17,15 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OU
 
 extern __global__ void kernel(long *devPtr);
 
+struct __cudampi__arguments_type
+{
+  int cpu_enabled;
+  int number_of_streams;
+  int batch_size;
+  int powercap; // 0 means disabled
+  long long problem_size;
+};
+
 void __cudampi__setglobalpowerlimit(float powerlimit);
 
 int __cudampi__selectdevicesforpowerlimit_greedy();
diff --git a/cudampilib/run_scripts/run-app b/cudampilib/run_scripts/run-app
index d34fd4e..5f08c07 100755
--- a/cudampilib/run_scripts/run-app
+++ b/cudampilib/run_scripts/run-app
@@ -9,57 +9,28 @@ is_number() {
 }
 
 # Basic check
-if [ $# -lt 1 ]; then
-  echo "Usage: $0 <APP_NAME>"
+if [ $# -lt 2 ]; then
+  echo "Usage: $0 <APP_NAME> <MODE>"
   exit 1
 fi
 
 APP_NAME="$1"
 MODE="$2"
 
-if [ $# -gt 1 ]; then
-  echo "All arguments provided:"
-  echo "====================================="
-  echo "APP_NAME: '$APP_NAME'"
-  echo "MODE: '$MODE'"
-fi
+# Display provided arguments
+echo "All arguments provided:"
+echo "====================================="
+echo "APP_NAME: '$APP_NAME'"
+echo "MODE: '$MODE'"
+echo "====================================="
 
 if [ "$1" == "H" ]; then
-  echo "There are 3 modes:"
-  echo "* Debug: ex. ./run_scripts/run-app vecadd L 1 30 - run app vecadd locally with 1 slave process \
-and 1 stream for GPU with 30 as powercap value. Basically fast way to run on an app on two nodes."
-  echo "* Choose: ex ./run_scripts/run-app vecadd C 2 4 6 8 10 A 1 30 - run app vecadd remotly \
-on KASK server with 4 slave processes (each on different machine: des04, des06, des08, des10) \
-and 1 master process on des02 with 1 stream for GPU and 30 as powercap value, 'A' separates arguments from machines."
-  echo "* Bulk: ex ./run_scripts/run_app vecadd B 5 - run app vecadd remotly on first 5 machines in hostfile, \
-where master will be run on first and slaves on the rest."
-
-elif [ $2 == "D" ]; then
-  # Debug execution
-  STREAMS_COUNT="$3"
-  POWER_CAP="$4"
-
-  echo "SLAVE_PROC_AMOUNT: 1"
-  echo "STREAMS_COUNT: '${STREAMS_COUNT:-1}'"
-  echo "POWER_CAP: '${POWER_CAP:-NONE}'"
-  echo "====================================="
-
-  echo "Running remotly with application '$APP_NAME'..."
- - mpi_command="mpirun --mca orte_keep_fqdn_hostnames t \ ---mca btl_tcp_if_exclude docker0,docker_gwbridge,lo,vboxnet0 \ ---bind-to none \ ---machinefile ./debughostfile \ --np 1 ./$BUILD_DIR/app-streams-$APP_NAME $STREAMS_COUNT $POWER_CAP \ -: -np 1 ./$BUILD_DIR/cudampislave-$APP_NAME" - - echo "Command: $mpi_command" - echo "=====================================" - eval "$mpi_command" + echo "There are 2 modes:" + echo "* Choose: ex ./run_scripts/run-app vecadd C 2 4 6 8 A {args} - run app vecadd remotely on specific machines." + echo "* Bulk: ex ./run_scripts/run_app vecadd B 5 {args} - run app vecadd remotely on first 5 machines in hostfile." elif [ "$MODE" == "C" ]; then # Custom machine selection mode - shift # Remove the 'APP_NAME' from the arguments - shift # Remove the 'MODE' from the arguments + shift 2 # Remove the 'APP_NAME' and 'MODE' from the arguments selected_machines=() while [[ "$1" != "A" ]]; do @@ -75,64 +46,45 @@ elif [ "$MODE" == "C" ]; then exit 1 fi selected_machines+=("$machine") - echo "Selected machine: $machine." - - shift # Move to the next argument + shift done - if [ "$1" != "A" ]; then - echo "Error: Expected 'A' followed by two arguments for app-streams-vecadd." - exit 1 - fi - shift # Remove 'A' from the arguments SLAVE_PROC_AMOUNT=${#selected_machines[@]} SLAVE_PROC_AMOUNT=$((SLAVE_PROC_AMOUNT - 1)) - STREAMS_COUNT=$1 - POWER_CAP=$2 echo "SLAVE_PROC_AMOUNT: '$SLAVE_PROC_AMOUNT'" - echo "STREAMS_COUNT: '${STREAMS_COUNT:-1}'" - echo "POWER_CAP: '${POWER_CAP:-NONE}'" echo "=====================================" # Create a temporary hostfile with the selected machines printf "%s\n" "${selected_machines[@]}" > "$TEMP_HOSTFILE" - num_selected_machines=${#selected_machines[@]} echo "Running on the following machines: ${selected_machines[*]}" - echo "Running remotly with application '$APP_NAME'..." + echo "Running remotely with application '$APP_NAME'..." mpi_command="mpirun --mca orte_keep_fqdn_hostnames t \ --mca btl_tcp_if_exclude docker0,docker_gwbridge,lo,vboxnet0 \ --bind-to none \ --machinefile \"$TEMP_HOSTFILE\" \ --np 1 ./$BUILD_DIR/app-streams-$APP_NAME $STREAMS_COUNT $POWER_CAP \ +-np 1 ./$BUILD_DIR/app-streams-$APP_NAME ${@} \ : -np $SLAVE_PROC_AMOUNT ./$BUILD_DIR/cudampislave-$APP_NAME" echo "Command: $mpi_command" echo "=====================================" eval "$mpi_command" - - # Clean up the temporary hostfile - rm "$TEMP_HOSTFILE" - elif [ $2 == "B" ]; then + rm "$TEMP_HOSTFILE" # Clean up the temporary hostfile +elif [ "$MODE" == "B" ]; then # Bulk selection mode - MACHINES_COUNT="$3" SLAVE_PROC_AMOUNT=$((MACHINES_COUNT - 1)) - STREAMS_COUNT="$4" - POWER_CAP="$5" echo "MACHINES COUNT: '$MACHINES_COUNT'" echo "SLAVE_PROC_AMOUNT: '$SLAVE_PROC_AMOUNT'" - echo "STREAMS_COUNT: '${STREAMS_COUNT:-1}'" - echo "POWER_CAP: '${POWER_CAP:-NONE}'" echo "=====================================" echo "Running master on des01 and slaves on des (2 - $MACHINES_COUNT)" - echo "Running remotly with application '$APP_NAME'..." + echo "Running remotely with application '$APP_NAME'..." 
head -n "$MACHINES_COUNT" "$HOSTFILE" > "$TEMP_HOSTFILE" @@ -140,16 +92,15 @@ elif [ "$MODE" == "C" ]; then --mca btl_tcp_if_exclude docker0,docker_gwbridge,lo,vboxnet0 \ --bind-to none \ --machinefile \"$TEMP_HOSTFILE\" \ --np 1 ./$BUILD_DIR/app-streams-$APP_NAME $STREAMS_COUNT $POWER_CAP \ +-np 1 ./$BUILD_DIR/app-streams-$APP_NAME ${@:4} \ : -np $SLAVE_PROC_AMOUNT ./$BUILD_DIR/cudampislave-$APP_NAME" echo "Command: $mpi_command" echo "=====================================" eval "$mpi_command" - rm "$TEMP_HOSTFILE" else - # Invalid option for $1 - echo "Error: Invalid argument '$1'. Use 'H' for help, 'D' for debug, , or 'C' for custom." + # Invalid option for MODE + echo "Error: Invalid mode '$MODE'. Use 'H' for help, 'C' for custom, or 'B' for bulk." exit 1 -fi \ No newline at end of file +fi