From 6ebc1f951f150f928679315d8c9d356661c43b2f Mon Sep 17 00:00:00 2001 From: yupei1990 Date: Thu, 8 Nov 2018 19:59:33 -0500 Subject: [PATCH 01/15] create a test case for sending varied size data to different ranks Signed-off-by: Yu Pei --- tests/dsl/ptg/CMakeLists.txt | 1 + .../ptg/check_multisize_bcast/CMakeLists.txt | 7 ++ .../ptg/check_multisize_bcast/Testings.cmake | 4 + .../check_multisize_bcast.jdf | 96 +++++++++++++++++++ .../check_multisize_bcast_wrapper.c | 52 ++++++++++ .../check_multisize_bcast_wrapper.h | 6 ++ .../dsl/ptg/check_multisize_bcast/data_gen.c | 42 ++++++++ .../dsl/ptg/check_multisize_bcast/data_gen.h | 10 ++ tests/dsl/ptg/check_multisize_bcast/main.c | 74 ++++++++++++++ 9 files changed, 292 insertions(+) create mode 100644 tests/dsl/ptg/check_multisize_bcast/CMakeLists.txt create mode 100644 tests/dsl/ptg/check_multisize_bcast/Testings.cmake create mode 100644 tests/dsl/ptg/check_multisize_bcast/check_multisize_bcast.jdf create mode 100644 tests/dsl/ptg/check_multisize_bcast/check_multisize_bcast_wrapper.c create mode 100644 tests/dsl/ptg/check_multisize_bcast/check_multisize_bcast_wrapper.h create mode 100644 tests/dsl/ptg/check_multisize_bcast/data_gen.c create mode 100644 tests/dsl/ptg/check_multisize_bcast/data_gen.h create mode 100644 tests/dsl/ptg/check_multisize_bcast/main.c diff --git a/tests/dsl/ptg/CMakeLists.txt b/tests/dsl/ptg/CMakeLists.txt index 84f7a4208..cacc65041 100644 --- a/tests/dsl/ptg/CMakeLists.txt +++ b/tests/dsl/ptg/CMakeLists.txt @@ -17,3 +17,4 @@ add_subdirectory(controlgather) add_Subdirectory(user-defined-functions) add_Subdirectory(cuda) add_Subdirectory(local-indices) +add_subdirectory(check_multisize_bcast) diff --git a/tests/dsl/ptg/check_multisize_bcast/CMakeLists.txt b/tests/dsl/ptg/check_multisize_bcast/CMakeLists.txt new file mode 100644 index 000000000..3583307ec --- /dev/null +++ b/tests/dsl/ptg/check_multisize_bcast/CMakeLists.txt @@ -0,0 +1,7 @@ +include(ParsecCompilePTG) + +parsec_addtest_executable(C check_multisize_bcast + SOURCES main.c check_multisize_bcast_wrapper.c data_gen.c) +target_ptg_sources(check_multisize_bcast PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/check_multisize_bcast.jdf") + +include(Testings.cmake) diff --git a/tests/dsl/ptg/check_multisize_bcast/Testings.cmake b/tests/dsl/ptg/check_multisize_bcast/Testings.cmake new file mode 100644 index 000000000..4e2566e4d --- /dev/null +++ b/tests/dsl/ptg/check_multisize_bcast/Testings.cmake @@ -0,0 +1,4 @@ +parsec_addtest_cmd(unit_check_multisize_bcast_shm ${SHM_TEST_CMD_LIST} ./check_multisize_bcast) +if( MPI_C_FOUND ) + parsec_addtest_cmd(unit_check_multisize_bcast_mpi ${MPI_TEST_CMD_LIST} 4 ./check_multisize_bcast) +endif( MPI_C_FOUND) diff --git a/tests/dsl/ptg/check_multisize_bcast/check_multisize_bcast.jdf b/tests/dsl/ptg/check_multisize_bcast/check_multisize_bcast.jdf new file mode 100644 index 000000000..d1d47d7a5 --- /dev/null +++ b/tests/dsl/ptg/check_multisize_bcast/check_multisize_bcast.jdf @@ -0,0 +1,96 @@ +extern "C" %{ +/* + * Copyright (c) 2018-2022 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. 
+ */ +#include "parsec/data_distribution.h" +#include "parsec/data_dist/matrix/matrix.h" +#include "parsec/data.h" +#include "parsec/utils/mca_param.h" +#include "parsec/arena.h" +#include +#include +#include + +%} + +%option no_taskpool_instance = true /* can be aything */ + +descA [type = "struct parsec_tiled_matrix_dc_t *"] +NB [type = int] +NT [type = int] + + +/************************************************** + * potrf_dpotrf * + **************************************************/ +potrf_dpotrf(k) + +// Execution space +k = 0 .. descA->mt-1 + +// Parallel partitioning +:descA(k, k) + +// Parameters +RW T <- (k == 0) ? descA(k, k) : A potrf_diag(k, k-1) + -> T potrf_diag(k+1..descA->mt-1, k) + -> T potrf_col(k+1..descA->mt-1..2, k) [layout = MPI_DOUBLE count = 2] + -> T potrf_col(k+2..descA->mt-1..2, k) [layout = MPI_DOUBLE count = 3] + -> descA(k, k) + +BODY +{ +printf("potrf %d\n", k); +} +END + + +/************************************************** + * potrf_col * + **************************************************/ +potrf_col(m, k) + +// Execution space +k = 0 .. descA->mt-2 +m = k+1 .. descA->mt-1 + +// Parallel partitioning +: descA(m, k) + +// Parameters +READ T <- T potrf_dpotrf(k) +RW C <- descA(m, k) + -> descA(m, k) + +BODY +{ + printf("col %d %d\n", m, k); +} +END + + +/************************************************** + * potrf_diag * + **************************************************/ +potrf_diag(k, i) + +// Execution space +i = 0 .. descA->mt-2 +k = i+1 .. descA->mt-1 + +// Parallel partitioning +: descA(k, k) + +// Parameters +READ T <- T potrf_dpotrf(i) +RW A <- (i == 0) ? descA(k, k) : A potrf_diag(k, i-1) + -> (k == i+1) ? T potrf_dpotrf(k) : A potrf_diag(k, i+1) + +BODY +{ + printf("diag %d, iteration %d\n", k, i); +} +END + diff --git a/tests/dsl/ptg/check_multisize_bcast/check_multisize_bcast_wrapper.c b/tests/dsl/ptg/check_multisize_bcast/check_multisize_bcast_wrapper.c new file mode 100644 index 000000000..b917a827f --- /dev/null +++ b/tests/dsl/ptg/check_multisize_bcast/check_multisize_bcast_wrapper.c @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2018-2022 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + */ + +#include "parsec/runtime.h" +#include "parsec/data_distribution.h" +#include "parsec/arena.h" + +#if defined(PARSEC_HAVE_MPI) +#include +#endif +#include + +#include "check_multisize_bcast.h" +#include "check_multisize_bcast_wrapper.h" + +/** + * @param [IN] A the data, already distributed and allocated + * @param [IN] nb matrix size + * @param [IN] nt tile size + * + * @return the parsec object to schedule. + */ +parsec_taskpool_t* check_multisize_bcast_new(parsec_tiled_matrix_dc_t *A, int nb, int nt) +{ + parsec_check_multisize_bcast_taskpool_t *tp = NULL; + + tp = parsec_check_multisize_bcast_new(A, nb, nt); + + /* As the datatype is parsec_datatype_int32_t all communications to/from + * this arena should use the count property or they will exchange a + * single integer. 
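+     * (Illustrative note: the arena element constructed below spans nb
+     * int32 values while its registered datatype describes a single one,
+     * so the deps in check_multisize_bcast.jdf carry explicit attributes,
+     * e.g. [layout = MPI_DOUBLE count = 2], to move more than one
+     * element per flow.)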
*/ + parsec_arena_datatype_construct(&tp->arenas_datatypes[PARSEC_check_multisize_bcast_DEFAULT_ADT_IDX], + nb*sizeof(int), PARSEC_ARENA_ALIGNMENT_SSE, + parsec_datatype_int32_t); + + return (parsec_taskpool_t*)tp; +} + +/** + * @param [INOUT] o the parsec object to destroy + */ +static void +check_multisize_bcast_destructor(parsec_check_multisize_bcast_taskpool_t *tp) +{ + parsec_matrix_del2arena(&tp->arenas_datatypes[PARSEC_check_multisize_bcast_DEFAULT_ADT_IDX]); +} + +PARSEC_OBJ_CLASS_INSTANCE(parsec_check_multisize_bcast_taskpool_t, parsec_taskpool_t, + NULL, check_multisize_bcast_destructor); diff --git a/tests/dsl/ptg/check_multisize_bcast/check_multisize_bcast_wrapper.h b/tests/dsl/ptg/check_multisize_bcast/check_multisize_bcast_wrapper.h new file mode 100644 index 000000000..2ce692636 --- /dev/null +++ b/tests/dsl/ptg/check_multisize_bcast/check_multisize_bcast_wrapper.h @@ -0,0 +1,6 @@ +#include "parsec/runtime.h" +#include "parsec/data_distribution.h" +#include "parsec/data_dist/matrix/matrix.h" + +parsec_taskpool_t *check_multisize_bcast_new(parsec_tiled_matrix_dc_t *A, int size, int nt); + diff --git a/tests/dsl/ptg/check_multisize_bcast/data_gen.c b/tests/dsl/ptg/check_multisize_bcast/data_gen.c new file mode 100644 index 000000000..68fc55559 --- /dev/null +++ b/tests/dsl/ptg/check_multisize_bcast/data_gen.c @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2018-2022 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + */ + +#include "parsec/runtime.h" +#include "data_gen.h" +#include "stdarg.h" +#include "parsec/data_dist/matrix/two_dim_rectangle_cyclic.h" + +#include +#include + +/* + * mb, whole matrix row/column number, mt, each tile row/column number + */ +parsec_tiled_matrix_dc_t *create_and_distribute_data(int rank, int world, int mb, int mt, int typesize) +{ + two_dim_block_cyclic_t *m = (two_dim_block_cyclic_t*)malloc(sizeof(two_dim_block_cyclic_t)); + + two_dim_block_cyclic_init(m, matrix_ComplexDouble, matrix_Tile, + rank, + mb, mb, + mt*mb, mt*mb, + 0, 0, + mt*mb, mt*mb, + 1, world, + 1, 1, 0, 0); + + m->mat = parsec_data_allocate((size_t)m->super.nb_local_tiles * + (size_t)m->super.bsiz * + (size_t)parsec_datadist_getsizeoftype(m->super.mtype)); + + return (parsec_tiled_matrix_dc_t*)m; +} + +void free_data(parsec_tiled_matrix_dc_t *d) +{ + parsec_data_collection_destroy(&d->super); + free(d); +} diff --git a/tests/dsl/ptg/check_multisize_bcast/data_gen.h b/tests/dsl/ptg/check_multisize_bcast/data_gen.h new file mode 100644 index 000000000..7d9f94dd2 --- /dev/null +++ b/tests/dsl/ptg/check_multisize_bcast/data_gen.h @@ -0,0 +1,10 @@ +#if !defined(_DATA_GEN_H_) +#define _DATA_GEN_H_ + +#include "parsec/runtime.h" +#include "parsec/data_dist/matrix/matrix.h" + +parsec_tiled_matrix_dc_t *create_and_distribute_data(int rank, int world, int nb, int nt, int typesize); +void free_data(parsec_tiled_matrix_dc_t *d); + +#endif diff --git a/tests/dsl/ptg/check_multisize_bcast/main.c b/tests/dsl/ptg/check_multisize_bcast/main.c new file mode 100644 index 000000000..ca0df98c0 --- /dev/null +++ b/tests/dsl/ptg/check_multisize_bcast/main.c @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2018-2022 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. 
+ */ + +#include + +#include "parsec/runtime.h" +#include "parsec/utils/debug.h" +#include "check_multisize_bcast_wrapper.h" +#if defined(PARSEC_HAVE_STRING_H) +#include +#endif /* defined(PARSEC_HAVE_STRING_H) */ +#include "data_gen.h" + +int main(int argc, char *argv[]) +{ + parsec_context_t* parsec; + int rank = 0, world = 1, cores = -1; + int nt = 2, nb = 16, rc; + parsec_tiled_matrix_dc_t *dcA; + parsec_taskpool_t *bcast; + +#if defined(PARSEC_HAVE_MPI) + { + int provided; + MPI_Init_thread(&argc, &argv, MPI_THREAD_SERIALIZED, &provided); + } + MPI_Comm_size(MPI_COMM_WORLD, &world); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); +#endif + if( argc > 1 ) { + char* endptr; + long val = strtol(argv[1], &endptr, 0); + if( endptr == argv[1] ) { + printf("Bad argument (found %s instead of the number of tiles)\n", argv[1]); + exit(-1); + } + nt = (int)val; + if( 0 == nt ) { + printf("Bad value for nt (it canot be zero) !!!\n"); + exit(-1); + } + } + + parsec = parsec_init(cores, &argc, &argv); + if( NULL == parsec ) { + exit(1); + } + + dcA = create_and_distribute_data(rank, world, nb, nt, sizeof(int)); + parsec_data_collection_set_key((parsec_data_collection_t *)dcA, "A"); + + bcast = check_multisize_bcast_new(dcA, nb, nt); + + rc = parsec_context_add_taskpool(parsec, bcast); + PARSEC_CHECK_ERROR(rc, "parsec_context_add_taskpool"); + rc = parsec_context_start(parsec); + PARSEC_CHECK_ERROR(rc, "parsec_context_start"); + rc = parsec_context_wait(parsec); + PARSEC_CHECK_ERROR(rc, "parsec_context_wait"); + + parsec_taskpool_free((parsec_taskpool_t*)bcast); + free_data(dcA); + + parsec_fini(&parsec); + +#ifdef PARSEC_HAVE_MPI + MPI_Finalize(); +#endif + + return 0; +} From 3d003500895a1857114348d495d8675067ef665c Mon Sep 17 00:00:00 2001 From: George Bosilca Date: Mon, 30 Jan 2023 23:00:27 -0500 Subject: [PATCH 02/15] Fixes a lot of typos. Signed-off-by: George Bosilca --- CMakeLists.txt | 2 +- parsec/interfaces/dtd/insert_function.c | 5 +++-- parsec/utils/debug.h | 8 +++++--- tests/apps/stencil/stencil_1D.jdf | 6 +++--- tests/apps/stencil/stencil_internal.c | 2 +- tests/apps/stencil/testing_stencil_1D.c | 4 ++-- tests/collections/reshape/avoidable_reshape.jdf | 4 ++-- tests/collections/reshape/local_no_reshape.jdf | 4 ++-- .../reshape/remote_multiple_outs_same_pred_flow.jdf | 2 +- .../remote_multiple_outs_same_pred_flow_multiple_deps.jdf | 2 +- tests/collections/reshape/testing_avoidable_reshape.c | 4 ++-- .../reshape/testing_input_dep_reshape_single_copy.c | 2 +- .../reshape/testing_remote_multiple_outs_same_pred_flow.c | 8 ++++---- tests/collections/reshape/testing_reshape.c | 6 +++--- 14 files changed, 31 insertions(+), 28 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 27da45a80..7935aa434 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -158,7 +158,7 @@ option(PARSEC_DIST_PRIORITIES option(PARSEC_DIST_COLLECTIVES "Use optimized asynchronous operations where collective communication pattern is detected" ON) set (PARSEC_DIST_SHORT_LIMIT 1 CACHE STRING - "Use the short protocol (no flow control) for messages smaller than the limit in KB. Performs better if smaller than the MTU") + "Use the short protocol (no flow control) for messages smaller than the limit in KB. 
Performs better if smaller than the MTU.") ### GPU engine parameters option(PARSEC_GPU_WITH_CUDA diff --git a/parsec/interfaces/dtd/insert_function.c b/parsec/interfaces/dtd/insert_function.c index 2d6877f02..3bc1a80d1 100644 --- a/parsec/interfaces/dtd/insert_function.c +++ b/parsec/interfaces/dtd/insert_function.c @@ -2015,10 +2015,11 @@ output_data_of_dtd_task(parsec_execution_stream_t *es, static int datatype_lookup_of_dtd_task(parsec_execution_stream_t *es, const parsec_task_t *this_task, + const parsec_task_t *parent_task, uint32_t *flow_mask, parsec_dep_data_description_t *data) { parsec_arena_datatype_t *adt; - (void)es; + (void)es; (void)parent_task; data->remote.src_count = data->remote.dst_count = 1; data->remote.src_displ = data->remote.dst_displ = 0; data->data_future = NULL; @@ -2181,7 +2182,7 @@ parsec_dtd_create_task_classv(const char *name, tc->release_deps = parsec_dtd_release_deps; tc->prepare_input = data_lookup_of_dtd_task; tc->prepare_output = output_data_of_dtd_task; - tc->get_datatype = (parsec_datatype_lookup_t *)datatype_lookup_of_dtd_task; + tc->get_datatype = datatype_lookup_of_dtd_task; tc->complete_execution = complete_hook_of_dtd; tc->release_task = parsec_release_dtd_task_to_mempool; diff --git a/parsec/utils/debug.h b/parsec/utils/debug.h index 7483565f6..ddda2352b 100644 --- a/parsec/utils/debug.h +++ b/parsec/utils/debug.h @@ -146,7 +146,7 @@ extern void (*parsec_weaksym_exit)(int status); "d@%05d " FMT " @%.20s:%-5d", parsec_debug_rank, \ ##__VA_ARGS__, __func__, __LINE__); \ } while(0) -#else +#else /* defined(PARSEC_DEBUG_HISTORY) */ #define PARSEC_DEBUG_VERBOSE(LVL, OUT, FMT, ...) do { \ parsec_output_verbose(LVL, OUT, \ "%.*sd@%05d%.*s " FMT " %.*s@%.64s:%-5d%.*s", \ @@ -155,8 +155,10 @@ extern void (*parsec_weaksym_exit)(int status); parsec_debug_colorize, "\x1B[36m", __func__, __LINE__, \ parsec_debug_colorize, "\033[0m"); \ } while(0) -#endif -#else +#endif /* defined(PARSEC_DEBUG_HISTORY) */ + +#else /* defined(PARSEC_DEBUG_NOISIER) */ + #define PARSEC_DEBUG_VERBOSE(...) do{} while(0) #endif /* defined(PARSEC_DEBUG_VERBOSE) */ diff --git a/tests/apps/stencil/stencil_1D.jdf b/tests/apps/stencil/stencil_1D.jdf index 5ea82fbe7..91b392107 100644 --- a/tests/apps/stencil/stencil_1D.jdf +++ b/tests/apps/stencil/stencil_1D.jdf @@ -1,19 +1,19 @@ extern "C" %{ /* - * Copyright (c) 2019-2021 The Universiy of Tennessee and The Universiy + * Copyright (c) 2019-2021 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. */ #include "stencil_internal.h" -/* Automatically check datetype */ +/* Automatically check datatype */ const int sizeof_datatype = sizeof(DTYPE); /** * @brief stencil_1D copy data to ghost region * * @param [out] A0: output data - * @param [in] AL: letf input data + * @param [in] AL: left input data * @param [in] AR: right input data * @param [in] MB: row tile size * @param [in] NB: column tile size diff --git a/tests/apps/stencil/stencil_internal.c b/tests/apps/stencil/stencil_internal.c index 05189ead6..e77024c94 100644 --- a/tests/apps/stencil/stencil_internal.c +++ b/tests/apps/stencil/stencil_internal.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021 The Universiy of Tennessee and The Universiy + * Copyright (c) 2019-2021 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. 
*/ diff --git a/tests/apps/stencil/testing_stencil_1D.c b/tests/apps/stencil/testing_stencil_1D.c index 46d1a8e3d..8541bcc3b 100644 --- a/tests/apps/stencil/testing_stencil_1D.c +++ b/tests/apps/stencil/testing_stencil_1D.c @@ -163,7 +163,7 @@ int main(int argc, char *argv[]) (parsec_tiled_matrix_t *)&dcA, (parsec_tiled_matrix_unary_op_t)stencil_1D_init_ops, op_args); - /* intialize weight_1D */ + /* initialize weight_1D */ weight_1D = (DTYPE *)malloc(sizeof(DTYPE) * (2*R+1)); for(jj = 1; jj <= R; jj++) { @@ -172,7 +172,7 @@ int main(int argc, char *argv[]) } WEIGHT_1D(0) = (DTYPE)1.0; - /* Generete LOOPGEN Kernel */ + /* Generate LOOPGEN Kernel */ #if LOOPGEN if( 0 == rank ){ char command[50]; diff --git a/tests/collections/reshape/avoidable_reshape.jdf b/tests/collections/reshape/avoidable_reshape.jdf index 77d50721d..44b939a7e 100644 --- a/tests/collections/reshape/avoidable_reshape.jdf +++ b/tests/collections/reshape/avoidable_reshape.jdf @@ -9,8 +9,8 @@ extern "C" %{ /******************* * No local reshape - * When only type_remote is used on the dependencies, the pointer to the origianl - * matrix tiles is passed to the succesors tasks. Thus, the full original tiles are + * When only type_remote is used on the dependencies, the pointer to the original + * matrix tiles is passed to the successors tasks. Thus, the full original tiles are * set to 0. *******************/ diff --git a/tests/collections/reshape/local_no_reshape.jdf b/tests/collections/reshape/local_no_reshape.jdf index a9633678a..7911d1f4d 100644 --- a/tests/collections/reshape/local_no_reshape.jdf +++ b/tests/collections/reshape/local_no_reshape.jdf @@ -9,8 +9,8 @@ extern "C" %{ /******************* * No local reshape - * When only type_remote is used on the dependencies, the pointer to the origianl - * matrix tiles is passed to the succesors tasks. Thus, the full original tiles are + * When only type_remote is used on the dependencies, the pointer to the original + * matrix tiles is passed to the successors tasks. Thus, the full original tiles are * set to 0. *******************/ diff --git a/tests/collections/reshape/remote_multiple_outs_same_pred_flow.jdf b/tests/collections/reshape/remote_multiple_outs_same_pred_flow.jdf index 89b480ccc..63a05572f 100644 --- a/tests/collections/reshape/remote_multiple_outs_same_pred_flow.jdf +++ b/tests/collections/reshape/remote_multiple_outs_same_pred_flow.jdf @@ -46,7 +46,7 @@ k = 0 .. descM->nt-1 RW A <- A READ_A(m) [type_remote=UPPER_TILE] - -> descM(m, k) [type=DEFAULT type_data=DEFAULT] //Write back full tile, otherwises remote has UPPER_TILE + -> descM(m, k) [type=DEFAULT type_data=DEFAULT] //Write back full tile, otherwise remote has UPPER_TILE READ B <- A READ_A(m) [type_remote=LOWER_TILE] diff --git a/tests/collections/reshape/remote_multiple_outs_same_pred_flow_multiple_deps.jdf b/tests/collections/reshape/remote_multiple_outs_same_pred_flow_multiple_deps.jdf index 4b6dae74d..ab7735de7 100644 --- a/tests/collections/reshape/remote_multiple_outs_same_pred_flow_multiple_deps.jdf +++ b/tests/collections/reshape/remote_multiple_outs_same_pred_flow_multiple_deps.jdf @@ -88,7 +88,7 @@ k = 0 .. 
descM->nt-1 RW A <- A READ_A(m) [type_remote=UPPER_TILE] - -> descM(m, k) [type=DEFAULT type_data=DEFAULT] //Write back full tile, otherwises remote has UPPER_TILE + -> descM(m, k) [type=DEFAULT type_data=DEFAULT] //Write back full tile, otherwise remote has UPPER_TILE READ B <- A READ_A(m) [type_remote=LOWER_TILE] diff --git a/tests/collections/reshape/testing_avoidable_reshape.c b/tests/collections/reshape/testing_avoidable_reshape.c index bb36283ef..365eaa7f9 100644 --- a/tests/collections/reshape/testing_avoidable_reshape.c +++ b/tests/collections/reshape/testing_avoidable_reshape.c @@ -67,7 +67,7 @@ int main(int argc, char *argv[]) /******************* - * Doing avoidable reshape becasue dc datatype differs from default ADT. + * Doing avoidable reshape because dc datatype differs from default ADT. *******************/ op_args = (int *)malloc(sizeof(int)); op_args[0] = 0; @@ -88,7 +88,7 @@ int main(int argc, char *argv[]) ctp->arenas_datatypes[PARSEC_avoidable_reshape_DEFAULT_ADT_IDX] = adt_default; #ifdef AVOID_UNNECESSARY_RESHAPING - /* Can be avoided by setting the datacollection type as the default adt*/ + /* Can be avoided by setting the data collection type as the default adt*/ parsec_datatype_t tmp = adt_default.opaque_dtt; ctp->arenas_datatypes[PARSEC_avoidable_reshape_DEFAULT_ADT_IDX].opaque_dtt = dcA.super.super.default_dtt; #endif diff --git a/tests/collections/reshape/testing_input_dep_reshape_single_copy.c b/tests/collections/reshape/testing_input_dep_reshape_single_copy.c index 3ecf8b129..ad2f8a5c4 100644 --- a/tests/collections/reshape/testing_input_dep_reshape_single_copy.c +++ b/tests/collections/reshape/testing_input_dep_reshape_single_copy.c @@ -16,7 +16,7 @@ #include "input_dep_single_copy_reshape.h" /* Program to test the different reshaping functionalities - * Each different test is comented on the main program. + * Each different test is commented on the main program. */ int main(int argc, char *argv[]) diff --git a/tests/collections/reshape/testing_remote_multiple_outs_same_pred_flow.c b/tests/collections/reshape/testing_remote_multiple_outs_same_pred_flow.c index 9d707e544..5366cb91b 100644 --- a/tests/collections/reshape/testing_remote_multiple_outs_same_pred_flow.c +++ b/tests/collections/reshape/testing_remote_multiple_outs_same_pred_flow.c @@ -27,13 +27,13 @@ * Currently, PaRSEC doesn't support this scenario using SHORT. * In this case, two datas are included on the activation message, and * after reception on the receiver, the predecessor task is faked and - * iterate_sucessors of the predecessor task is run only ONCE, therefore, - * for one sucessors the flow will contain incorrect data. + * iterate_successors of the predecessor task is run only ONCE, therefore, + * for one successors the flow will contain incorrect data. * - * When SHORT is not used, PaRSEC runs iterate_sucessors for each data received, + * When SHORT is not used, PaRSEC runs iterate_successors for each data received, * thus, always the correct data is used. * This gives the chance to the reshaping mechanism to put the first data on - * the predecessor's repo, and any subsequent data on the sucessors repo, avoiding + * the predecessor's repo, and any subsequent data on the successors repo, avoiding * any overwrites. 
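 * (Sketch of the failing SHORT case in terms of this test's flows, as
 *  declared in the accompanying JDF: READ_A forwards one flow as
 *  [type_remote=UPPER_TILE] to A and as [type_remote=LOWER_TILE] to B;
 *  with both payloads embedded in a single activation message, one
 *  iterate_successors pass can only bind one of the two received datas.)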
 */

diff --git a/tests/collections/reshape/testing_reshape.c b/tests/collections/reshape/testing_reshape.c
index 6056c2200..17cb578dd 100644
--- a/tests/collections/reshape/testing_reshape.c
+++ b/tests/collections/reshape/testing_reshape.c
@@ -22,7 +22,7 @@
 #include "local_input_LU_LL.h"
 
 /* Program to test the different reshaping functionalities
- * Each different test is comented on the main program.
+ * Each different test is commented on the main program.
  */
 
 int main(int argc, char *argv[])
@@ -70,8 +70,8 @@ int main(int argc, char *argv[])
 
     /*******************
      * No local reshape
-     * When only type_remote is used on the dependencies, the pointer to the origianl
-     * matrix tiles is passed to the succesors tasks. Thus, the full original tiles are
+     * When only type_remote is used on the dependencies, the pointer to the original
+     * matrix tiles is passed to the successors tasks. Thus, the full original tiles are
      * set to 0.
      *******************/
     op_args = (int *)malloc(sizeof(int));

From 2114418c410c70147ffb413b7fd1fbf19c3cac03 Mon Sep 17 00:00:00 2001
From: George Bosilca
Date: Mon, 30 Jan 2023 23:01:26 -0500
Subject: [PATCH 03/15] Variable size data and short protocol

1. Allow the sender to send data of any size. The sender can send less
   data than expected by the receiver. If we select the communication
   protocol based on how much data the receiver expects, the sender and
   receiver could diverge on the protocol to be used (mainly visible
   when the sent message is below eager while the receiver expects more
   data). So, force the sender to pack the amount of data per dep, and
   force the receiver to abide by this amount.
2. Bring back the short message protocol (aka data embedded directly
   into the activation message).
 - enable propagation of the data sizes. These sizes are now part of
   the activation message, but unfortunately they disappear after the
   first call to get_datatype, and there is no way to retrieve them
   afterwards. The side effect is that the get_datatype function should
   only be called once, and this is not the case today (the reshape
   code makes heavy use of it).
 - clean up the reshape code. I don't understand it; it adds a lot of
   overhead on the critical path (a lot of initializations of useless
   structures, and a few function calls). I barely tried to minimize
   the overheads, but at some point we need to understand this code and
   make it more user friendly.
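
An illustrative sketch of the resulting activation buffer layout (the
bracketed diagram is descriptive, not an exact wire format):

    [ activate header | TDM piggyback | n | size_1 .. size_n | short payloads ]

    /* receiver side, as in remote_dep_mpi_recv_activate(): */
    uint32_t *data_sizes = (uint32_t*)(packed_buffer + position);
    /* data_sizes[0] holds n, the number of data deps for this peer;
     * data_sizes[1..n] hold the packed size the sender committed to. */
    position += (data_sizes[0] + 1) * sizeof(uint32_t);

The receiver truncates each expected count to the advertised size,
falling back to a byte datatype when the advertised size is not a
multiple of the expected element size.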
Signed-off-by: George Bosilca --- parsec/interfaces/ptg/ptg-compiler/jdf2c.c | 125 ++++------ parsec/parsec_internal.h | 1 + parsec/parsec_reshape.c | 8 +- parsec/remote_dep.c | 4 +- parsec/remote_dep.h | 14 +- parsec/remote_dep_mpi.c | 219 +++++++++++++----- tests/apps/stencil/stencil_internal.h | 2 +- .../check_multisize_bcast.jdf | 30 ++- .../check_multisize_bcast_wrapper.c | 15 +- .../check_multisize_bcast_wrapper.h | 5 +- .../dsl/ptg/check_multisize_bcast/data_gen.c | 27 ++- .../dsl/ptg/check_multisize_bcast/data_gen.h | 6 +- tests/dsl/ptg/check_multisize_bcast/main.c | 8 +- 13 files changed, 258 insertions(+), 206 deletions(-) diff --git a/parsec/interfaces/ptg/ptg-compiler/jdf2c.c b/parsec/interfaces/ptg/ptg-compiler/jdf2c.c index 1cf949484..520a12d0e 100644 --- a/parsec/interfaces/ptg/ptg-compiler/jdf2c.c +++ b/parsec/interfaces/ptg/ptg-compiler/jdf2c.c @@ -4296,7 +4296,7 @@ static void jdf_generate_one_function( const jdf_t *jdf, jdf_function_entry_t *f string_arena_add_string(sa, " .prepare_input = (parsec_hook_t*)%s,\n", prefix); sprintf(prefix, "datatype_lookup_of_%s_%s", jdf_basename, f->fname); jdf_generate_code_datatype_lookup(jdf, f, prefix); - string_arena_add_string(sa, " .get_datatype = (parsec_datatype_lookup_t*)%s,\n", prefix); + string_arena_add_string(sa, " .get_datatype = %s,\n", prefix); sprintf(prefix, "hook_of_%s_%s", jdf_basename, f->fname); jdf_generate_code_hooks(jdf, f, prefix); @@ -5039,9 +5039,6 @@ jdf_generate_code_fillup_datatypes(string_arena_t * sa_tmp_arena, string_are { /* Prepare the memory layout of the output dependency. */ if( dump_all || (sa_arena == NULL) || strcmp(string_arena_get_string(sa_tmp_arena), string_arena_get_string(sa_arena))) { - if( strcmp(target, "local") == 0) { - if(sa_arena != NULL) string_arena_add_string(sa_out, " data%sdata_future = NULL;\n", access); - } /* The type might change (possibly from undefined), so let's output */ if(sa_arena != NULL){ string_arena_init(sa_arena); @@ -5169,7 +5166,9 @@ jdf_generate_code_reshape_input_from_desc(const jdf_t *jdf, * reshaping of the future. We mark that with NULL and MPI_DATYPE_NULL. * Otherwise, we use SRC and DST dt. 
*/ - coutput("%s data.data = chunk;", spaces); + coutput("%s data.data = chunk;\n" + "%s data.data_future = NULL;\n", + spaces, spaces); reshape_dtt_dst = dl->datatype_local; reshape_dtt_src = dl->datatype_data; @@ -6269,13 +6268,15 @@ jdf_generate_code_datatype_lookup(const jdf_t *jdf, ai.sa = sa2; ai.holder = "this_task->locals."; ai.expr = NULL; - coutput("static int %s(parsec_execution_stream_t *es, const %s *this_task,\n" + coutput("static int %s(parsec_execution_stream_t *es, const parsec_task_t *this_generic_task, const parsec_task_t *parent_task,\n" " uint32_t* flow_mask, parsec_dep_data_description_t* data)\n" "{\n" + " const %s *this_task = (%s*)this_generic_task;\n" " const __parsec_%s_internal_taskpool_t *__parsec_tp = (__parsec_%s_internal_taskpool_t *)this_task->taskpool;\n" - " (void)__parsec_tp; (void)es; (void)this_task; (void)data;\n" + " (void)__parsec_tp; (void)es; (void)this_task; (void)data; (void)parent_task;\n" "%s", - name, parsec_get_name(jdf, f, "task_t"), + name, + parsec_get_name(jdf, f, "task_t"), parsec_get_name(jdf, f, "task_t"), jdf_basename, jdf_basename, UTIL_DUMP_LIST(sa, f->locals, next, dump_local_assignments, &ai, "", " ", "\n", "\n")); @@ -6400,12 +6401,8 @@ jdf_generate_code_datatype_lookup(const jdf_t *jdf, coutput(" no_mask_match:\n"); coutput(" data->data = NULL;\n" - " data->local.arena = data->remote.arena = NULL;\n" - " data->local.src_datatype = data->local.dst_datatype = PARSEC_DATATYPE_NULL;\n" " data->remote.src_datatype = data->remote.dst_datatype = PARSEC_DATATYPE_NULL;\n" - " data->local.src_count = data->local.dst_count = 0;\n" " data->remote.src_count = data->remote.dst_count = 0;\n" - " data->local.src_displ = data->local.dst_displ = 0;\n" " data->remote.src_displ = data->remote.dst_displ = 0;\n" " data->data_future = NULL;\n" " (*flow_mask) = 0; /* nothing left */\n" @@ -7611,21 +7608,20 @@ static void jdf_check_relatives( jdf_function_entry_t *f, jdf_dep_flags_t flow_t #define OUTPUT_PREV_DEPS(MASK, SA_DATATYPE, SA_DEPS) \ if( strlen(string_arena_get_string((SA_DEPS))) ) { \ - if( strlen(string_arena_get_string((SA_DATATYPE))) ) { \ - string_arena_add_string(sa_coutput, \ - " %s", \ - string_arena_get_string((SA_DATATYPE))); \ - } \ if( (JDF_DEP_FLOW_OUT & flow_type) && fl->flow_dep_mask_out == (MASK) ) { \ string_arena_add_string(sa_coutput, \ - " %s", \ + " %s" \ + " %s", \ + string_arena_get_string((SA_DATATYPE)), \ string_arena_get_string((SA_DEPS))); \ } else { \ string_arena_add_string(sa_coutput, \ " if( action_mask & 0x%x ) {\n" \ " %s" \ + " %s" \ " }\n", \ - MASK, string_arena_get_string((SA_DEPS))); \ + MASK, string_arena_get_string((SA_DATATYPE)), \ + string_arena_get_string((SA_DEPS))); \ } \ string_arena_init((SA_DEPS)); \ string_arena_init((SA_DATATYPE)); \ @@ -7647,23 +7643,15 @@ jdf_generate_code_iterate_successors_or_predecessors(const jdf_t *jdf, string_arena_t *sa_deps = string_arena_new(1024); string_arena_t *sa_datatype = string_arena_new(1024); - string_arena_t *sa_arena = string_arena_new(256); - string_arena_t *sa_tmp_arena = string_arena_new(256); - string_arena_t *sa_count = string_arena_new(256); + string_arena_t *sa_tmp_arena = string_arena_new(256); string_arena_t *sa_tmp_count = string_arena_new(256); - string_arena_t *sa_displ = string_arena_new(256); string_arena_t *sa_tmp_displ = string_arena_new(256); - string_arena_t *sa_type = string_arena_new(256); string_arena_t *sa_tmp_type = string_arena_new(256); string_arena_t *sa_temp = string_arena_new(1024); - string_arena_t *sa_arena_r = 
string_arena_new(256); - string_arena_t *sa_tmp_arena_r = string_arena_new(256); - string_arena_t *sa_count_r = string_arena_new(256); + string_arena_t *sa_tmp_arena_r = string_arena_new(256); string_arena_t *sa_tmp_count_r = string_arena_new(256); - string_arena_t *sa_displ_r = string_arena_new(256); string_arena_t *sa_tmp_displ_r = string_arena_new(256); - string_arena_t *sa_type_r = string_arena_new(256); string_arena_t *sa_tmp_type_r = string_arena_new(256); string_arena_t *sa_temp_r = string_arena_new(1024); @@ -7722,18 +7710,11 @@ jdf_generate_code_iterate_successors_or_predecessors(const jdf_t *jdf, string_arena_init(sa_coutput); string_arena_init(sa_deps); string_arena_init(sa_datatype); - string_arena_init(sa_arena); - string_arena_init(sa_count); - string_arena_init(sa_displ); - string_arena_init(sa_type); - string_arena_init(sa_arena_r); - string_arena_init(sa_count_r); - string_arena_init(sa_displ_r); - string_arena_init(sa_type_r); nb_open_ldef = 0; string_arena_add_string(sa_coutput, - " data.data = this_task->data._f_%s.data_out;\n", + " data.data = this_task->data._f_%s.data_out;\n" + " data.data_future = NULL;\n", fl->varname); for(dl = fl->deps; dl != NULL; dl = dl->next) { @@ -7752,16 +7733,9 @@ jdf_generate_code_iterate_successors_or_predecessors(const jdf_t *jdf, string_arena_init(sa_tmp_type_r); string_arena_init(sa_tmp_displ_r); - + string_arena_init(sa_datatype); /*********************************/ /* LOCAL DATATYPE FOR RESHAPPING */ - /* always checked if !=; deps are - * grouped by type_remote, thus no - * checking for last_datatype_idx - * dl->dep_datatype_index - * Change that to minimize the - * number of reshapings? /!\ - */ /*********************************/ if( JDF_FLOW_TYPE_CTL & fl->flow_flags ) { string_arena_add_string(sa_tmp_arena, "NULL"); @@ -7787,10 +7761,10 @@ jdf_generate_code_iterate_successors_or_predecessors(const jdf_t *jdf, && (NULL == reshape_dtt.layout) ){ /* User didn't specify a custom layout*/ string_arena_add_string(sa_tmp_type, "PARSEC_DATATYPE_NULL"); - }else{ + } else { if( NULL == reshape_dtt.layout ){ /* User didn't specify a custom layout*/ string_arena_add_string(sa_tmp_type, "%s->opaque_dtt", string_arena_get_string(sa_temp)); - }else{ + } else { string_arena_add_string(sa_tmp_type, "%s", dump_expr((void**)reshape_dtt.layout, &info)); } } @@ -7800,17 +7774,6 @@ jdf_generate_code_iterate_successors_or_predecessors(const jdf_t *jdf, string_arena_add_string(sa_tmp_displ, "%s", dump_expr((void**)reshape_dtt.displ, &info)); } - string_arena_add_string(sa_datatype," if (action_mask & (PARSEC_ACTION_RESHAPE_ON_RELEASE | PARSEC_ACTION_RESHAPE_REMOTE_ON_RELEASE | PARSEC_ACTION_SEND_REMOTE_DEPS)) {\n"); - jdf_generate_code_fillup_datatypes(sa_tmp_arena, sa_arena, - sa_tmp_type, sa_type, - sa_tmp_displ, sa_displ, - sa_tmp_count, sa_count, - NULL, - NULL, - NULL, - sa_datatype, - ".", "local", 0); - /* Generate the remote datatype info only during releasing deps of * a real task. That is, avoid it when after reception during * release deps of a fake predecessor task. @@ -7836,10 +7799,10 @@ jdf_generate_code_iterate_successors_or_predecessors(const jdf_t *jdf, * of iterate_successors -> get_datatype to recv the data; we are running over "fake predecessor task" * and the goal is to check the successor datatype) */ string_arena_add_string(sa_tmp_type_r, "(data.data != NULL ? 
data.data->dtt : PARSEC_DATATYPE_NULL )"); - }else{ + } else { if( NULL == dl->datatype_remote.layout ){ /* User didn't specify a custom layout*/ string_arena_add_string(sa_tmp_type_r, "%s->opaque_dtt", string_arena_get_string(sa_temp)); - }else{ + } else { string_arena_add_string(sa_tmp_type_r, "%s", dump_expr((void**)dl->datatype_remote.layout, &info)); } } @@ -7848,20 +7811,26 @@ jdf_generate_code_iterate_successors_or_predecessors(const jdf_t *jdf, string_arena_add_string(sa_tmp_displ_r, "%s", dump_expr((void**)dl->datatype_remote.displ, &info)); } - if( last_datatype_idx != dl->dep_datatype_index ) { - jdf_generate_code_fillup_datatypes(sa_tmp_arena_r, sa_arena_r, - sa_tmp_type_r, sa_type_r, - sa_tmp_displ_r, sa_displ_r, - sa_tmp_count_r, sa_count_r, - NULL, - NULL, - NULL, - sa_datatype, - ".", "remote", 0); - - last_datatype_idx = dl->dep_datatype_index; + string_arena_add_string(sa_datatype," if (action_mask & (PARSEC_ACTION_RESHAPE_ON_RELEASE | PARSEC_ACTION_RESHAPE_REMOTE_ON_RELEASE | PARSEC_ACTION_SEND_REMOTE_DEPS)) {\n"); + jdf_generate_code_fillup_datatypes(sa_tmp_arena, NULL, + sa_tmp_type, NULL, + sa_tmp_displ, NULL, + sa_tmp_count, NULL, + NULL, + NULL, + NULL, + sa_datatype, + ".", "local", 0); + jdf_generate_code_fillup_datatypes(sa_tmp_arena_r, NULL, + sa_tmp_type_r, NULL, + sa_tmp_displ_r, NULL, + sa_tmp_count_r, NULL, + NULL, + NULL, + NULL, + sa_datatype, + ".", "remote", 0); - } //end if of string_arena_add_string(sa_datatype," if (action_mask & (PARSEC_ACTION_RESHAPE_ON_RELEASE | PARSEC_ACTION_RESHAPE_REMOTE_ON_RELEASE | PARSEC_ACTION_SEND_REMOTE_DEPS)) {\n"); string_arena_add_string(sa_datatype," }\n"); @@ -8035,23 +8004,15 @@ jdf_generate_code_iterate_successors_or_predecessors(const jdf_t *jdf, string_arena_free(sa_coutput); string_arena_free(sa_deps); string_arena_free(sa_datatype); - string_arena_free(sa_arena); string_arena_free(sa_tmp_arena); - string_arena_free(sa_count); string_arena_free(sa_tmp_count); - string_arena_free(sa_displ); string_arena_free(sa_tmp_displ); - string_arena_free(sa_type); string_arena_free(sa_tmp_type); string_arena_free(sa_temp); - string_arena_free(sa_arena_r); string_arena_free(sa_tmp_arena_r); - string_arena_free(sa_count_r); string_arena_free(sa_tmp_count_r); - string_arena_free(sa_displ_r); string_arena_free(sa_tmp_displ_r); - string_arena_free(sa_type_r); string_arena_free(sa_tmp_type_r); string_arena_free(sa_temp_r); diff --git a/parsec/parsec_internal.h b/parsec/parsec_internal.h index 2bf4b7382..c6d472374 100644 --- a/parsec/parsec_internal.h +++ b/parsec/parsec_internal.h @@ -304,6 +304,7 @@ typedef float (parsec_evaluate_function_t)(const parsec_task_t* task); */ typedef int (parsec_datatype_lookup_t)(struct parsec_execution_stream_s* es, const parsec_task_t * this_task, + const parsec_task_t * parent_task, uint32_t * flow_mask, parsec_dep_data_description_t * data); diff --git a/parsec/parsec_reshape.c b/parsec/parsec_reshape.c index 0fd6d5779..c2bc4846b 100644 --- a/parsec/parsec_reshape.c +++ b/parsec/parsec_reshape.c @@ -356,7 +356,7 @@ parsec_create_reshape_promise(parsec_execution_stream_t *es, * Setting up reshape promises shared among local or remote successors. * Two scenarios: * - No reshaping needed: fulfilled promise set up on predecessor repo. - * - Reshaping needed: unfullfilled promise set up on the predecessor if + * - Reshaping needed: unfulfilled promise set up on the predecessor if * the repo is free, otherwise on the successors repo. * * * (data->local.*_count == 0) corresponds to CTL flow. 
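 * (Illustrative example, using the types of the reshape tests: a
 *  successor declaring [type = UPPER_TILE] on a flow whose predecessor
 *  produced a full DEFAULT tile falls in the second scenario and gets an
 *  unfulfilled promise; a successor reusing the predecessor type gets a
 *  fulfilled one.)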
@@ -364,7 +364,7 @@ parsec_create_reshape_promise(parsec_execution_stream_t *es, * * During release_deps_of a fake remote predecessor (from which this node has * received data) this routine detects PARSEC_DATATYPE_PACKED and generates - * the appropriate reshape promises for the successsors reception datatypes. + * the appropriate reshape promises for the successors reception datatypes. * * NOTE: flow dependencies are ordered by type & type_remote and * type=UNDEFINED (no reshape, fulfilled promise) are placed as the first @@ -441,7 +441,7 @@ parsec_set_up_reshape_promise(parsec_execution_stream_t *es, int dsize; parsec_dep_data_description_t aux_data; - if ( PARSEC_HOOK_RETURN_DONE == fct->get_datatype(es, newcontext, &flow_mask, &aux_data)){ + if ( PARSEC_HOOK_RETURN_DONE == fct->get_datatype(es, newcontext, oldcontext, &flow_mask, &aux_data)){ parsec_fatal("Unable to find unpacking datatype."); } data->local = aux_data.remote; @@ -451,7 +451,7 @@ parsec_set_up_reshape_promise(parsec_execution_stream_t *es, /* Check if the previous future set up on iterate successor is tracking the same * data with the same reshaping. This can not be the case when after a reception, - * as we may generate different reshapings from PACKED to successors remote_type. + * as we may generate different reshaping from PACKED to successors remote_type. * (data->data_future is only clean up during iterate_successors when the predecessor * remote type changes, there's no info about the successor remote type). */ diff --git a/parsec/remote_dep.c b/parsec/remote_dep.c index d8dc381c2..8a38d7fd6 100644 --- a/parsec/remote_dep.c +++ b/parsec/remote_dep.c @@ -502,9 +502,9 @@ int parsec_remote_dep_activate(parsec_execution_stream_t* es, */ if( (remote_deps->outgoing_mask & (1U<data.data) ) { /* if propagated and not a CONTROL */ - /* This assert is not correct anymore, we don't need and arena to send to a remote + /* This assert is not correct anymore, we don't need an arena to send to a remote * assert(NULL != output->data.remote.arena);*/ - assert( !parsec_is_CTL_dep(output->data) ); + assert( !parsec_is_CTL_dep(&output->data) ); PARSEC_OBJ_RETAIN(output->data.data); } diff --git a/parsec/remote_dep.h b/parsec/remote_dep.h index ba7d59089..d38af6e6e 100644 --- a/parsec/remote_dep.h +++ b/parsec/remote_dep.h @@ -66,7 +66,7 @@ struct parsec_dep_type_description_s { }; /** - * This structure holds the key information for any data mouvement. It contains the arena + * This structure holds the key information for any data movement. It contains the arena * where the data is allocated from, or will be allocated from. It also contains the * pointer to the buffer involved in the communication (or NULL if the data will be * allocated before the reception). 
Finally, it contains the triplet allowing a correct send @@ -235,14 +235,14 @@ int parsec_remote_dep_propagate(parsec_execution_stream_t* es, #endif /* DISTRIBUTED */ /* check if this data description represents a CTL dependency */ -#define parsec_is_CTL_dep(dep_data_desc)\ - ((dep_data_desc.data == NULL) \ - && (dep_data_desc.remote.src_datatype == PARSEC_DATATYPE_NULL) \ - && (0 == dep_data_desc.remote.src_count)) +#define parsec_is_CTL_dep(PDEP_DATA_DESC)\ + ((NULL == (PDEP_DATA_DESC)->data) \ + && (PARSEC_DATATYPE_NULL == (PDEP_DATA_DESC)->remote.src_datatype) \ + && (0 == (PDEP_DATA_DESC)->remote.src_count)) /* set this data description to CTL dependency */ -#define parsec_set_CTL_dep(dep_data_desc)\ - dep_data_desc.data = NULL; dep_data_desc.remote.src_datatype = PARSEC_DATATYPE_NULL; dep_data_desc.remote.src_count=0; +#define parsec_set_CTL_dep(PDEP_DATA_DESC)\ + { (PDEP_DATA_DESC)->data = NULL; (PDEP_DATA_DESC)->remote.src_datatype = PARSEC_DATATYPE_NULL; (PDEP_DATA_DESC)->remote.src_count=0; } /** @} */ diff --git a/parsec/remote_dep_mpi.c b/parsec/remote_dep_mpi.c index a55081966..5f4f0203a 100644 --- a/parsec/remote_dep_mpi.c +++ b/parsec/remote_dep_mpi.c @@ -486,6 +486,7 @@ void* remote_dep_dequeue_main(parsec_context_t* context) /* The MPI thread is owning the lock */ assert( parsec_communication_engine_up == 2 ); + remote_dep_mpi_on(context); /* acknowledge the activation */ parsec_communication_engine_up = 3; @@ -754,7 +755,7 @@ remote_dep_mpi_retrieve_datatype(parsec_execution_stream_t *eu, /* Extract the datatype, count and displacement from the target task */ - fct->get_datatype(eu, newcontext, &flow_mask, &output->data); + fct->get_datatype(eu, newcontext, oldcontext, &flow_mask, &output->data); /* Checking PARSEC_HOOK_RETURN_DONE == fct->get_datatype and setting * output->data to *out_data is no longer valid. 
* Now, send operation can rely on the datacopy dtt, @@ -786,24 +787,23 @@ remote_dep_mpi_retrieve_datatype(parsec_execution_stream_t *eu, deps->incoming_mask |= (1U << dep->dep_datatype_index); deps->root = src_rank; - if(output->data.remote.dst_count == 0){ + if(output->data.remote.dst_count == 0) { /* control dep */ return PARSEC_ITERATE_STOP; } - if(old_dtt != PARSEC_DATATYPE_NULL){ - if(old_dtt != output->data.remote.dst_datatype){ + if(old_dtt != PARSEC_DATATYPE_NULL) { + if(old_dtt != output->data.remote.dst_datatype) { #if defined(PARSEC_DEBUG_NOISIER) - char type_name_src[MAX_TASK_STRLEN] = "NULL"; - char type_name_dst[MAX_TASK_STRLEN] = "NULL"; - int len; - if(old_dtt!=PARSEC_DATATYPE_NULL) MPI_Type_get_name(old_dtt, type_name_src, &len); - if(output->data.remote.dst_datatype!=PARSEC_DATATYPE_NULL) MPI_Type_get_name(output->data.remote.dst_datatype, type_name_dst, &len); - PARSEC_DEBUG_VERBOSE(30, parsec_comm_output_stream, "MPI: retrieve dtt for %s [dep_datatype_index %x] DTT: old %s new %s (%p) --> PACKED", + char type_name_src[MAX_TASK_STRLEN] = "NULL"; + char type_name_dst[MAX_TASK_STRLEN] = "NULL"; + int len; + if(old_dtt!=PARSEC_DATATYPE_NULL) MPI_Type_get_name(old_dtt, type_name_src, &len); + if(output->data.remote.dst_datatype!=PARSEC_DATATYPE_NULL) MPI_Type_get_name(output->data.remote.dst_datatype, type_name_dst, &len); + PARSEC_DEBUG_VERBOSE(30, parsec_comm_output_stream, "MPI: retrieve dtt for %s [dep_datatype_index %x] DTT: old %s new %s (%p) --> PACKED", newcontext->task_class->name, dep->dep_datatype_index, type_name_src, type_name_dst, output->data.remote.dst_datatype); #endif - // TODO JS: implement MPI_Pack_size int dsize; - MPI_Pack_size(output->data.remote.dst_count, output->data.remote.dst_datatype, MPI_COMM_WORLD, &dsize); + parsec_ce.pack_size(&parsec_ce, output->data.remote.dst_count, output->data.remote.dst_datatype, &dsize); output->data.remote.src_count = output->data.remote.dst_count = dsize; output->data.remote.src_datatype = output->data.remote.dst_datatype = PARSEC_DATATYPE_PACKED; @@ -839,7 +839,7 @@ remote_dep_get_datatypes(parsec_execution_stream_t* es, assert(NULL == origin->taskpool); origin->taskpool = parsec_taskpool_lookup(origin->msg.taskpool_id); if( NULL == origin->taskpool ) - return -1; /* the parsec taskpool doesn't exist yet */ + return -1; /* the taskpool doesn't exist yet locally */ /* This function is divided into DTD and PTG's logic */ if( PARSEC_TASKPOOL_TYPE_DTD == origin->taskpool->taskpool_type ) { @@ -860,8 +860,6 @@ remote_dep_get_datatypes(parsec_execution_stream_t* es, if(!(origin->msg.output_mask & (1U<msg.locals[0].value<<32 | (1U<task_hash_table, (parsec_key_t)key, &kh); @@ -879,13 +877,13 @@ remote_dep_get_datatypes(parsec_execution_stream_t* es, * time or not. Since, this function is called from other places (when * we later try to activate a task for which we have already received * an activation for) we do not need to store the buffer and we send - * PARSEC_DTD_SKIP_SAVING as an indication of that. + * PARSEC_DTD_SKIP_SAVING as an indicator of that. 
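+ * The buffer saved below is re-driven later: once the DTD task is
+ * locally discovered, remote_dep_mpi_release_delayed_deps() pulls it
+ * back from the temporary storage and calls remote_dep_get_datatypes()
+ * on it again.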
*/ if( storage_id != PARSEC_DTD_SKIP_SAVING) { char* packed_buffer; /* Copy the short data to some temp storage */ packed_buffer = malloc(origin->msg.length); - memcpy(packed_buffer, origin->eager_msg + *position, origin->msg.length); + memcpy(packed_buffer, origin->eager_msg, origin->msg.length); *position += origin->msg.length; /* move to the next order */ origin->taskpool = (parsec_taskpool_t*)packed_buffer; /* temporary storage */ } @@ -901,13 +899,14 @@ remote_dep_get_datatypes(parsec_execution_stream_t* es, origin->msg.task_class_id = dtd_task->super.task_class->task_class_id; origin->output[k].data.remote.src_datatype = origin->output[k].data.remote.dst_datatype = PARSEC_DATATYPE_NULL; dtd_task->super.task_class->iterate_successors(es, (parsec_task_t *)dtd_task, - local_mask, + (1U<taskpool; + int idx, *data_sizes = (int*)origin->eager_msg; /* Do not set the task.task_class here, because it might trigger a race condition in DTD */ task.priority = 0; /* unknown yet */ @@ -926,7 +925,7 @@ remote_dep_get_datatypes(parsec_execution_stream_t* es, * be able to identify the dep_index for each particular datatype index, and * call the iterate_successors on each of the dep_index sets. */ - for(k = 0; origin->msg.output_mask>>k; k++) { + for(k = idx = 0; origin->msg.output_mask>>k; k++) { if(!(origin->msg.output_mask & (1U<out[i]; i++ ) { if(!(task.task_class->out[i]->flow_datatype_mask & (1U<output[k].data.remote.src_datatype = origin->output[k].data.remote.dst_datatype = PARSEC_DATATYPE_NULL; - PARSEC_DEBUG_VERBOSE(20, parsec_comm_output_stream, "MPI:\tRetrieve datatype with mask 0x%x (remote_dep_get_datatypes)", local_mask); + assert(idx <= data_sizes[0]); + origin->output[k].data.remote.src_count = data_sizes[idx+1]; + PARSEC_DEBUG_VERBOSE(20, parsec_comm_output_stream, + "MPI:\tRetrieve datatype with mask 0x%x (remote_dep_get_datatypes) remote size %d", + local_mask, origin->output[k].data.remote.src_count); task.task_class->iterate_successors(es, &task, local_mask, remote_dep_mpi_retrieve_datatype, origin); + idx++; } } @@ -1301,13 +1305,13 @@ int remote_dep_mpi_on(parsec_context_t* context) static int remote_dep_mpi_pack_dep(int peer, dep_cmd_item_t* item, char* packed_buffer, - int length, - int* position) + uint32_t length, + int32_t* position) { parsec_remote_deps_t *deps = (parsec_remote_deps_t*)item->cmd.activate.task.source_deps; remote_dep_wire_activate_t* msg = &deps->msg; - int k, dsize, saved_position = *position; - uint32_t peer_bank, peer_mask, expected = 0; + int k, dsize, data_idx, saved_position = *position; + uint32_t peer_bank, peer_mask, expected = 0, *data_sizes; #if defined(PARSEC_DEBUG) || defined(PARSEC_DEBUG_NOISIER) char tmp[MAX_TASK_STRLEN]; remote_dep_cmd_to_string(&deps->msg, tmp, 128); @@ -1316,53 +1320,92 @@ static int remote_dep_mpi_pack_dep(int peer, peer_bank = peer / (sizeof(uint32_t) * 8); peer_mask = 1U << (peer % (sizeof(uint32_t) * 8)); + /* size of the handshake header */ parsec_ce.pack_size(&parsec_ce, dep_count, dep_dtt, &dsize); + /* reserve space for the termination detection piggybacked message */ dsize += deps->taskpool->tdm.module->outgoing_message_piggyback_size; - if( (length - (*position)) < dsize ) { /* no room. bail out */ - PARSEC_DEBUG_VERBOSE(20, parsec_debug_output, "Can't pack at %d/%d. 
Bail out!", *position, length); + /* count the number of data to prepare the space for their length */ + for(k = 0, data_idx = 0; deps->outgoing_mask >> k; k++) { + if( !((1U << k) & deps->outgoing_mask )) continue; + if( !(deps->output[k].rank_bits[peer_bank] & peer_mask) ) continue; + data_idx++; + } + if( (length - (*position)) < (dsize + (data_idx + 1) * (uint32_t)sizeof(uint32_t)) ) { /* no room. bail out */ + PARSEC_DEBUG_VERBOSE(20, parsec_comm_output_stream, "Can't pack at %d/%d. Bail out!", *position, length); + if( length < (dsize + (data_idx + 1) * (uint32_t)sizeof(uint32_t)) ) { + parsec_fatal("The header plus data cannot be sent on a single message " + "(need %zd but have %zd)\n", + length, dsize + data_idx * sizeof(uint32_t)); + } return 1; } /* Don't pack yet, we need to update the length field before packing */ *position += dsize; + data_sizes = (uint32_t*)(packed_buffer + *position); + assert(0 == (((uintptr_t)data_sizes) & (sizeof(uint32_t)-1))); + data_sizes[0] = data_idx; /* save the total number of data */ assert((0 != msg->output_mask) && /* this should be preset */ (msg->output_mask & deps->outgoing_mask) == deps->outgoing_mask); - msg->length = deps->taskpool->tdm.module->outgoing_message_piggyback_size; + msg->length = (data_idx + 1) * (uint32_t)sizeof(uint32_t); + *position += msg->length; + msg->length += deps->taskpool->tdm.module->outgoing_message_piggyback_size; item->cmd.activate.task.output_mask = 0; /* clean start */ /* Treat for special cases: CTL, Short, etc... */ - for(k = 0; deps->outgoing_mask >> k; k++) { + for(k = 0, data_idx = 1; deps->outgoing_mask >> k; k++) { if( !((1U << k) & deps->outgoing_mask )) continue; if( !(deps->output[k].rank_bits[peer_bank] & peer_mask) ) continue; + parsec_dep_data_description_t *data_desc = &deps->output[k].data; + parsec_dep_type_description_t *type_desc = &data_desc->remote; /* Remove CTL from the message we expect to send */ #if defined(PARSEC_PROF_DRY_DEP) - parsec_set_CTL_dep(deps->output[k].data); + parsec_set_CTL_dep(data_desc); #endif - if( parsec_is_CTL_dep(deps->output[k].data) ) { - PARSEC_DEBUG_VERBOSE(10, parsec_debug_output, " CTL\t%s\tparam %d\tdemoted to be a control", tmp, k); + if( parsec_is_CTL_dep(data_desc) ) { + PARSEC_DEBUG_VERBOSE(10, parsec_comm_output_stream, " CTL\t%s\tparam %d\tdemoted to be a control", tmp, k); continue; } - #if defined(PARSEC_DEBUG) || defined(PARSEC_DEBUG_NOISIER) - if(PARSEC_DATATYPE_NULL == deps->output[k].data.remote.src_datatype) { + if(PARSEC_DATATYPE_NULL == type_desc->src_datatype) { parsec_fatal("Output %d of %s has not defined a datatype: check that the data collection does" " define a datatype for each data it provides", k, tmp); } #endif - + assert(type_desc->src_count > 0); + /* Embed data (up to short size) with the activate msg */ + parsec_ce.pack_size( &parsec_ce, type_desc->src_count, type_desc->src_datatype, &dsize); + data_sizes[data_idx++] = dsize; + if( parsec_param_short_limit ) { + if((length - (*position)) >= dsize) { + parsec_ce.pack(&parsec_ce, ((char*)PARSEC_DATA_COPY_GET_PTR(data_desc->data)) + type_desc->src_displ, + type_desc->src_count, type_desc->src_datatype, + packed_buffer, length, position); + PARSEC_DEBUG_VERBOSE(10, parsec_comm_output_stream, " EGR\t%s\tparam %d\tshort piggyback in the activate msg (%d/%d)", + tmp, k, *position, length); + msg->length += dsize; + continue; /* go to the next */ + } else if( 0 != saved_position ) { + PARSEC_DEBUG_VERBOSE(20, parsec_comm_output_stream, "DATA\t%s\tparam %d\texceed buffer length. 
Start again from here next iteration", + tmp, k); + *position = saved_position; + return 1; + } + /* the data doesn't fit in the buffer. */ + } expected++; item->cmd.activate.task.output_mask |= (1U<pending_ack); + PARSEC_DEBUG_VERBOSE(10, parsec_comm_output_stream, "DATA\t%s\tparam %d\tdeps %p send on demand (increase deps counter by %d [%d])", + tmp, k, deps, expected, deps->pending_ack); } if(expected) (void)parsec_atomic_fetch_add_int32(&deps->pending_ack, expected); /* Keep track of the inflight data */ #if defined(PARSEC_DEBUG) || defined(PARSEC_DEBUG_NOISIER) - parsec_debug_verbose(6, parsec_debug_output, "MPI:\tTO\t%d\tActivate\t% -8s\n" - " \t\t\twith datakey %lx\tmask %lx\t(tag=%d) eager mask %lu length %d", - peer, tmp, msg->deps, msg->output_mask, -1, - msg->output_mask ^ item->cmd.activate.task.output_mask, msg->length); + parsec_debug_verbose(6, parsec_comm_output_stream, "MPI:\tTO\t%d\tActivate\t% -8s\n" + " \t\t\twith datakey %lx\tmask %lx short mask %lu length %d", + peer, tmp, msg->deps, msg->output_mask, + msg->output_mask ^ item->cmd.activate.task.output_mask, msg->length); #endif /* And now pack the updated message (msg->length and msg->output_mask) itself. */ parsec_ce.pack(&parsec_ce, msg, dep_count, dep_dtt, packed_buffer, length, &saved_position); @@ -1774,33 +1817,91 @@ static void remote_dep_mpi_recv_activate(parsec_execution_stream_t* es, (void) length; (void) position; (void) packed_buffer; remote_dep_datakey_t complete_mask = 0; - int k; + int k, dsize, ds_idx; + uint32_t *data_sizes = (uint32_t*)(packed_buffer + *position); #if defined(PARSEC_DEBUG) || defined(PARSEC_DEBUG_NOISIER) char tmp[MAX_TASK_STRLEN]; remote_dep_cmd_to_string(&deps->msg, tmp, MAX_TASK_STRLEN); #endif #if defined(PARSEC_DEBUG) || defined(PARSEC_DEBUG_NOISIER) - parsec_debug_verbose(6, parsec_debug_output, "MPI:\tFROM\t%d\tActivate\t% -8s\n" - "\twith datakey %lx\tparams %lx length %d (pack buf %d/%d) prio %d", - deps->from, tmp, deps->msg.deps, deps->incoming_mask, - deps->msg.length, *position, length, deps->max_priority); + parsec_debug_verbose(6, parsec_comm_output_stream, "MPI:\tFROM\t%d\tActivate\t% -8s\n" + "\twith datakey %lx\tparams %lx length %d (pack buf %d/%d) prio %d", + deps->from, tmp, deps->msg.deps, deps->incoming_mask, + deps->msg.length, *position, length, deps->max_priority); #endif deps->taskpool->tdm.module->incoming_message_start(deps->taskpool, deps->from, packed_buffer, position, length, deps); + /* move the position after the data sizes */ + *position += (data_sizes[0] + 1) * (uint32_t)sizeof(uint32_t); + ds_idx = 0; for(k = 0; deps->incoming_mask>>k; k++) { if(!(deps->incoming_mask & (1U<output[k].data; + parsec_dep_type_description_t *type_desc = &data_desc->remote; /* Check for CTL and data that do not carry payload */ - if( parsec_is_CTL_dep(deps->output[k].data) ){ - PARSEC_DEBUG_VERBOSE(10, parsec_debug_output, "MPI:\tHERE\t%d\tGet NONE\t% -8s\tk=%d\twith datakey %lx at type CONTROL", - deps->from, tmp, k, deps->msg.deps); + if( parsec_is_CTL_dep(data_desc) ) { + PARSEC_DEBUG_VERBOSE(10, parsec_comm_output_stream, "MPI:\tHERE\t%d\tGet NONE\t% -8s\tk=%d\twith datakey %lx at type CONTROL", + deps->from, tmp, k, deps->msg.deps); /* deps->output[k].data.data = NULL; This is unnecessary*/ complete_mask |= (1U< *position) ) { + parsec_ce.pack_size( &parsec_ce, 1, type_desc->dst_datatype, &dsize); /* for a single type */ + if( (type_desc->dst_count * dsize) != data_sizes[ds_idx] ) { + /* We only receive the minimum between expected and sent, potentially 
converting to byte */ + int count_in_msg = data_sizes[ds_idx] / dsize; + if( (dsize * type_desc->dst_count) > data_sizes[ds_idx] ) { + if( data_sizes[ds_idx] % dsize ) { + type_desc->dst_datatype = parsec_datatype_int8_t; + count_in_msg = data_sizes[ds_idx]; + } + } else { + count_in_msg = type_desc->dst_count; + if( data_sizes[ds_idx] % dsize ) { + type_desc->dst_datatype = parsec_datatype_int8_t; + count_in_msg = dsize * type_desc->dst_count; + } + } + type_desc->dst_count = count_in_msg; /* update the count */ + PARSEC_DEBUG_VERBOSE(0, parsec_comm_output_stream, + " EGR\t%s\tparam %d\treceive does not match the expected type and count (leftover %d)." + " Convert to byte and receive as much as possible.\n", + tmp, k, data_sizes[ds_idx] % dsize); + } + + /* Check if the data is short-embedded in the activate */ + if((length - (*position)) >= data_sizes[ds_idx]) { + assert(NULL == data_desc->data); /* we do not support in-place tiles now, make sure it doesn't happen yet */ + if(NULL == data_desc->data) { + data_desc->data = remote_dep_copy_allocate(type_desc); + } +#ifndef PARSEC_PROF_DRY_DEP + PARSEC_DEBUG_VERBOSE(10, parsec_comm_output_stream, + " EGR\t%s\tparam %d\tshort from the activate msg (exp/rcv/avail) (%d/%d/%d)", + tmp, k, type_desc->dst_count * dsize, data_sizes[ds_idx], length - *position); + int save_position = *position; /* save the position */ + parsec_ce.unpack(&parsec_ce, packed_buffer, *position + data_sizes[ds_idx], position, + (char*)PARSEC_DATA_COPY_GET_PTR(data_desc->data) + type_desc->dst_displ, + type_desc->dst_count, type_desc->dst_datatype); + *position = save_position + data_sizes[ds_idx]; /* jump the entire short data */ +#endif /* PARSEC_PROF_DRY_DEP */ + complete_mask |= (1U<from, tmp, k, deps->msg.deps); } + assert(length == *position); /* Release all the already satisfied deps without posting the RDV */ @@ -1808,8 +1909,8 @@ static void remote_dep_mpi_recv_activate(parsec_execution_stream_t* es, #if defined(PARSEC_DEBUG_NOISIER) for(int k = 0; complete_mask>>k; k++) if((1U<from, tmp, k, deps->msg.deps, deps->output[k].data.data, k ); + PARSEC_DEBUG_VERBOSE(10, parsec_comm_output_stream, "MPI:\tHERE\t%d\tGet PREEND\t% -8s\tk=%d\twith datakey %lx at %p ALREADY SATISFIED\t", + deps->from, tmp, k, deps->msg.deps, deps->output[k].data.data); #endif /* If this is the only call then force the remote deps propagation */ deps = remote_dep_release_incoming(es, deps, complete_mask); @@ -1849,7 +1950,7 @@ remote_dep_mpi_save_activate_cb(parsec_comm_engine_t *ce, parsec_ce_tag_t tag, ce->unpack(ce, msg, length, &position, &deps->msg, dep_count, dep_dtt); deps->from = src; - deps->eager_msg = msg; + deps->eager_msg = (char*)msg + position; /* Retrieve the data arenas and update the msg.incoming_mask to reflect * the data we should be receiving from the predecessor. 
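     * (deps->eager_msg, set just above, points past the unpacked
     *  activation header, i.e. at the piggybacked data_sizes array that
     *  remote_dep_get_datatypes() reads.)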
@@ -1884,6 +1985,7 @@ remote_dep_mpi_save_activate_cb(parsec_comm_engine_t *ce, parsec_ce_tag_t tag, remote_dep_mpi_recv_activate(es, deps, msg, position + deps->msg.length, &position); assert( parsec_param_enable_aggregate || (position == length)); + deps->eager_msg = NULL; /* this buffer will now be reused, not safe to store here */ } assert(position == length); PARSEC_PINS(es, ACTIVATE_CB_END, NULL); @@ -1899,8 +2001,8 @@ remote_dep_mpi_new_taskpool(parsec_execution_stream_t* es, #if defined(PARSEC_DEBUG_NOISIER) char tmp[MAX_TASK_STRLEN]; #endif - PARSEC_DEBUG_VERBOSE(10, parsec_debug_output, "OPAQUE_MPI: ThreadID %d\tNew taskpool %d registered", - (int)pthread_self(), obj->taskpool_id); + PARSEC_DEBUG_VERBOSE(10, parsec_debug_output, "OPAQUE_MPI: ThreadID %ld\tNew taskpool %d registered", + pthread_self(), obj->taskpool_id); for(item = PARSEC_LIST_ITERATOR_FIRST(&dep_activates_noobj_fifo); item != PARSEC_LIST_ITERATOR_END(&dep_activates_noobj_fifo); item = PARSEC_LIST_ITERATOR_NEXT(item) ) { @@ -1909,6 +2011,7 @@ remote_dep_mpi_new_taskpool(parsec_execution_stream_t* es, char* buffer = (char*)deps->taskpool; /* get back the buffer from the "temporary" storage */ int rc, position = 0; deps->taskpool = NULL; + deps->eager_msg = buffer; /* provide get_datatype with access to the remote sizes */ rc = remote_dep_get_datatypes(es, deps, PARSEC_DTD_SKIP_SAVING, &position); assert( -1 != rc ); assert(deps->taskpool != NULL); PARSEC_DEBUG_VERBOSE(10, parsec_comm_output_stream, "MPI:\tFROM\t%d\tActivate NEWOBJ\t% -8s\twith datakey %lx\tparams %lx", @@ -1918,15 +2021,17 @@ remote_dep_mpi_new_taskpool(parsec_execution_stream_t* es, item = parsec_list_nolock_remove(&dep_activates_noobj_fifo, item); /* In case of DTD execution, receiving rank might not have discovered - * the task responsible for this message. So we have to put this message - * in a hash table so that we can activate it, when this rank discovers it. + * the task responsible for this message. We save this activation message + * in a hash table for deferred activation, when the task is locally discovered. */ if( -2 == rc ) { /* DTD problems, defer activating this remote dep */ deps->taskpool = (parsec_taskpool_t*) buffer; + deps->eager_msg = NULL; /* back to NULL */ continue; } remote_dep_mpi_recv_activate(es, deps, buffer, deps->msg.length, &position); + deps->eager_msg = NULL; /* back to NULL */ free(buffer); (void)rc; } @@ -1949,7 +2054,7 @@ remote_dep_mpi_release_delayed_deps(parsec_execution_stream_t* es, parsec_remote_deps_t *deps = item->cmd.release.deps; int rc, position = 0; char* buffer = (char*)deps->taskpool; /* get back the buffer from the "temporary" storage */ - deps->taskpool = NULL; + deps->taskpool = NULL; /* get_datatype requires no taskpool to be set.
*/ rc = remote_dep_get_datatypes(es, deps, 1, &position); @@ -2110,7 +2215,7 @@ remote_dep_mpi_get_end_cb(parsec_comm_engine_t *ce, char tmp[MAX_TASK_STRLEN]; #endif - PARSEC_DEBUG_VERBOSE(6, parsec_debug_output, "MPI:\tFROM\t%d\tGet END \t% -8s\tk=%d\twith datakey na \tparams %lx\t(tag=%d)", + PARSEC_DEBUG_VERBOSE(6, parsec_debug_output, "MPI:\tFROM\t%d\tGet END \t% -8s\tk=%d\twith datakey na\tparams %lx\t(tag=%d)", src, remote_dep_cmd_to_string(&deps->msg, tmp, MAX_TASK_STRLEN), callback_data->k, deps->incoming_mask, src); diff --git a/tests/apps/stencil/stencil_internal.h b/tests/apps/stencil/stencil_internal.h index a8f2c8197..8bb43dc6d 100644 --- a/tests/apps/stencil/stencil_internal.h +++ b/tests/apps/stencil/stencil_internal.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021 The Universiy of Tennessee and The Universiy + * Copyright (c) 2019-2021 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. */ diff --git a/tests/dsl/ptg/check_multisize_bcast/check_multisize_bcast.jdf b/tests/dsl/ptg/check_multisize_bcast/check_multisize_bcast.jdf index d1d47d7a5..9bc79e1a4 100644 --- a/tests/dsl/ptg/check_multisize_bcast/check_multisize_bcast.jdf +++ b/tests/dsl/ptg/check_multisize_bcast/check_multisize_bcast.jdf @@ -4,20 +4,16 @@ extern "C" %{ * of Tennessee Research Foundation. All rights * reserved. */ -#include "parsec/data_distribution.h" -#include "parsec/data_dist/matrix/matrix.h" -#include "parsec/data.h" -#include "parsec/utils/mca_param.h" -#include "parsec/arena.h" +#include "parsec/data_dist/matrix/two_dim_rectangle_cyclic.h" #include #include #include %} -%option no_taskpool_instance = true /* can be aything */ +%option no_taskpool_instance = true /* can be anything */ -descA [type = "struct parsec_tiled_matrix_dc_t *"] +descA [type = "parsec_matrix_block_cyclic_t*"] NB [type = int] NT [type = int] @@ -25,19 +21,19 @@ NT [type = int] /************************************************** * potrf_dpotrf * **************************************************/ -potrf_dpotrf(k) +potrf_dpotrf(k) // Execution space -k = 0 .. descA->mt-1 +k = 0 .. NT-1 // Parallel partitioning :descA(k, k) // Parameters RW T <- (k == 0) ? descA(k, k) : A potrf_diag(k, k-1) - -> T potrf_diag(k+1..descA->mt-1, k) - -> T potrf_col(k+1..descA->mt-1..2, k) [layout = MPI_DOUBLE count = 2] - -> T potrf_col(k+2..descA->mt-1..2, k) [layout = MPI_DOUBLE count = 3] + -> T potrf_diag(k+1..NT-1, k) + -> T potrf_col(k+1..NT-1..2, k) [layout = MPI_DOUBLE count = 2] + -> T potrf_col(k+2..NT-1..2, k) [layout = MPI_DOUBLE count = 3] -> descA(k, k) BODY @@ -50,11 +46,11 @@ END /************************************************** * potrf_col * **************************************************/ -potrf_col(m, k) +potrf_col(m, k) // Execution space -k = 0 .. descA->mt-2 -m = k+1 .. descA->mt-1 +k = 0 .. NT-2 +m = k+1 .. NT-1 // Parallel partitioning : descA(m, k) @@ -77,8 +73,8 @@ END potrf_diag(k, i) // Execution space -i = 0 .. descA->mt-2 -k = i+1 .. descA->mt-1 +i = 0 .. NT-2 +k = i+1 .. 
NT-1 // Parallel partitioning : descA(k, k) diff --git a/tests/dsl/ptg/check_multisize_bcast/check_multisize_bcast_wrapper.c b/tests/dsl/ptg/check_multisize_bcast/check_multisize_bcast_wrapper.c index b917a827f..e5f1732f5 100644 --- a/tests/dsl/ptg/check_multisize_bcast/check_multisize_bcast_wrapper.c +++ b/tests/dsl/ptg/check_multisize_bcast/check_multisize_bcast_wrapper.c @@ -13,8 +13,8 @@ #endif #include -#include "check_multisize_bcast.h" #include "check_multisize_bcast_wrapper.h" +#include "check_multisize_bcast.h" /** * @param [IN] A the data, already distributed and allocated @@ -23,7 +23,7 @@ * * @return the parsec object to schedule. */ -parsec_taskpool_t* check_multisize_bcast_new(parsec_tiled_matrix_dc_t *A, int nb, int nt) +parsec_taskpool_t* check_multisize_bcast_new(parsec_matrix_block_cyclic_t *A, int nb, int nt) { parsec_check_multisize_bcast_taskpool_t *tp = NULL; @@ -39,14 +39,5 @@ parsec_taskpool_t* check_multisize_bcast_new(parsec_tiled_matrix_dc_t *A, int nb return (parsec_taskpool_t*)tp; } -/** - * @param [INOUT] o the parsec object to destroy - */ -static void -check_multisize_bcast_destructor(parsec_check_multisize_bcast_taskpool_t *tp) -{ - parsec_matrix_del2arena(&tp->arenas_datatypes[PARSEC_check_multisize_bcast_DEFAULT_ADT_IDX]); -} - PARSEC_OBJ_CLASS_INSTANCE(parsec_check_multisize_bcast_taskpool_t, parsec_taskpool_t, - NULL, check_multisize_bcast_destructor); + NULL, NULL); diff --git a/tests/dsl/ptg/check_multisize_bcast/check_multisize_bcast_wrapper.h b/tests/dsl/ptg/check_multisize_bcast/check_multisize_bcast_wrapper.h index 2ce692636..1fd09aebf 100644 --- a/tests/dsl/ptg/check_multisize_bcast/check_multisize_bcast_wrapper.h +++ b/tests/dsl/ptg/check_multisize_bcast/check_multisize_bcast_wrapper.h @@ -1,6 +1,5 @@ #include "parsec/runtime.h" -#include "parsec/data_distribution.h" -#include "parsec/data_dist/matrix/matrix.h" +#include "parsec/data_dist/matrix/two_dim_rectangle_cyclic.h" -parsec_taskpool_t *check_multisize_bcast_new(parsec_tiled_matrix_dc_t *A, int size, int nt); +parsec_taskpool_t *check_multisize_bcast_new(parsec_matrix_block_cyclic_t *A, int size, int nt); diff --git a/tests/dsl/ptg/check_multisize_bcast/data_gen.c b/tests/dsl/ptg/check_multisize_bcast/data_gen.c index 68fc55559..814862f81 100644 --- a/tests/dsl/ptg/check_multisize_bcast/data_gen.c +++ b/tests/dsl/ptg/check_multisize_bcast/data_gen.c @@ -7,7 +7,6 @@ #include "parsec/runtime.h" #include "data_gen.h" #include "stdarg.h" -#include "parsec/data_dist/matrix/two_dim_rectangle_cyclic.h" #include #include @@ -15,28 +14,28 @@ /* * mb, whole matrix row/column number, mt, each tile row/column number */ -parsec_tiled_matrix_dc_t *create_and_distribute_data(int rank, int world, int mb, int mt, int typesize) +parsec_matrix_block_cyclic_t *create_and_distribute_data(int rank, int world, int mb, int mt) { - two_dim_block_cyclic_t *m = (two_dim_block_cyclic_t*)malloc(sizeof(two_dim_block_cyclic_t)); + parsec_matrix_block_cyclic_t *m = (parsec_matrix_block_cyclic_t*)malloc(sizeof(parsec_matrix_block_cyclic_t)); - two_dim_block_cyclic_init(m, matrix_ComplexDouble, matrix_Tile, - rank, - mb, mb, - mt*mb, mt*mb, - 0, 0, - mt*mb, mt*mb, - 1, world, - 1, 1, 0, 0); + parsec_matrix_block_cyclic_init(m, PARSEC_MATRIX_COMPLEX_DOUBLE, PARSEC_MATRIX_TILE, + rank, + mb, mb, + mt*mb, mt*mb, + 0, 0, + mt*mb, mt*mb, + 1, world, + 1, 1, 0, 0); m->mat = parsec_data_allocate((size_t)m->super.nb_local_tiles * (size_t)m->super.bsiz * (size_t)parsec_datadist_getsizeoftype(m->super.mtype)); - return 
(parsec_tiled_matrix_dc_t*)m; + return (parsec_matrix_block_cyclic_t*)m; } -void free_data(parsec_tiled_matrix_dc_t *d) +void free_data(parsec_matrix_block_cyclic_t *d) { - parsec_data_collection_destroy(&d->super); + parsec_data_collection_destroy(&d->super.super); free(d); } diff --git a/tests/dsl/ptg/check_multisize_bcast/data_gen.h b/tests/dsl/ptg/check_multisize_bcast/data_gen.h index 7d9f94dd2..e4b5e5781 100644 --- a/tests/dsl/ptg/check_multisize_bcast/data_gen.h +++ b/tests/dsl/ptg/check_multisize_bcast/data_gen.h @@ -2,9 +2,9 @@ #define _DATA_GEN_H_ #include "parsec/runtime.h" -#include "parsec/data_dist/matrix/matrix.h" +#include "parsec/data_dist/matrix/two_dim_rectangle_cyclic.h" -parsec_tiled_matrix_dc_t *create_and_distribute_data(int rank, int world, int nb, int nt, int typesize); -void free_data(parsec_tiled_matrix_dc_t *d); +parsec_matrix_block_cyclic_t *create_and_distribute_data(int rank, int world, int nb, int nt); +void free_data(parsec_matrix_block_cyclic_t *d); #endif diff --git a/tests/dsl/ptg/check_multisize_bcast/main.c b/tests/dsl/ptg/check_multisize_bcast/main.c index ca0df98c0..277c202e0 100644 --- a/tests/dsl/ptg/check_multisize_bcast/main.c +++ b/tests/dsl/ptg/check_multisize_bcast/main.c @@ -18,8 +18,8 @@ int main(int argc, char *argv[]) { parsec_context_t* parsec; int rank = 0, world = 1, cores = -1; - int nt = 2, nb = 16, rc; - parsec_tiled_matrix_dc_t *dcA; + int nt = 2, nb = 1, rc; + parsec_matrix_block_cyclic_t *dcA; parsec_taskpool_t *bcast; #if defined(PARSEC_HAVE_MPI) @@ -39,7 +39,7 @@ int main(int argc, char *argv[]) } nt = (int)val; if( 0 == nt ) { - printf("Bad value for nt (it canot be zero) !!!\n"); + printf("Bad value for nt (cannot be zero) !!!\n"); exit(-1); } } @@ -49,7 +49,7 @@ int main(int argc, char *argv[]) exit(1); } - dcA = create_and_distribute_data(rank, world, nb, nt, sizeof(int)); + dcA = create_and_distribute_data(rank, world, nb, nt); parsec_data_collection_set_key((parsec_data_collection_t *)dcA, "A"); bcast = check_multisize_bcast_new(dcA, nb, nt); From a7288f23d595fb7ec7630adc4c0ddd992d90de7d Mon Sep 17 00:00:00 2001 From: Aurelien Bouteiller Date: Sat, 3 Feb 2024 15:22:47 -0500 Subject: [PATCH 04/15] Minor typos, warnings, and naming conventions in tests --- parsec/interfaces/ptg/ptg-compiler/jdf2c.c | 2 -- parsec/remote_dep_mpi.c | 14 +++++++------- parsec/vpmap.c | 2 +- tests/collections/reshape/testing_reshape.c | 2 +- tests/dsl/ptg/CMakeLists.txt | 8 ++++---- tests/dsl/ptg/Testings.cmake | 1 + tests/dsl/ptg/check_multisize_bcast/Testings.cmake | 4 ---- .../CMakeLists.txt | 1 - tests/dsl/ptg/multisize_bcast/Testings.cmake | 4 ++++ .../check_multisize_bcast.jdf | 0 .../check_multisize_bcast_wrapper.c | 0 .../check_multisize_bcast_wrapper.h | 0 .../data_gen.c | 0 .../data_gen.h | 0 .../main.c | 0 15 files changed, 18 insertions(+), 20 deletions(-) delete mode 100644 tests/dsl/ptg/check_multisize_bcast/Testings.cmake rename tests/dsl/ptg/{check_multisize_bcast => multisize_bcast}/CMakeLists.txt (91%) create mode 100644 tests/dsl/ptg/multisize_bcast/Testings.cmake rename tests/dsl/ptg/{check_multisize_bcast => multisize_bcast}/check_multisize_bcast.jdf (100%) rename tests/dsl/ptg/{check_multisize_bcast => multisize_bcast}/check_multisize_bcast_wrapper.c (100%) rename tests/dsl/ptg/{check_multisize_bcast => multisize_bcast}/check_multisize_bcast_wrapper.h (100%) rename tests/dsl/ptg/{check_multisize_bcast => multisize_bcast}/data_gen.c (100%) rename tests/dsl/ptg/{check_multisize_bcast => multisize_bcast}/data_gen.h (100%) 
rename tests/dsl/ptg/{check_multisize_bcast => multisize_bcast}/main.c (100%) diff --git a/parsec/interfaces/ptg/ptg-compiler/jdf2c.c b/parsec/interfaces/ptg/ptg-compiler/jdf2c.c index 4b3c3aa83..40569be55 100644 --- a/parsec/interfaces/ptg/ptg-compiler/jdf2c.c +++ b/parsec/interfaces/ptg/ptg-compiler/jdf2c.c @@ -7663,7 +7663,6 @@ jdf_generate_code_iterate_successors_or_predecessors(const jdf_t *jdf, string_arena_t *sa_tmp_type_r = string_arena_new(256); string_arena_t *sa_temp_r = string_arena_new(1024); - int last_datatype_idx; assignment_info_t ai; expr_info_t info = EMPTY_EXPR_INFO; int nb_open_ldef; @@ -7713,7 +7712,6 @@ jdf_generate_code_iterate_successors_or_predecessors(const jdf_t *jdf, for(fl = f->dataflow; fl != NULL; fl = fl->next) { flowempty = 1; flowtomem = 0; - last_datatype_idx = -1; string_arena_init(sa_coutput); string_arena_init(sa_deps); string_arena_init(sa_datatype); diff --git a/parsec/remote_dep_mpi.c b/parsec/remote_dep_mpi.c index f11ab7ed9..224cb60e9 100644 --- a/parsec/remote_dep_mpi.c +++ b/parsec/remote_dep_mpi.c @@ -857,6 +857,7 @@ remote_dep_get_datatypes(parsec_execution_stream_t* es, char* packed_buffer; /* Copy the short data to some temp storage */ packed_buffer = malloc(origin->msg.length); + /* the caller already added the *position input to eager_msg */ memcpy(packed_buffer, origin->eager_msg, origin->msg.length); *position += origin->msg.length; /* move to the next order */ origin->taskpool = (parsec_taskpool_t*)packed_buffer; /* temporary storage */ @@ -869,7 +870,7 @@ remote_dep_get_datatypes(parsec_execution_stream_t* es, if(return_defer) { return -2; } - PARSEC_DEBUG_VERBOSE(20, parsec_comm_output_stream, "MPI:\tRetrieve datatype with mask 0x%x (remote_dep_get_datatypes)", local_mask); + PARSEC_DEBUG_VERBOSE(20, parsec_comm_output_stream, "MPI:\tRetrieve datatype with mask 0x%x (remote_dep_get_datatypes)", (1U<<k)); origin->msg.task_class_id = dtd_task->super.task_class->task_class_id; origin->output[k].data.remote.src_datatype = origin->output[k].data.remote.dst_datatype = PARSEC_DATATYPE_NULL; dtd_task->super.task_class->iterate_successors(es, (parsec_task_t *)dtd_task, @@ -881,7 +882,6 @@ remote_dep_get_datatypes(parsec_execution_stream_t* es, parsec_task_t task; task.taskpool = origin->taskpool; int idx, *data_sizes = (int*)origin->eager_msg; - /* Do not set the task.task_class here, because it might trigger a race condition in DTD */ task.priority = 0; /* unknown yet */ task.task_class = task.taskpool->task_classes_array[origin->msg.task_class_id]; @@ -1261,8 +1261,8 @@ static inline uint64_t remote_dep_mpi_profiling_event_id(void) static int remote_dep_mpi_pack_dep(int peer, dep_cmd_item_t* item, char* packed_buffer, - uint32_t length, - int32_t* position) + int length, + int* position) { parsec_remote_deps_t *deps = (parsec_remote_deps_t*)item->cmd.activate.task.source_deps; remote_dep_wire_activate_t* msg = &deps->msg; @@ -1286,9 +1286,9 @@ static int remote_dep_mpi_pack_dep(int peer, if( !(deps->output[k].rank_bits[peer_bank] & peer_mask) ) continue; data_idx++; } - if( (length - (*position)) < (dsize + (data_idx + 1) * (uint32_t)sizeof(uint32_t)) ) { /* no room. bail out */ + if( (length - (*position)) < (dsize + (data_idx + 1) * (int)sizeof(uint32_t)) ) { /* no room. bail out */ PARSEC_DEBUG_VERBOSE(20, parsec_comm_output_stream, "Can't pack at %d/%d.
Bail out!", *position, length); - if( length < (dsize + (data_idx + 1) * (uint32_t)sizeof(uint32_t)) ) { + if( length < (dsize + (data_idx + 1) * (int)sizeof(uint32_t)) ) { parsec_fatal("The header plus data cannot be sent on a single message " "(need %zd but have %zd)\n", length, dsize + data_idx * sizeof(uint32_t)); @@ -1836,7 +1836,7 @@ static void remote_dep_mpi_recv_activate(parsec_execution_stream_t* es, } /* Check if the data is short-embedded in the activate */ - if((length - (*position)) >= data_sizes[ds_idx]) { + if((length - (*position)) >= (int)data_sizes[ds_idx]) { assert(NULL == data_desc->data); /* we do not support in-place tiles now, make sure it doesn't happen yet */ if(NULL == data_desc->data) { data_desc->data = remote_dep_copy_allocate(type_desc); diff --git a/parsec/vpmap.c b/parsec/vpmap.c index 99ec7236f..a9abe8a31 100644 --- a/parsec/vpmap.c +++ b/parsec/vpmap.c @@ -633,7 +633,7 @@ int parse_binding_parameter(int vp, int nbth, char * binding) break; offset += sprintf(tmp + offset, "%i ", core_tab[t]); if( offset > (sizeof(tmp)-4)){ - sprintf(tmp+offset, "..."); + sprintf(tmp+sizeof(tmp)-4, "..."); break; } } diff --git a/tests/collections/reshape/testing_reshape.c b/tests/collections/reshape/testing_reshape.c index 17cb578dd..9f2433507 100644 --- a/tests/collections/reshape/testing_reshape.c +++ b/tests/collections/reshape/testing_reshape.c @@ -70,7 +70,7 @@ int main(int argc, char *argv[]) /******************* * No local reshape - * When only type_remote is used on the dependencies, the pointer to the origianal + * When only type_remote is used on the dependencies, the pointer to the original * matrix tiles is passed to the successors tasks. Thus, the full original tiles are * set to 0. *******************/ diff --git a/tests/dsl/ptg/CMakeLists.txt b/tests/dsl/ptg/CMakeLists.txt index 5496014ac..ed03f549f 100644 --- a/tests/dsl/ptg/CMakeLists.txt +++ b/tests/dsl/ptg/CMakeLists.txt @@ -1,4 +1,4 @@ -add_Subdirectory(ptgpp) +add_subdirectory(ptgpp) parsec_addtest_executable(C strange) target_ptg_sources(strange PRIVATE "strange.jdf") @@ -17,6 +17,6 @@ target_ptg_sources(complex_deps PRIVATE "complex_deps.jdf") add_subdirectory(branching) add_subdirectory(choice) add_subdirectory(controlgather) -add_Subdirectory(user-defined-functions) -add_Subdirectory(local-indices) -add_subdirectory(check_multisize_bcast) +add_subdirectory(user-defined-functions) +add_subdirectory(local-indices) +add_subdirectory(multisize_bcast) diff --git a/tests/dsl/ptg/Testings.cmake b/tests/dsl/ptg/Testings.cmake index dbda93f27..697ad425f 100644 --- a/tests/dsl/ptg/Testings.cmake +++ b/tests/dsl/ptg/Testings.cmake @@ -1,6 +1,7 @@ include(${CMAKE_CURRENT_LIST_DIR}/ptgpp/Testings.cmake) include(${CMAKE_CURRENT_LIST_DIR}/user-defined-functions/Testings.cmake) include(${CMAKE_CURRENT_LIST_DIR}/branching/Testings.cmake) +include(${CMAKE_CURRENT_LIST_DIR}/multisize_bcast/Testings.cmake) parsec_addtest_cmd(dsl/ptg/startup1 ${SHM_TEST_CMD_LIST} dsl/ptg/startup -i=10 -j=10 -k=10 -v=5) parsec_addtest_cmd(dsl/ptg/startup2 ${SHM_TEST_CMD_LIST} dsl/ptg/startup -i=10 -j=20 -k=30 -v=5) diff --git a/tests/dsl/ptg/check_multisize_bcast/Testings.cmake b/tests/dsl/ptg/check_multisize_bcast/Testings.cmake deleted file mode 100644 index 4e2566e4d..000000000 --- a/tests/dsl/ptg/check_multisize_bcast/Testings.cmake +++ /dev/null @@ -1,4 +0,0 @@ -parsec_addtest_cmd(unit_check_multisize_bcast_shm ${SHM_TEST_CMD_LIST} ./check_multisize_bcast) -if( MPI_C_FOUND ) - parsec_addtest_cmd(unit_check_multisize_bcast_mpi 
${MPI_TEST_CMD_LIST} 4 ./check_multisize_bcast) -endif( MPI_C_FOUND) diff --git a/tests/dsl/ptg/check_multisize_bcast/CMakeLists.txt b/tests/dsl/ptg/multisize_bcast/CMakeLists.txt similarity index 91% rename from tests/dsl/ptg/check_multisize_bcast/CMakeLists.txt rename to tests/dsl/ptg/multisize_bcast/CMakeLists.txt index 3583307ec..a5d665949 100644 --- a/tests/dsl/ptg/check_multisize_bcast/CMakeLists.txt +++ b/tests/dsl/ptg/multisize_bcast/CMakeLists.txt @@ -4,4 +4,3 @@ parsec_addtest_executable(C check_multisize_bcast SOURCES main.c check_multisize_bcast_wrapper.c data_gen.c) target_ptg_sources(check_multisize_bcast PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/check_multisize_bcast.jdf") -include(Testings.cmake) diff --git a/tests/dsl/ptg/multisize_bcast/Testings.cmake b/tests/dsl/ptg/multisize_bcast/Testings.cmake new file mode 100644 index 000000000..33a9ecc2a --- /dev/null +++ b/tests/dsl/ptg/multisize_bcast/Testings.cmake @@ -0,0 +1,4 @@ +parsec_addtest_cmd(dsl/ptg/multisize_bcast ${SHM_TEST_CMD_LIST} dsl/ptg/multisize_bcast/check_multisize_bcast) +if( MPI_C_FOUND ) + parsec_addtest_cmd(dsl/ptg/multisize_bcast:mp ${MPI_TEST_CMD_LIST} 4 dsl/ptg/multisize_bcast/check_multisize_bcast) +endif( MPI_C_FOUND) diff --git a/tests/dsl/ptg/check_multisize_bcast/check_multisize_bcast.jdf b/tests/dsl/ptg/multisize_bcast/check_multisize_bcast.jdf similarity index 100% rename from tests/dsl/ptg/check_multisize_bcast/check_multisize_bcast.jdf rename to tests/dsl/ptg/multisize_bcast/check_multisize_bcast.jdf diff --git a/tests/dsl/ptg/check_multisize_bcast/check_multisize_bcast_wrapper.c b/tests/dsl/ptg/multisize_bcast/check_multisize_bcast_wrapper.c similarity index 100% rename from tests/dsl/ptg/check_multisize_bcast/check_multisize_bcast_wrapper.c rename to tests/dsl/ptg/multisize_bcast/check_multisize_bcast_wrapper.c diff --git a/tests/dsl/ptg/check_multisize_bcast/check_multisize_bcast_wrapper.h b/tests/dsl/ptg/multisize_bcast/check_multisize_bcast_wrapper.h similarity index 100% rename from tests/dsl/ptg/check_multisize_bcast/check_multisize_bcast_wrapper.h rename to tests/dsl/ptg/multisize_bcast/check_multisize_bcast_wrapper.h diff --git a/tests/dsl/ptg/check_multisize_bcast/data_gen.c b/tests/dsl/ptg/multisize_bcast/data_gen.c similarity index 100% rename from tests/dsl/ptg/check_multisize_bcast/data_gen.c rename to tests/dsl/ptg/multisize_bcast/data_gen.c diff --git a/tests/dsl/ptg/check_multisize_bcast/data_gen.h b/tests/dsl/ptg/multisize_bcast/data_gen.h similarity index 100% rename from tests/dsl/ptg/check_multisize_bcast/data_gen.h rename to tests/dsl/ptg/multisize_bcast/data_gen.h diff --git a/tests/dsl/ptg/check_multisize_bcast/main.c b/tests/dsl/ptg/multisize_bcast/main.c similarity index 100% rename from tests/dsl/ptg/check_multisize_bcast/main.c rename to tests/dsl/ptg/multisize_bcast/main.c From 93251f0daef2d1fc717086a601caa5c7612d3e8c Mon Sep 17 00:00:00 2001 From: George Bosilca Date: Thu, 6 Jan 2022 03:41:39 -0500 Subject: [PATCH 05/15] Allow sender to send data of any size. The sender can send less data than expected by the receiver. If we select the communication protocol based on how much data the receiver expects, the sender and receiver could diverge on the protocol to be used (mainly visible when the sent message is below the eager threshold, while the receiver expects more data). So, force the sender to pack the amount of data per dep, and force the receiver to abide by this amount.
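As a minimal sketch of the receiver-side rule this introduces (illustrative only: negotiate_recv(), expected_bytes, sent_bytes and elem_size are simplified stand-ins for the type_desc/data_sizes fields manipulated in remote_dep_mpi.c, not actual PaRSEC API):

    /* Receive min(expected, sent); fall back to a byte datatype when the
     * amount on the wire is not a whole number of expected elements. */
    static void negotiate_recv(int expected_bytes, int sent_bytes, int elem_size,
                               int *recv_count, int *use_byte_type)
    {
        int bytes = (sent_bytes < expected_bytes) ? sent_bytes : expected_bytes;
        if( bytes % elem_size ) {
            *use_byte_type = 1;               /* receive as raw bytes */
            *recv_count = bytes;
        } else {
            *use_byte_type = 0;               /* keep the expected datatype */
            *recv_count = bytes / elem_size;
        }
    }

The per-dep amounts themselves travel at the head of the activate message, in the (data_idx + 1) * sizeof(uint32_t) slot packed by remote_dep_mpi_pack_dep() in the diff below.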
Signed-off-by: George Bosilca --- parsec/remote_dep_mpi.c | 33 ++++++++++++------- .../check_multisize_bcast_wrapper.c | 11 ++++++- 2 files changed, 31 insertions(+), 13 deletions(-) diff --git a/parsec/remote_dep_mpi.c b/parsec/remote_dep_mpi.c index 224cb60e9..47eb1f944 100644 --- a/parsec/remote_dep_mpi.c +++ b/parsec/remote_dep_mpi.c @@ -1261,8 +1261,8 @@ static inline uint64_t remote_dep_mpi_profiling_event_id(void) static int remote_dep_mpi_pack_dep(int peer, dep_cmd_item_t* item, char* packed_buffer, - int length, - int* position) + uint32_t length, + int32_t* position) { parsec_remote_deps_t *deps = (parsec_remote_deps_t*)item->cmd.activate.task.source_deps; remote_dep_wire_activate_t* msg = &deps->msg; @@ -1280,15 +1280,19 @@ static int remote_dep_mpi_pack_dep(int peer, parsec_ce.pack_size(&parsec_ce, dep_count, dep_dtt, &dsize); /* reserve space for the termination detection piggybacked message */ dsize += deps->taskpool->tdm.module->outgoing_message_piggyback_size; + if( (length - (*position)) < dsize ) { /* no room. bail out */ + PARSEC_DEBUG_VERBOSE(20, parsec_debug_output, "Can't pack termination detection piggyback data at %d/%d. Bail out!", *position, length); + return 1; + } /* count the number of data to prepare the space for their length */ for(k = 0, data_idx = 0; deps->outgoing_mask >> k; k++) { if( !((1U << k) & deps->outgoing_mask )) continue; if( !(deps->output[k].rank_bits[peer_bank] & peer_mask) ) continue; data_idx++; } - if( (length - (*position)) < (dsize + (data_idx + 1) * (int)sizeof(uint32_t)) ) { /* no room. bail out */ + if( (length - (*position)) < (dsize + (data_idx + 1) * (uint32_t)sizeof(uint32_t)) ) { /* no room. bail out */ PARSEC_DEBUG_VERBOSE(20, parsec_comm_output_stream, "Can't pack at %d/%d. Bail out!", *position, length); - if( length < (dsize + (data_idx + 1) * (int)sizeof(uint32_t)) ) { + if( length < (dsize + (data_idx + 1) * (uint32_t)sizeof(uint32_t)) ) { parsec_fatal("The header plus data cannot be sent on a single message " "(need %zd but have %zd)\n", length, dsize + data_idx * sizeof(uint32_t)); @@ -1302,9 +1306,10 @@ static int remote_dep_mpi_pack_dep(int peer, data_sizes[0] = data_idx; /* save the total number of data */ assert((0 != msg->output_mask) && /* this should be preset */ (msg->output_mask & deps->outgoing_mask) == deps->outgoing_mask); - msg->length = (data_idx + 1) * (uint32_t)sizeof(uint32_t); + /* update the length of the message */ + msg->length = deps->taskpool->tdm.module->outgoing_message_piggyback_size; + msg->length += (data_idx + 1) * (uint32_t)sizeof(uint32_t); *position += msg->length; - msg->length += deps->taskpool->tdm.module->outgoing_message_piggyback_size; item->cmd.activate.task.output_mask = 0; /* clean start */ /* Treat for special cases: CTL, Short, etc... */ for(k = 0, data_idx = 1; deps->outgoing_mask >> k; k++) { @@ -1332,11 +1337,16 @@ static int remote_dep_mpi_pack_dep(int peer, /* Embed data (up to short size) with the activate msg */ parsec_ce.pack_size( &parsec_ce, type_desc->src_count, type_desc->src_datatype, &dsize); data_sizes[data_idx++] = dsize; +#ifdef PARSEC_RESHAPE_BEFORE_SEND_TO_REMOTE + /* If we want to reshape before sending, we don't do short messages. 
*/ + if( (deps->output[k].data.data_future == NULL) && (parsec_param_short_limit) ) { +#else if( parsec_param_short_limit ) { +#endif if((length - (*position)) >= dsize) { parsec_ce.pack(&parsec_ce, ((char*)PARSEC_DATA_COPY_GET_PTR(data_desc->data)) + type_desc->src_displ, type_desc->src_count, type_desc->src_datatype, - packed_buffer, length, position); + packed_buffer, length, position); PARSEC_DEBUG_VERBOSE(10, parsec_comm_output_stream, " EGR\t%s\tparam %d\tshort piggyback in the activate msg (%d/%d)", tmp, k, *position, length); msg->length += dsize; @@ -1855,9 +1865,8 @@ static void remote_dep_mpi_recv_activate(parsec_execution_stream_t* es, continue; } } - - PARSEC_DEBUG_VERBOSE(10, parsec_comm_output_stream, "MPI:\tFROM\t%d\tGet DATA\t% -8s\tk=%d\twith datakey %lx (to be posted)", - deps->from, tmp, k, deps->msg.deps); + PARSEC_DEBUG_VERBOSE(10, parsec_comm_output_stream, "MPI:\tFROM\t%d\tGet DATA\t% -8s\tk=%d\twith datakey %lx tag=%d (to be posted)", + deps->from, tmp, k, deps->msg.deps, tag+k); } assert(length == *position); @@ -1867,8 +1876,8 @@ static void remote_dep_mpi_recv_activate(parsec_execution_stream_t* es, #if defined(PARSEC_DEBUG_NOISIER) for(int k = 0; complete_mask>>k; k++) if((1U<<k) & complete_mask) - PARSEC_DEBUG_VERBOSE(10, parsec_comm_output_stream, "MPI:\tHERE\t%d\tGet PREEND\t% -8s\tk=%d\twith datakey %lx at %p ALREADY SATISFIED\t", - deps->from, tmp, k, deps->msg.deps, deps->output[k].data.data); + PARSEC_DEBUG_VERBOSE(10, parsec_comm_output_stream, "MPI:\tHERE\t%d\tGet PREEND\t% -8s\tk=%d\twith datakey %lx at %p ALREADY SATISFIED\t(tag=%d)", + deps->from, tmp, k, deps->msg.deps, deps->output[k].data.data, tag+k ); #endif /* If this is the only call then force the remote deps propagation */ deps = remote_dep_release_incoming(es, deps, complete_mask); diff --git a/tests/dsl/ptg/multisize_bcast/check_multisize_bcast_wrapper.c b/tests/dsl/ptg/multisize_bcast/check_multisize_bcast_wrapper.c index e5f1732f5..eb79b71e8 100644 --- a/tests/dsl/ptg/multisize_bcast/check_multisize_bcast_wrapper.c +++ b/tests/dsl/ptg/multisize_bcast/check_multisize_bcast_wrapper.c @@ -39,5 +39,14 @@ parsec_taskpool_t* check_multisize_bcast_new(parsec_matrix_block_cyclic_t *A, in return (parsec_taskpool_t*)tp; } +/** + * @param [INOUT] o the parsec object to destroy + */ +static void +check_multisize_bcast_destructor(parsec_check_multisize_bcast_taskpool_t *tp) +{ + parsec_del2arena(&tp->arenas_datatypes[PARSEC_check_multisize_bcast_DEFAULT_ADT_IDX]); +} + PARSEC_OBJ_CLASS_INSTANCE(parsec_check_multisize_bcast_taskpool_t, parsec_taskpool_t, - NULL, NULL); + NULL, check_multisize_bcast_destructor); From 932e65b300c1675269dab1609bf605a98fa994be Mon Sep 17 00:00:00 2001 From: George Bosilca Date: Sun, 24 Jul 2022 00:25:12 -0400 Subject: [PATCH 06/15] Complete the integration of the variable send sizes PR A lot of changes: - bring back the short message protocol (aka data embedded directly into the activation message) - enable propagation of the data sizes. This is now part of the activation message, but unfortunately they will disappear after the first call to get_datatype, and there is no way to retrieve them. The side effect of this is that the get_datatype function should only be called once, and this is not the case today (the reshape code makes heavy use of it). - clean up the reshape code. I don't understand it, it adds a lot of overhead on the critical path (a lot of initializations of useless structures, and a few function calls). I barely tried to minimize the overheads, but at some point we need to understand this code and make it more user-friendly.
- fix a lot of typos in the tests Signed-off-by: George Bosilca --- parsec/interfaces/ptg/ptg-compiler/jdf2c.c | 2 ++ parsec/mca/device/transfer_gpu.c | 6 +++-- parsec/parsec.c | 22 +++++++++++-------- parsec/remote_dep_mpi.c | 14 +++++------- .../check_multisize_bcast_wrapper.c | 11 +--------- 5 files changed, 25 insertions(+), 30 deletions(-) diff --git a/parsec/interfaces/ptg/ptg-compiler/jdf2c.c b/parsec/interfaces/ptg/ptg-compiler/jdf2c.c index 40569be55..3208a1aa3 100644 --- a/parsec/interfaces/ptg/ptg-compiler/jdf2c.c +++ b/parsec/interfaces/ptg/ptg-compiler/jdf2c.c @@ -4590,6 +4590,8 @@ static void jdf_generate_destructor( const jdf_t *jdf ) " __parsec_tp->super.super.taskpool_id, dependencies_size);\n" " parsec_profiling_add_information(\"MEMORY_USAGE\", meminfo);\n" " }\n" + "#else\n" + " (void)dependencies_size;\n" "#endif\n"); } diff --git a/parsec/mca/device/transfer_gpu.c b/parsec/mca/device/transfer_gpu.c index 58fff5aa0..61f0895f4 100644 --- a/parsec/mca/device/transfer_gpu.c +++ b/parsec/mca/device/transfer_gpu.c @@ -103,8 +103,10 @@ release_task_of_gpu_d2h_task(parsec_execution_stream_t* es, static int datatype_lookup_of_gpu_d2h_task( parsec_execution_stream_t * es, - const parsec_gpu_d2h_task_t* this_task, - uint32_t * flow_mask, parsec_dep_data_description_t * data) + const parsec_gpu_d2h_task_t* this_task, + const parsec_task_t * parent_task, + uint32_t * flow_mask, + parsec_dep_data_description_t * data) { (void)es; (void)this_task; (void)flow_mask; (void)data; return PARSEC_SUCCESS; diff --git a/parsec/parsec.c b/parsec/parsec.c index eaf45d9ea..638f81cd0 100644 --- a/parsec/parsec.c +++ b/parsec/parsec.c @@ -2785,19 +2785,23 @@ int parsec_context_query(parsec_context_t *context, parsec_context_query_cmd_t c return context->my_rank; case PARSEC_CONTEXT_QUERY_DEVICES: - int device_type = va_arg(args, int), count = 0; - for( uint32_t i = 0; i < parsec_nb_devices; i++ ) { - dev = parsec_mca_device_get(i); - if( dev->type & device_type ) count++; + { + int device_type = va_arg(args, int), count = 0; + for( uint32_t i = 0; i < parsec_nb_devices; i++ ) { + dev = parsec_mca_device_get(i); + if( dev->type & device_type ) count++; + } + return count; } - return count; case PARSEC_CONTEXT_QUERY_CORES: - int nb_total_comp_threads = 0; - for (int idx = 0; idx < context->nb_vp; idx++) { - nb_total_comp_threads += context->virtual_processes[idx]->nb_cores; + { + int nb_total_comp_threads = 0; + for (int idx = 0; idx < context->nb_vp; idx++) { + nb_total_comp_threads += context->virtual_processes[idx]->nb_cores; + } + return nb_total_comp_threads; } - return nb_total_comp_threads; case PARSEC_CONTEXT_QUERY_ACTIVE_TASKPOOLS: return context->active_taskpools; diff --git a/parsec/remote_dep_mpi.c b/parsec/remote_dep_mpi.c index 47eb1f944..3e1c1c0cf 100644 --- a/parsec/remote_dep_mpi.c +++ b/parsec/remote_dep_mpi.c @@ -336,7 +336,7 @@ remote_dep_dequeue_fini(parsec_context_t* context) mpi_initialized = 0; PARSEC_DEBUG_VERBOSE(10, parsec_debug_output, "Process has reshaped %zu tiles.", count_reshaping); - + (void)context; return 0; } @@ -1280,10 +1280,6 @@ static int remote_dep_mpi_pack_dep(int peer, parsec_ce.pack_size(&parsec_ce, dep_count, dep_dtt, &dsize); /* reserve space for the termination detection piggybacked message */ dsize += deps->taskpool->tdm.module->outgoing_message_piggyback_size; - if( (length - (*position)) < dsize ) { /* no room. bail out */ - PARSEC_DEBUG_VERBOSE(20, parsec_debug_output, "Can't pack termination detection piggyback data at %d/%d. 
Bail out!", *position, length); - return 1; - } /* count the number of data to prepare the space for their length */ for(k = 0, data_idx = 0; deps->outgoing_mask >> k; k++) { if( !((1U << k) & deps->outgoing_mask )) continue; @@ -1865,8 +1861,8 @@ static void remote_dep_mpi_recv_activate(parsec_execution_stream_t* es, continue; } } - PARSEC_DEBUG_VERBOSE(10, parsec_comm_output_stream, "MPI:\tFROM\t%d\tGet DATA\t% -8s\tk=%d\twith datakey %lx tag=%d (to be posted)", - deps->from, tmp, k, deps->msg.deps, tag+k); + PARSEC_DEBUG_VERBOSE(10, parsec_comm_output_stream, "MPI:\tFROM\t%d\tGet DATA\t% -8s\tk=%d\twith datakey %lx (to be posted)", + deps->from, tmp, k, deps->msg.deps); } assert(length == *position); @@ -1876,8 +1872,8 @@ static void remote_dep_mpi_recv_activate(parsec_execution_stream_t* es, #if defined(PARSEC_DEBUG_NOISIER) for(int k = 0; complete_mask>>k; k++) if((1U<from, tmp, k, deps->msg.deps, deps->output[k].data.data, tag+k ); + PARSEC_DEBUG_VERBOSE(10, parsec_comm_output_stream, "MPI:\tHERE\t%d\tGet PREEND\t% -8s\tk=%d\twith datakey %lx at %p ALREADY SATISFIED\t", + deps->from, tmp, k, deps->msg.deps, deps->output[k].data.data); #endif /* If this is the only call then force the remote deps propagation */ deps = remote_dep_release_incoming(es, deps, complete_mask); diff --git a/tests/dsl/ptg/multisize_bcast/check_multisize_bcast_wrapper.c b/tests/dsl/ptg/multisize_bcast/check_multisize_bcast_wrapper.c index eb79b71e8..e5f1732f5 100644 --- a/tests/dsl/ptg/multisize_bcast/check_multisize_bcast_wrapper.c +++ b/tests/dsl/ptg/multisize_bcast/check_multisize_bcast_wrapper.c @@ -39,14 +39,5 @@ parsec_taskpool_t* check_multisize_bcast_new(parsec_matrix_block_cyclic_t *A, in return (parsec_taskpool_t*)tp; } -/** - * @param [INOUT] o the parsec object to destroy - */ -static void -check_multisize_bcast_destructor(parsec_check_multisize_bcast_taskpool_t *tp) -{ - parsec_del2arena(&tp->arenas_datatypes[PARSEC_check_multisize_bcast_DEFAULT_ADT_IDX]); -} - PARSEC_OBJ_CLASS_INSTANCE(parsec_check_multisize_bcast_taskpool_t, parsec_taskpool_t, - NULL, check_multisize_bcast_destructor); + NULL, NULL); From f7905a57007e941797344e6c23f74bd96d1b1a49 Mon Sep 17 00:00:00 2001 From: George Bosilca Date: Tue, 6 Feb 2024 23:15:37 -0500 Subject: [PATCH 07/15] Fix the profiling test to use the correct message size. 
Signed-off-by: George Bosilca --- tests/profiling/check-comms.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/profiling/check-comms.py b/tests/profiling/check-comms.py index cafc0edf3..8a82e8238 100644 --- a/tests/profiling/check-comms.py +++ b/tests/profiling/check-comms.py @@ -7,7 +7,7 @@ t = pd.HDFStore(filename) result = { - 'MPI_ACTIVATE': { 'nb': 100, 'lensum': 11200 }, + 'MPI_ACTIVATE': { 'nb': 100, 'lensum': 12000 }, 'MPI_DATA_CTL': { 'nb': 100, 'lensum': 209715200 }, 'MPI_DATA_PLD_SND': { 'nb': 100, 'lensum': 209715200 }, 'MPI_DATA_PLD_RCV': { 'nb': 100, 'lensum': 209715200 } From 2022eff840b1349bf9b4a0953369e7387b32521b Mon Sep 17 00:00:00 2001 From: Aurelien Bouteiller Date: Wed, 31 Jan 2024 18:11:36 -0500 Subject: [PATCH 08/15] Prevent CI from running OOM when oversubscribing GPUs --- .github/workflows/build_cmake.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/build_cmake.yml b/.github/workflows/build_cmake.yml index c6b292f12..75795fcd8 100644 --- a/.github/workflows/build_cmake.yml +++ b/.github/workflows/build_cmake.yml @@ -94,6 +94,8 @@ jobs: # run: ctest -C $BUILD_TYPE run: | source ${{github.workspace}}/.github/CI/spack_setup.sh + PARSEC_MCA_device_cuda_memory_use=10 + PARSEC_MCA_device_hip_memory_use=10 ctest --output-on-failure - name: Save Artifact @@ -196,6 +198,8 @@ jobs: # run: ctest -C $BUILD_TYPE run: | source ${{github.workspace}}/.github/CI/spack_setup.sh + PARSEC_MCA_device_cuda_memory_use=10 + PARSEC_MCA_device_hip_memory_use=10 ctest --output-on-failure - name: Save Testing Artifact From eca41f6618e74361acda8e6a3c735fac87f22d59 Mon Sep 17 00:00:00 2001 From: Aurelien Bouteiller Date: Thu, 1 Feb 2024 19:06:45 -0500 Subject: [PATCH 09/15] Do not enable devices without explicit parameters doing so in the ctest files --- .github/workflows/build_cmake.yml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/.github/workflows/build_cmake.yml b/.github/workflows/build_cmake.yml index 75795fcd8..ad3552e53 100644 --- a/.github/workflows/build_cmake.yml +++ b/.github/workflows/build_cmake.yml @@ -94,6 +94,10 @@ jobs: # run: ctest -C $BUILD_TYPE run: | source ${{github.workspace}}/.github/CI/spack_setup.sh + # enable devices only in tests that explicitly require them + PARSEC_MCA_device_cuda_enabled=0 + PARSEC_MCA_device_hip_enabled=0 + # restrict memory use for oversubscribed runners PARSEC_MCA_device_cuda_memory_use=10 PARSEC_MCA_device_hip_memory_use=10 ctest --output-on-failure @@ -202,6 +202,10 @@ jobs: # run: ctest -C $BUILD_TYPE run: | source ${{github.workspace}}/.github/CI/spack_setup.sh + # enable devices only in tests that explicitly require them + PARSEC_MCA_device_cuda_enabled=0 + PARSEC_MCA_device_hip_enabled=0 + # restrict memory use for oversubscribed runners PARSEC_MCA_device_cuda_memory_use=10 PARSEC_MCA_device_hip_memory_use=10 ctest --output-on-failure - name: Save Testing Artifact From c27cdf95ef303a7779e0bf75de1ae183067075e5 Mon Sep 17 00:00:00 2001 From: George Bosilca Date: Fri, 2 Feb 2024 00:25:23 -0500 Subject: [PATCH 10/15] Fix CUDA protection macro use Be nice and print a space between strings, allowing better readability of the error message.
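For illustration (hypothetical file, line and error text; the real macro is in device_cuda.h below): with the format changed from "%s%s" to "%s %s", a call such as

    PARSEC_CUDA_CHECK_ERROR( "cudaSetDevice", cudastatus, {return PARSEC_ERROR;} );

would print something like "device_cuda_module.c:273 cudaSetDevice invalid device ordinal" instead of running the two strings together, so call sites no longer need to embed a trailing space in their message string.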
Signed-off-by: George Bosilca --- parsec/mca/device/cuda/device_cuda.h | 2 +- .../mca/device/cuda/device_cuda_component.c | 9 ++++--- parsec/mca/device/cuda/device_cuda_module.c | 27 ++++++++++--------- parsec/mca/device/device_gpu.c | 2 +- tests/dsl/dtd/dtd_test_simple_gemm.c | 6 ++--- tests/runtime/cuda/nvlink.jdf | 5 ++-- tests/runtime/cuda/nvlink_wrapper.c | 13 ++++----- tests/runtime/cuda/stage_custom.jdf | 11 ++++---- tests/runtime/cuda/stress.jdf | 3 ++- 9 files changed, 42 insertions(+), 36 deletions(-) diff --git a/parsec/mca/device/cuda/device_cuda.h b/parsec/mca/device/cuda/device_cuda.h index a10b63aec..9c8d78877 100644 --- a/parsec/mca/device/cuda/device_cuda.h +++ b/parsec/mca/device/cuda/device_cuda.h @@ -69,7 +69,7 @@ typedef parsec_data_copy_t parsec_gpu_data_copy_t; do { \ cudaError_t __cuda_error = (cudaError_t) (ERROR); \ if( cudaSuccess != __cuda_error ) { \ - parsec_warning( "%s:%d %s%s", __FILE__, __LINE__, \ + parsec_warning( "%s:%d %s %s", __FILE__, __LINE__, \ (STR), cudaGetErrorString(__cuda_error) ); \ CODE; \ } \ diff --git a/parsec/mca/device/cuda/device_cuda_component.c b/parsec/mca/device/cuda/device_cuda_component.c index 8483bd012..0a2b0a468 100644 --- a/parsec/mca/device/cuda/device_cuda_component.c +++ b/parsec/mca/device/cuda/device_cuda_component.c @@ -2,6 +2,7 @@ * Copyright (c) 2010-2022 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. + * Copyright (c) 2024 NVIDIA Corporation. All rights reserved. */ #include "parsec/parsec_config.h" @@ -132,7 +133,7 @@ static int device_cuda_component_query(mca_base_module_t **module, int *priority continue; /* The user disabled NVLINK for that GPU */ cudastatus = cudaSetDevice( source_gpu->cuda_index ); - PARSEC_CUDA_CHECK_ERROR( "(parsec_device_cuda_component_query) cudaSetDevice ", cudastatus, + PARSEC_CUDA_CHECK_ERROR( "(parsec_device_cuda_component_query) cudaSetDevice", cudastatus, {continue;} ); for( j = 0; NULL != (target_gpu = (parsec_device_cuda_module_t*)parsec_device_cuda_component.modules[j]); j++ ) { @@ -140,11 +141,11 @@ static int device_cuda_component_query(mca_base_module_t **module, int *priority /* Communication mask */ cudastatus = cudaDeviceCanAccessPeer( &canAccessPeer, source_gpu->cuda_index, target_gpu->cuda_index ); - PARSEC_CUDA_CHECK_ERROR( "(parsec_device_cuda_component_query) cudaDeviceCanAccessPeer ", cudastatus, + PARSEC_CUDA_CHECK_ERROR( "(parsec_device_cuda_component_query) cudaDeviceCanAccessPeer", cudastatus, {continue;} ); if( 1 == canAccessPeer ) { cudastatus = cudaDeviceEnablePeerAccess( target_gpu->cuda_index, 0 ); - PARSEC_CUDA_CHECK_ERROR( "(parsec_device_cuda_component_query) cuCtxEnablePeerAccess ", cudastatus, + PARSEC_CUDA_CHECK_ERROR( "(parsec_device_cuda_component_query) cuCtxEnablePeerAccess", cudastatus, {continue;} ); source_gpu->super.peer_access_mask = (int16_t)(source_gpu->super.peer_access_mask | (int16_t)(1 << target_gpu->super.super.device_index)); @@ -231,7 +232,7 @@ static int device_cuda_component_open(void) */ } else { - PARSEC_CUDA_CHECK_ERROR( "cudaGetDeviceCount ", cudastatus, + PARSEC_CUDA_CHECK_ERROR( "cudaGetDeviceCount", cudastatus, { parsec_mca_param_set_int(parsec_device_cuda_enabled_index, 0); return MCA_ERROR; diff --git a/parsec/mca/device/cuda/device_cuda_module.c b/parsec/mca/device/cuda/device_cuda_module.c index 49d7cf121..c40c4e15f 100644 --- a/parsec/mca/device/cuda/device_cuda_module.c +++ b/parsec/mca/device/cuda/device_cuda_module.c @@ -2,6 +2,7 @@ * Copyright (c) 2010-2023 The 
University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. + * Copyright (c) 2024 NVIDIA Corporation. All rights reserved. */ #include "parsec/parsec_config.h" @@ -160,7 +161,7 @@ parsec_cuda_memory_register(parsec_device_module_t* device, parsec_data_collecti * all devices. */ status = cudaHostRegister(ptr, length, cudaHostRegisterPortable ); - PARSEC_CUDA_CHECK_ERROR( "(parsec_cuda_memory_register) cudaHostRegister ", status, + PARSEC_CUDA_CHECK_ERROR( "(parsec_cuda_memory_register) cudaHostRegister", status, { goto restore_and_return; } ); rc = PARSEC_SUCCESS; @@ -187,7 +188,7 @@ static int parsec_cuda_memory_unregister(parsec_device_module_t* device, parsec_ * as another thread might be submitting tasks at the same time (cuda_scheduling.h) */ status = cudaHostUnregister(ptr); - PARSEC_CUDA_CHECK_ERROR( "(parsec_cuda_memory_unregister) cudaHostUnregister ", status, + PARSEC_CUDA_CHECK_ERROR( "(parsec_cuda_memory_unregister) cudaHostUnregister", status, {continue;} ); rc = PARSEC_SUCCESS; @@ -271,7 +272,7 @@ static int parsec_cuda_set_device(parsec_device_gpu_module_t *gpu) parsec_device_cuda_module_t *cuda_device = (parsec_device_cuda_module_t *)gpu; cudaStatus = cudaSetDevice(cuda_device->cuda_index); - PARSEC_CUDA_CHECK_ERROR( "cudaSetDevice ", cudaStatus, {return PARSEC_ERROR;} ); + PARSEC_CUDA_CHECK_ERROR( "cudaSetDevice", cudaStatus, {return PARSEC_ERROR;} ); return PARSEC_SUCCESS; } @@ -295,11 +296,11 @@ static int parsec_cuda_memcpy_async(struct parsec_device_gpu_module_s *gpu, stru kind = cudaMemcpyHostToDevice; break; default: - PARSEC_CUDA_CHECK_ERROR( "Translate parsec_device_transfer_direction_t to cudaMemcpyKind ", cudaErrorInvalidValue, {return PARSEC_ERROR;} ); + PARSEC_CUDA_CHECK_ERROR( "Translate parsec_device_transfer_direction_t to cudaMemcpyKind", cudaErrorInvalidValue, {return PARSEC_ERROR;} ); } cudaStatus = cudaMemcpyAsync( dest, source, bytes, kind, cuda_stream->cuda_stream ); - PARSEC_CUDA_CHECK_ERROR( "cudaMemcpyAsync ", cudaStatus, {return PARSEC_ERROR;} ); + PARSEC_CUDA_CHECK_ERROR( "cudaMemcpyAsync", cudaStatus, {return PARSEC_ERROR;} ); return PARSEC_SUCCESS; } @@ -310,7 +311,7 @@ static int parsec_cuda_event_record(struct parsec_device_gpu_module_s *gpu, stru (void)gpu; cudaStatus = cudaEventRecord(cuda_stream->events[event_idx], cuda_stream->cuda_stream); - PARSEC_CUDA_CHECK_ERROR( "cudaEventRecord ", cudaStatus, {return PARSEC_ERROR;} ); + PARSEC_CUDA_CHECK_ERROR( "cudaEventRecord", cudaStatus, {return PARSEC_ERROR;} ); return PARSEC_SUCCESS; } @@ -327,7 +328,7 @@ static int parsec_cuda_event_query(struct parsec_device_gpu_module_s *gpu, struc if(cudaErrorNotReady == cudaStatus) { return 0; } - PARSEC_CUDA_CHECK_ERROR( "cudaEventQuery ", cudaStatus, {return PARSEC_ERROR;} ); + PARSEC_CUDA_CHECK_ERROR( "cudaEventQuery", cudaStatus, {return PARSEC_ERROR;} ); return PARSEC_ERROR; /* should be unreachable */ } @@ -379,9 +380,9 @@ parsec_cuda_module_init( int dev_id, parsec_device_module_t** module ) *module = NULL; cudastatus = cudaSetDevice( dev_id ); - PARSEC_CUDA_CHECK_ERROR( "cudaSetDevice ", cudastatus, {return PARSEC_ERROR;} ); + PARSEC_CUDA_CHECK_ERROR( "cudaSetDevice", cudastatus, {return PARSEC_ERROR;} ); cudastatus = cudaGetDeviceProperties( &prop, dev_id ); - PARSEC_CUDA_CHECK_ERROR( "cudaGetDeviceProperties ", cudastatus, {return PARSEC_ERROR;} ); + PARSEC_CUDA_CHECK_ERROR( "cudaGetDeviceProperties", cudastatus, {return PARSEC_ERROR;} ); szName = prop.name; major = prop.major; @@ -425,7 +426,7 @@ 
parsec_cuda_module_init( int dev_id, parsec_device_module_t** module ) /* Allocate the stream */ cudastatus = cudaStreamCreate( &(cuda_stream->cuda_stream) ); - PARSEC_CUDA_CHECK_ERROR( "cudaStreamCreate ", cudastatus, + PARSEC_CUDA_CHECK_ERROR( "cudaStreamCreate", cudastatus, {goto release_device;} ); exec_stream->workspace = NULL; PARSEC_OBJ_CONSTRUCT(&exec_stream->infos, parsec_info_object_array_t); @@ -445,7 +446,7 @@ parsec_cuda_module_init( int dev_id, parsec_device_module_t** module ) cuda_stream->events[k] = NULL; exec_stream->tasks[k] = NULL; cudastatus = cudaEventCreateWithFlags(&(cuda_stream->events[k]), cudaEventDisableTiming); - PARSEC_CUDA_CHECK_ERROR( "(INIT) cudaEventCreateWithFlags ", (cudaError_t)cudastatus, + PARSEC_CUDA_CHECK_ERROR( "(INIT) cudaEventCreateWithFlags", (cudaError_t)cudastatus, {goto release_device;} ); } if(j == 0) { @@ -611,7 +612,7 @@ parsec_cuda_module_fini(parsec_device_module_t* device) int j, k; status = cudaSetDevice( cuda_device->cuda_index ); - PARSEC_CUDA_CHECK_ERROR( "(parsec_cuda_device_fini) cudaSetDevice ", status, + PARSEC_CUDA_CHECK_ERROR( "(parsec_cuda_device_fini) cudaSetDevice", status, {continue;} ); /* Release the registered memory */ @@ -632,7 +633,7 @@ parsec_cuda_module_fini(parsec_device_module_t* device) for( k = 0; k < exec_stream->max_events; k++ ) { assert( NULL == exec_stream->tasks[k] ); status = cudaEventDestroy(cuda_stream->events[k]); - PARSEC_CUDA_CHECK_ERROR( "(parsec_cuda_device_fini) cudaEventDestroy ", status, + PARSEC_CUDA_CHECK_ERROR( "(parsec_cuda_device_fini) cudaEventDestroy", status, {continue;} ); } exec_stream->max_events = 0; diff --git a/parsec/mca/device/device_gpu.c b/parsec/mca/device/device_gpu.c index 67943b0ee..60a02d461 100644 --- a/parsec/mca/device/device_gpu.c +++ b/parsec/mca/device/device_gpu.c @@ -700,7 +700,7 @@ parsec_device_memory_reserve( parsec_device_gpu_module_t* gpu_device, mem_elem_per_gpu = total_size / eltsize; } rc = gpu_device->memory_allocate(gpu_device, total_size, &base_ptr); - if(PARSEC_SUCCESS != rc) { + if(PARSEC_SUCCESS != rc) { parsec_warning("GPU[%s] Allocating %zu bytes of memory on the GPU device failed", gpu_device->super.name, total_size); gpu_device->memory = NULL; diff --git a/tests/dsl/dtd/dtd_test_simple_gemm.c b/tests/dsl/dtd/dtd_test_simple_gemm.c index 222dd8418..9ac71a079 100644 --- a/tests/dsl/dtd/dtd_test_simple_gemm.c +++ b/tests/dsl/dtd/dtd_test_simple_gemm.c @@ -209,7 +209,7 @@ int gemm_kernel_cuda(parsec_device_gpu_module_t *gpu_device, this_task->taskpool->context->my_rank, gpu_stream->name, delta); - PARSEC_CUDA_CHECK_ERROR("cublasDgemm_v2 ", status, + PARSEC_CUDA_CHECK_ERROR("cublasDgemm_v2", status, { return PARSEC_HOOK_RETURN_ERROR; }); return PARSEC_HOOK_RETURN_DONE; @@ -401,11 +401,11 @@ static void *allocate_one_on_device(void *obj, void *p) cudaError_t cr; cr = cudaMallocManaged(&one_device, sizeof(double), cudaMemAttachGlobal); - PARSEC_CUDA_CHECK_ERROR("cudaMalloc ", cr, + PARSEC_CUDA_CHECK_ERROR("cudaMalloc", cr, { return NULL; }); cr = cudaMemcpy(one_device, &one_host, sizeof(double), cudaMemcpyHostToDevice); - PARSEC_CUDA_CHECK_ERROR("cudaMemcpy ", cr, + PARSEC_CUDA_CHECK_ERROR("cudaMemcpy", cr, { return NULL; }); return one_device; diff --git a/tests/runtime/cuda/nvlink.jdf b/tests/runtime/cuda/nvlink.jdf index 4e5029692..0cafd49c2 100644 --- a/tests/runtime/cuda/nvlink.jdf +++ b/tests/runtime/cuda/nvlink.jdf @@ -3,6 +3,7 @@ extern "C" %{ * Copyright (c) 2019-2023 The University of Tennessee and The University * of Tennessee Research 
Foundation. All rights * reserved. + * Copyright (c) 2024 NVIDIA Corporation. All rights reserved. */ #include "parsec/parsec_config.h" @@ -147,7 +148,7 @@ BODY [type=CUDA &alpha, (double*)A, descA->super.mb, (double*)A, descA->super.mb, &beta, (double*)C, descA->super.mb ); - PARSEC_CUDA_CHECK_ERROR( "cublasDgemm_v2 ", status, + PARSEC_CUDA_CHECK_ERROR( "cublasDgemm_v2", status, {return -1;} ); } END @@ -191,7 +192,7 @@ BODY [type=CUDA &alpha, (double*)A, descA->super.mb, (double*)A, descA->super.mb, &beta, (double*)C, descA->super.mb ); - PARSEC_CUDA_CHECK_ERROR( "cublasDgemm_v2 ", status, + PARSEC_CUDA_CHECK_ERROR( "cublasDgemm_v2", status, {return -1;} ); } END diff --git a/tests/runtime/cuda/nvlink_wrapper.c b/tests/runtime/cuda/nvlink_wrapper.c index e23f96092..14c873bad 100644 --- a/tests/runtime/cuda/nvlink_wrapper.c +++ b/tests/runtime/cuda/nvlink_wrapper.c @@ -3,6 +3,7 @@ * Copyright (c) 2019-2021 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. + * Copyright (c) 2024 NVIDIA Corporation. All rights reserved. */ #include "parsec.h" @@ -76,9 +77,9 @@ __parsec_nvlink_destructor( parsec_nvlink_taskpool_t* nvlink_taskpool) parsec_data_t *dta = ((parsec_dc_t*)userM)->data_of((parsec_dc_t*)userM, g, userM->super.super.myrank); parsec_data_copy_t *gpu_copy = parsec_data_get_copy(dta, cuda_device->super.super.device_index); cudaError_t status = cudaSetDevice( cuda_device->cuda_index ); - PARSEC_CUDA_CHECK_ERROR( "(nvlink_wrapper) cudaSetDevice ", status, {} ); + PARSEC_CUDA_CHECK_ERROR( "(nvlink_wrapper) cudaSetDevice", status, {} ); status = (cudaError_t)cudaFree( gpu_copy->device_private ); - PARSEC_CUDA_CHECK_ERROR( "(nvlink_wrapper) cudaFree ", status, {} ); + PARSEC_CUDA_CHECK_ERROR( "(nvlink_wrapper) cudaFree", status, {} ); gpu_copy->device_private = NULL; parsec_data_copy_detach(dta, gpu_copy, cuda_device->super.super.device_index); PARSEC_OBJ_RELEASE(gpu_copy); @@ -86,7 +87,7 @@ __parsec_nvlink_destructor( parsec_nvlink_taskpool_t* nvlink_taskpool) } } parsec_tiled_matrix_destroy( (parsec_tiled_matrix_t*)nvlink_taskpool->_g_userM ); - + free(dcA); free(userM); } @@ -189,10 +190,10 @@ parsec_taskpool_t* testing_nvlink_New( parsec_context_t *ctx, int depth, int mb parsec_data_copy_t *gpu_copy = PARSEC_OBJ_NEW(parsec_data_copy_t); /* We chose the GPU */ cudaError_t status = cudaSetDevice( cuda_device->cuda_index ); - PARSEC_CUDA_CHECK_ERROR( "(nvlink_wrapper) cudaSetDevice ", status, {return NULL;} ); + PARSEC_CUDA_CHECK_ERROR( "(nvlink_wrapper) cudaSetDevice", status, {return NULL;} ); /* Allocate memory on it, for one tile */ status = (cudaError_t)cudaMalloc( &gpu_copy->device_private, mb*mb*parsec_datadist_getsizeoftype(PARSEC_MATRIX_DOUBLE) ); - PARSEC_CUDA_CHECK_ERROR( "(nvlink_wrapper) cudaMalloc ", status, {return NULL;} ); + PARSEC_CUDA_CHECK_ERROR( "(nvlink_wrapper) cudaMalloc", status, {return NULL;} ); /* Attach this copy to the data, on the corresponding device */ parsec_data_copy_attach(dta, gpu_copy, cuda_device->super.super.device_index); /* We also need to tell PaRSEC that the owner of this data is the GPU, or the @@ -203,7 +204,7 @@ parsec_taskpool_t* testing_nvlink_New( parsec_context_t *ctx, int depth, int mb cpu_copy->device_private, dta->nb_elts, cudaMemcpyHostToDevice ); - PARSEC_CUDA_CHECK_ERROR( "(nvlink_wrapper) cudaMemcpy ", status, {return NULL;} ); + PARSEC_CUDA_CHECK_ERROR( "(nvlink_wrapper) cudaMemcpy", status, {return NULL;} ); g++; } } diff --git a/tests/runtime/cuda/stage_custom.jdf 
b/tests/runtime/cuda/stage_custom.jdf index d8bc81e30..abfd01376 100644 --- a/tests/runtime/cuda/stage_custom.jdf +++ b/tests/runtime/cuda/stage_custom.jdf @@ -3,6 +3,7 @@ extern "C" %{ * Copyright (c) 2019-2023 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. + * Copyright (c) 2024 NVIDIA Corporation. All rights reserved. */ #include "parsec/parsec_config.h" @@ -57,14 +58,14 @@ stage_stride_in(parsec_gpu_task_t *gtask, width, height, cudaMemcpyHostToDevice, cuda_stream->cuda_stream ); - PARSEC_CUDA_CHECK_ERROR( "cudaMemcpyAsync ", ret, { return PARSEC_ERROR; } ); + PARSEC_CUDA_CHECK_ERROR( "cudaMemcpyAsync", ret, { return PARSEC_ERROR; } ); }else{ ret = (cudaError_t)cudaMemcpyAsync( copy_out->device_private, copy_in->device_private, copy_in->original->nb_elts, cudaMemcpyDeviceToDevice, cuda_stream->cuda_stream ); - PARSEC_CUDA_CHECK_ERROR( "cudaMemcpyAsync ", ret, { return PARSEC_ERROR; } ); + PARSEC_CUDA_CHECK_ERROR( "cudaMemcpyAsync", ret, { return PARSEC_ERROR; } ); } } @@ -102,7 +103,7 @@ stage_stride_out(parsec_gpu_task_t *gtask, width, height, cudaMemcpyDeviceToHost, cuda_stream->cuda_stream ); - PARSEC_CUDA_CHECK_ERROR( "cudaMemcpyAsync ", ret, { return PARSEC_ERROR; } ); + PARSEC_CUDA_CHECK_ERROR( "cudaMemcpyAsync", ret, { return PARSEC_ERROR; } ); } } return PARSEC_SUCCESS; @@ -160,7 +161,7 @@ BODY [type=CUDA (double*)A, ldam, lbeta, (double*)A, ldam ); status = cublasGetError(); - PARSEC_CUDA_CHECK_ERROR( "cublasDgemm ", status, + PARSEC_CUDA_CHECK_ERROR( "cublasDgemm", status, {return -1;} ); } END @@ -201,7 +202,7 @@ BODY [type=CUDA (double*)B, ldbm, lbeta, (double*)B, ldbm ); status = cublasGetError(); - PARSEC_CUDA_CHECK_ERROR( "cublasDgemm ", status, + PARSEC_CUDA_CHECK_ERROR( "cublasDgemm", status, {return -1;} ); } diff --git a/tests/runtime/cuda/stress.jdf b/tests/runtime/cuda/stress.jdf index edd2942ef..64f983bdf 100644 --- a/tests/runtime/cuda/stress.jdf +++ b/tests/runtime/cuda/stress.jdf @@ -3,6 +3,7 @@ extern "C" %{ * Copyright (c) 2019-2023 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. + * Copyright (c) 2024 NVIDIA Corporation. All rights reserved. 
*/ #include "parsec/parsec_config.h" @@ -140,7 +141,7 @@ BODY [type=CUDA (double*)B, descA->super.mb, 1.0, (double*)C, descA->super.mb ); status = cublasGetError(); - PARSEC_CUDA_CHECK_ERROR( "cublasZgemm ", status, + PARSEC_CUDA_CHECK_ERROR( "cublasZgemm", status, {return -1;} ); } END From b4440a99527b65508b4b4bfe7924545a4b9b3659 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Wed, 31 Jan 2024 14:01:38 -0500 Subject: [PATCH 11/15] Move comm profiling initialization into comm thread Signed-off-by: Joseph Schuchart --- parsec/remote_dep_mpi.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/parsec/remote_dep_mpi.c b/parsec/remote_dep_mpi.c index 3e1c1c0cf..8add18a84 100644 --- a/parsec/remote_dep_mpi.c +++ b/parsec/remote_dep_mpi.c @@ -190,6 +190,12 @@ static int remote_dep_nothread_memcpy(parsec_execution_stream_t* es, int remote_dep_ce_reconfigure(parsec_context_t* context); +#ifdef PARSEC_PROF_TRACE +static void remote_dep_mpi_profiling_init(void); +#else +#define remote_dep_mpi_profiling_init() do {} while(0) +#endif // PARSEC_PROF_TRACE + static void remote_dep_mpi_params(parsec_context_t* context) { (void)context; @@ -271,6 +277,8 @@ remote_dep_dequeue_init(parsec_context_t* context) pthread_attr_init(&thread_attr); pthread_attr_setscope(&thread_attr, PTHREAD_SCOPE_SYSTEM); + remote_dep_mpi_profiling_init(); + /* From now on the communication capabilities are enabled */ parsec_communication_engine_up = 1; if(context->nb_nodes == 1) { @@ -426,6 +434,11 @@ void* remote_dep_dequeue_main(parsec_context_t* context) pthread_mutex_lock(&mpi_thread_mutex); pthread_cond_signal(&mpi_thread_condition); +#ifdef PARSEC_PROF_TRACE + parsec_comm_es.es_profile = parsec_profiling_stream_init( 2*1024*1024, "Comm thread"); + parsec_profiling_set_default_thread(parsec_comm_es.es_profile); +#endif // PARSEC_PROF_TRACE + /* This is the main loop. Wait until being woken up by the main thread, do * the MPI stuff until we get the OFF or FINI commands. Then react the them. 
      * However, the first time do the delayed initialization that could not have
@@ -1219,9 +1232,6 @@ static void remote_dep_mpi_profiling_init(void)
                                             sizeof(parsec_profile_remote_dep_mpi_info_t),
                                             parsec_profile_remote_dep_mpi_info_to_string,
                                             &MPI_Data_pldr_sk, &MPI_Data_pldr_ek);
-
-    parsec_comm_es.es_profile = parsec_profiling_stream_init( 2*1024*1024, "Comm thread");
-    parsec_profiling_set_default_thread(parsec_comm_es.es_profile);
 }

 static void remote_dep_mpi_profiling_fini(void)
@@ -1241,7 +1251,6 @@ static inline uint64_t remote_dep_mpi_profiling_event_id(void)
 }

 #else

-#define remote_dep_mpi_profiling_init() do {} while(0)
 #define remote_dep_mpi_profiling_fini() do {} while(0)
 #define remote_dep_mpi_profiling_event_id() (0UL)

@@ -2268,7 +2277,6 @@ remote_dep_ce_init(parsec_context_t* context)
                            1);
     /* Lazy or delayed initializations */
     remote_dep_mpi_initialize_execution_stream(context);
-    remote_dep_mpi_profiling_init();

     return PARSEC_SUCCESS;
 }

From e7dca15b9ebcdc5b07df07574caada814a11b97a Mon Sep 17 00:00:00 2001
From: Aurelien Bouteiller
Date: Wed, 31 Jan 2024 13:25:52 -0500
Subject: [PATCH 12/15] Fix alignment in device stats and printout of nan values when 0 tasks have run

---
 parsec/mca/device/device.c | 44 ++++++++++++++++----------------------
 1 file changed, 18 insertions(+), 26 deletions(-)

diff --git a/parsec/mca/device/device.c b/parsec/mca/device/device.c
index 07051f666..b095ec235 100644
--- a/parsec/mca/device/device.c
+++ b/parsec/mca/device/device.c
@@ -26,6 +26,7 @@
 #if defined(PARSEC_HAVE_STRING_H)
 #include <string.h>
 #endif  /* defined(PARSEC_HAVE_STRING_H) */
+#include <math.h>
 #if defined(__WINDOWS__)
 #include <windows.h>
 #endif  /* defined(__WINDOWS__) */
@@ -375,7 +376,6 @@ void parsec_devices_print_statistics(parsec_context_t *parsec_context, uint64_t
     float best_required_in, best_required_out;
     char *data_in_unit, *data_out_unit, *d2d_unit;
     char *required_in_unit, *required_out_unit;
-    char percent1[64], percent2[64], percent3[64];
     parsec_device_module_t *device;
     uint32_t i;
@@ -407,9 +407,8 @@
     }

     /* Print statistics */
-    if( 0 == total_data_in )  total_data_in  = 1;
-    if( 0 == total_data_out ) total_data_out = 1;
     gtotal = (float)total_tasks;
+    double percent_in, percent_out, percent_d2d;

     printf("+----------------------------------------------------------------------------------------------------------------------------+\n");
     printf("| | | Data In | Data Out |\n");
@@ -425,14 +424,16 @@
         parsec_compute_best_unit( transferred_out[i], &best_data_out, &data_out_unit );
         parsec_compute_best_unit( transferred_d2d[i], &best_d2d, &d2d_unit );

+        percent_in  = (0 == required_in[i])?  nan(""): (((double)transferred_in[i])  / (double)required_in[i] ) * 100.0;
+        percent_d2d = (0 == required_in[i])?  nan(""): (((double)transferred_d2d[i]) / (double)required_in[i] ) * 100.0;
+        percent_out = (0 == required_out[i])? nan(""): (((double)transferred_out[i]) / (double)required_out[i] ) * 100.0;
+        printf("| Dev %2d |%10"PRIu64" | %6.2f | %8.2f%2s | %8.2f%2s(%5.2f) | %8.2f%2s(%5.2f) | %8.2f%2s | %8.2f%2s(%5.2f) | %s\n",
                device->device_index, executed_tasks[i], (executed_tasks[i]/gtotal)*100.00,
-               best_required_in, required_in_unit, best_data_in, data_in_unit,
-               (((double)transferred_in[i]) / (double)required_in[i] ) * 100.0,
-               best_d2d, d2d_unit,
-               (((double)transferred_d2d[i])/ (double)required_in[i]) * 100.0,
-               best_required_out, required_out_unit, best_data_out, data_out_unit,
-               (((double)transferred_out[i]) / (double)required_out[i]) * 100.0, device->name );
+               best_required_in, required_in_unit, best_data_in, data_in_unit, percent_in,
+               best_d2d, d2d_unit, percent_d2d,
+               best_required_out, required_out_unit, best_data_out, data_out_unit, percent_out,
+               device->name );
     }

     printf("|---------|-----------|--------|------------|-----------------------|-----------------------|------------|-------------------|\n");
@@ -443,26 +444,17 @@
     parsec_compute_best_unit( total_data_out, &best_data_out, &data_out_unit );
     parsec_compute_best_unit( total_d2d, &best_d2d, &d2d_unit );

-    if( 0 == total_required_in ) {
-        snprintf(percent1, 64, "nan");
-        snprintf(percent2, 64, "nan");
-    } else {
-        snprintf(percent1, 64, "%5.2f", ((double)total_data_in / (double)total_required_in ) * 100.0);
-        snprintf(percent2, 64, "%5.2f", ((double)total_d2d / (double)total_required_in) * 100.0);
-    }
-    if( 0 == total_required_out ) {
-        snprintf(percent3, 64, "nan");
-    } else {
-        snprintf(percent3, 64, "%5.2f", ((double)total_data_out / (double)total_required_out) * 100.0);
-    }
-    printf("|All Devs |%10"PRIu64" | %5.2f | %8.2f%2s | %8.2f%2s(%s) | %8.2f%2s(%s) | %8.2f%2s | %8.2f%2s(%s) |\n",
+    percent_in  = (0 == total_required_in)?  nan(""): (((double)total_data_in) / (double)total_required_in) * 100.0;
+    percent_d2d = (0 == total_required_in)?  nan(""): (((double)total_d2d) / (double)total_required_in) * 100.0;
+    percent_out = (0 == total_required_out)? nan(""): (((double)total_data_out) / (double)total_required_out) * 100.0;
+
+    printf("|All Devs |%10"PRIu64" | %6.2f | %8.2f%2s | %8.2f%2s(%5.2f) | %8.2f%2s(%5.2f) | %8.2f%2s | %8.2f%2s(%5.2f) |\n",
            total_tasks, (total_tasks/gtotal)*100.00,
-           best_required_in, required_in_unit, best_data_in, data_in_unit, percent1,
-           best_d2d, d2d_unit, percent2,
-           best_required_out, required_out_unit, best_data_out, data_out_unit, percent3);
+           best_required_in, required_in_unit, best_data_in, data_in_unit, percent_in,
+           best_d2d, d2d_unit, percent_d2d,
+           best_required_out, required_out_unit, best_data_out, data_out_unit, percent_out);
     printf("+----------------------------------------------------------------------------------------------------------------------------+\n");
-
     parsec_devices_free_statistics(&end_stats);
 }

From b22edb3e0189e66830b50d28ccc2c17eb92c1b07 Mon Sep 17 00:00:00 2001
From: Aurelien Bouteiller
Date: Thu, 1 Feb 2024 18:50:11 -0500
Subject: [PATCH 13/15] fix units in device outputs

---
 parsec/mca/device/CMakeLists.txt            | 2 +-
 parsec/mca/device/cuda/device_cuda_module.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/parsec/mca/device/CMakeLists.txt b/parsec/mca/device/CMakeLists.txt
index 32fd11730..85d23ba0d 100644
--- a/parsec/mca/device/CMakeLists.txt
+++ b/parsec/mca/device/CMakeLists.txt
@@ -10,7 +10,7 @@ set_property(TARGET parsec
              mca/device/device_gpu.h)

 set(PARSEC_HAVE_DEV_CPU_SUPPORT 1 CACHE BOOL "PaRSEC has support for CPU kernels")
-set(PARSEC_HAVE_DEV_RECURSIVE_SUPPORT 0 CACHE BOOL "PaRSEC has support for CPU kernels")
+set(PARSEC_HAVE_DEV_RECURSIVE_SUPPORT 0 CACHE BOOL "PaRSEC has support for Recursive CPU kernels")
 if(PARSEC_HAVE_CUDA)
   set(PARSEC_HAVE_DEV_CUDA_SUPPORT 1 CACHE BOOL "PaRSEC support for CUDA")
 endif(PARSEC_HAVE_CUDA)

diff --git a/parsec/mca/device/cuda/device_cuda_module.c b/parsec/mca/device/cuda/device_cuda_module.c
index c40c4e15f..3b11f4a16 100644
--- a/parsec/mca/device/cuda/device_cuda_module.c
+++ b/parsec/mca/device/cuda/device_cuda_module.c
@@ -552,7 +552,7 @@ parsec_cuda_module_init( int dev_id, parsec_device_module_t** module )
                           prop.pciBusID, prop.pciDeviceID, prop.pciDomainID,
                           streaming_multiprocessor, freqHz*1e-9f,
-                          fp64, fp32, tf32, fp16,
+                          fp64*1e-3, fp32*1e-3, tf32*1e-3, fp16*1e-3,
                           2.0*prop.memoryClockRate*(prop.memoryBusWidth/8)/1.0e6, prop.memoryClockRate*1e-6, prop.memoryBusWidth,
                           (concurrency == 1)? "yes": "no", computemode);

From 354ce08d9f87d46633ff039db82865022d5e2b90 Mon Sep 17 00:00:00 2001
From: Aurelien Bouteiller
Date: Wed, 14 Feb 2024 14:33:05 -0500
Subject: [PATCH 14/15] bug: even when DEP_SHORT==0, we need space to store data_sizes in the packed message.
---
 parsec/remote_dep_mpi.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/parsec/remote_dep_mpi.c b/parsec/remote_dep_mpi.c
index 8add18a84..cc5a3c7b5 100644
--- a/parsec/remote_dep_mpi.c
+++ b/parsec/remote_dep_mpi.c
@@ -87,7 +87,7 @@ remote_dep_cmd_to_string(remote_dep_wire_activate_t* origin,
 #define dep_dtt parsec_datatype_int8_t
 #define dep_count sizeof(remote_dep_wire_activate_t)
 #define dep_extent dep_count
-#define DEP_SHORT_BUFFER_SIZE (dep_extent+RDEP_MSG_SHORT_LIMIT)
+#define DEP_SHORT_BUFFER_SIZE (dep_extent+RDEP_MSG_SHORT_LIMIT+(1+MAX_DEP_IN_COUNT)*sizeof(uint32_t))
 #if PARSEC_SIZEOF_VOID_P == 4
 #define datakey_dtt parsec_datatype_int32_t
 #else
@@ -1270,8 +1270,8 @@ static inline uint64_t remote_dep_mpi_profiling_event_id(void)
 static int remote_dep_mpi_pack_dep(int peer,
                                    dep_cmd_item_t* item,
                                    char* packed_buffer,
-                                   uint32_t length,
-                                   int32_t* position)
+                                   int length,
+                                   int* position)
 {
     parsec_remote_deps_t *deps = (parsec_remote_deps_t*)item->cmd.activate.task.source_deps;
     remote_dep_wire_activate_t* msg = &deps->msg;
@@ -1295,9 +1295,9 @@ static int remote_dep_mpi_pack_dep(int peer,
         if( !(deps->output[k].rank_bits[peer_bank] & peer_mask) ) continue;
         data_idx++;
     }
-    if( (length - (*position)) < (dsize + (data_idx + 1) * (uint32_t)sizeof(uint32_t)) ) {  /* no room. bail out */
+    if( (length - (*position)) < (dsize + (data_idx + 1) * (int)sizeof(uint32_t)) ) {  /* no room. bail out */
         PARSEC_DEBUG_VERBOSE(20, parsec_comm_output_stream, "Can't pack at %d/%d. Bail out!", *position, length);
-        if( length < (dsize + (data_idx + 1) * (uint32_t)sizeof(uint32_t)) ) {
+        if( length < (dsize + (data_idx + 1) * (int)sizeof(uint32_t)) ) {
             parsec_fatal("The header plus data cannot be sent on a single message "
                          "(need %zd but have %zd)\n",
                          length, dsize + data_idx * sizeof(uint32_t));

From c3c3ae49c7022a54fd4148d30e2cc11a7e9d2c67 Mon Sep 17 00:00:00 2001
From: Aurelien Bouteiller
Date: Wed, 14 Feb 2024 16:44:34 -0500
Subject: [PATCH 15/15] cleanup and merge errors in 321

---
 .github/workflows/build_cmake.yml |  6 ----
 parsec/mca/device/transfer_gpu.c  |  2 +-
 tests/apps/stencil/stencil_1D.jdf | 48 +++++++++++++++----------------
 3 files changed, 25 insertions(+), 31 deletions(-)

diff --git a/.github/workflows/build_cmake.yml b/.github/workflows/build_cmake.yml
index ad3552e53..30a9c6b05 100644
--- a/.github/workflows/build_cmake.yml
+++ b/.github/workflows/build_cmake.yml
@@ -94,12 +94,6 @@ jobs:
     # run: ctest -C $BUILD_TYPE
       run: |
         source ${{github.workspace}}/.github/CI/spack_setup.sh
-        # enable devices only in tests that explicitely require them
-        PARSEC_MCA_device_cuda_enabled=0
-        PARSEC_MCA_device_hip_enabled=0
-        # restrict memory use for oversubscribed runners
-        PARSEC_MCA_device_cuda_memory_use=10
-        PARSEC_MCA_device_hip_memory_use=10
         ctest --output-on-failure

     - name: Save Artifact

diff --git a/parsec/mca/device/transfer_gpu.c b/parsec/mca/device/transfer_gpu.c
index 61f0895f4..d4b69ca6e 100644
--- a/parsec/mca/device/transfer_gpu.c
+++ b/parsec/mca/device/transfer_gpu.c
@@ -108,7 +108,7 @@ datatype_lookup_of_gpu_d2h_task( parsec_execution_stream_t * es,
                                  uint32_t * flow_mask,
                                  parsec_dep_data_description_t * data)
 {
-    (void)es; (void)this_task; (void)flow_mask; (void)data;
+    (void)es; (void)this_task; (void)parent_task; (void)flow_mask; (void)data;
     return PARSEC_SUCCESS;
 }

diff --git a/tests/apps/stencil/stencil_1D.jdf b/tests/apps/stencil/stencil_1D.jdf
index 91b392107..cffe28807 100644
--- a/tests/apps/stencil/stencil_1D.jdf
+++ b/tests/apps/stencil/stencil_1D.jdf
@@ -10,33 +10,33 @@ extern "C" %{
 const int sizeof_datatype = sizeof(DTYPE);

 /**
- * @brief stencil_1D copy data to ghost region 
+ * @brief stencil_1D copy data to ghost region
  *
- * @param [out] A0: output data 
- * @param [in] AL: left input data 
- * @param [in] AR: right input data 
- * @param [in] MB: row tile size 
- * @param [in] NB: column tile size 
- * @param [in] myrank: my rank 
- * @param [in] rank_L: rank of left neighbor 
- * @param [in] rank_R: rank of right neighbor 
- * @param [in] R: radius of ghost region 
- * @param [in] n: column index 
- * @param [in] n_max: max column index 
+ * @param [out] A0: output data
+ * @param [in] AL: left input data
+ * @param [in] AR: right input data
+ * @param [in] MB: row tile size
+ * @param [in] NB: column tile size
+ * @param [in] myrank: my rank
+ * @param [in] rank_L: rank of left neighbor
+ * @param [in] rank_R: rank of right neighbor
+ * @param [in] R: radius of ghost region
+ * @param [in] n: column index
+ * @param [in] n_max: max column index
  */
 static void CORE_copydata_stencil_1D(DTYPE *A0, DTYPE *AL, DTYPE *AR, int MB,
                                      int NB, int myrank, int rank_L, int rank_R,
                                      int R, int n, int n_max) {
     int disp_AL, disp_AR, disp_A0;

-    /* Displacement of AL */ 
+    /* Displacement of AL */
     if( myrank == rank_L ) {
         disp_AL = MB * (NB - 2 * R);
     } else {
-        disp_AL = 0; 
+        disp_AL = 0;
     }

-    /* Copy AL to left ghost region of A0 */ 
+    /* Copy AL to left ghost region of A0 */
     if( n > 0 ) {
         memcpy((void *)A0, (void *)(AL+disp_AL), MB*R*sizeof(DTYPE));
     }
@@ -71,7 +71,7 @@ R [ type = "int" ]
 task(t, n)

 t = 0 .. iter
-m = t % descA->lmt 
+m = t % descA->lmt
 n = 0 .. descA->lnt-1

 myrank = descA->super.myrank
@@ -104,7 +104,7 @@ END
 extern "C" %{

 /**
- * @brief Stencil 1D, no-blocking 
+ * @brief Stencil 1D, no-blocking
  *
  * @param [inout] dcA: the data, already distributed and allocated
  * @param [in] iter: iterations
@@ -132,14 +132,14 @@ parsec_stencil_1D_New(parsec_tiled_matrix_t *dcA, int iter, int R)
         exit(1);
     }

-    taskpool = parsec_stencil_1D_new(dcA, iter, R); 
+    taskpool = parsec_stencil_1D_new(dcA, iter, R);
     stencil_1D_taskpool = (parsec_taskpool_t*)taskpool;

     parsec_add2arena( &taskpool->arenas_datatypes[PARSEC_stencil_1D_FULL_ADT_IDX],
                       MY_TYPE, PARSEC_MATRIX_FULL, 1,
                       dcA->mb, dcA->nb, dcA->mb,
                       PARSEC_ARENA_ALIGNMENT_SSE, -1 );
-    
+
     parsec_add2arena( &taskpool->arenas_datatypes[PARSEC_stencil_1D_LR_ADT_IDX],
                       MY_TYPE, PARSEC_MATRIX_FULL, 1,
                       dcA->mb, R, dcA->mb,
@@ -160,11 +160,11 @@ void parsec_stencil_1D_Destruct(parsec_taskpool_t *taskpool)
 }

 /**
- * @brief Stencil 1D 
- * 
+ * @brief Stencil 1D
+ *
  * @param [inout] dcA: the data, already distributed and allocated
- * @param [in] iter: iterations 
- * @param [in] R: radius 
+ * @param [in] iter: iterations
+ * @param [in] R: radius
  */
 int parsec_stencil_1D(parsec_context_t *parsec,
                       parsec_tiled_matrix_t *A,
@@ -172,7 +172,7 @@ int parsec_stencil_1D(parsec_context_t *parsec,
 {
     parsec_taskpool_t *parsec_stencil_1D = NULL;

-    parsec_stencil_1D = parsec_stencil_1D_New(A, iter, R); 
+    parsec_stencil_1D = parsec_stencil_1D_New(A, iter, R);

     if( parsec_stencil_1D != NULL ){
         parsec_enqueue(parsec, parsec_stencil_1D);
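
For orientation, the enqueue above is one step of the usual PaRSEC taskpool
lifecycle. A minimal sketch of that pattern follows; it reuses names visible in
this file, but the start/wait calls and the surrounding control flow are an
assumed illustration of the common idiom, not the literal remainder of the
function:

    /* Sketch of the typical taskpool lifecycle around parsec_enqueue();
     * assumed pattern, not the actual continuation of stencil_1D.jdf. */
    parsec_taskpool_t *tp = parsec_stencil_1D_New(A, iter, R);
    if( NULL != tp ) {
        parsec_enqueue(parsec, tp);      /* hand the task graph to the runtime */
        parsec_context_start(parsec);    /* release the computation threads    */
        parsec_context_wait(parsec);     /* block until every task completed   */
        parsec_stencil_1D_Destruct(tp);  /* free the taskpool and its arenas   */
    }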