diff --git a/mpi-proxy-split/Makefile b/mpi-proxy-split/Makefile index 345303ce4..5ad1fdbb6 100644 --- a/mpi-proxy-split/Makefile +++ b/mpi-proxy-split/Makefile @@ -30,7 +30,7 @@ WRAPPERS_SRCDIR=mpi-wrappers # As you add new files to your plugin library, add the object file names here. LIBOBJS = mpi_plugin.o p2p_drain_send_recv.o p2p_log_replay.o \ - record-replay.o seq_num.o \ + seq_num.o virtual-ids.o \ split_process.o ${LOWER_HALF_SRCDIR}/procmapsutils.o #MANA_COORD_OBJS = mana_coordinator.o @@ -67,8 +67,18 @@ libmana.so: ${LIBOBJS} ${WRAPPERS_SRCDIR}/libmpiwrappers.a ${CXX} -shared -fPIC -g3 -O0 -o $@ ${LIBOBJS} -Wl,--whole-archive ${WRAPPERS_SRCDIR}/libmpiwrappers.a -Wl,--no-whole-archive mpi_unimplemented_wrappers.cpp: generate-mpi-unimplemented-wrappers.py \ - mpi_unimplemented_wrappers.txt - python $^ > $@ + mpi_unimplemented_wrappers_mpich.txt \ + mpi_unimplemented_wrappers_openmpi.txt \ + mpi_unimplemented_wrappers_exampi.txt + if mpiexec -h | grep -q 'mpich'; then \ + python generate-mpi-unimplemented-wrappers.py mpi_unimplemented_wrappers_mpich.txt > $@; \ + elif mpiexec -h | grep -q 'open-mpi'; then \ + python generate-mpi-unimplemented-wrappers.py mpi_unimplemented_wrappers_openmpi.txt > $@; \ + elif mpiexec -h | grep -q 'ExaMPI'; then \ + python generate-mpi-unimplemented-wrappers.py mpi_unimplemented_wrappers_exampi.txt > $@; \ + else \ + echo 'The MPI flavor could not be identified.'; exit 1; \ 
+ fi .c.o: ${MPICC} ${CFLAGS} -c -o $@ $< diff --git a/mpi-proxy-split/lower-half/libproxy.c b/mpi-proxy-split/lower-half/libproxy.c index 47ff41136..29d7365ce 100644 --- a/mpi-proxy-split/lower-half/libproxy.c +++ b/mpi-proxy-split/lower-half/libproxy.c @@ -68,8 +68,29 @@ static void* MPI_Fnc_Ptrs[] = { NULL, }; +#define INIT_CONST_MAP(const) mpi_constants[LH_MPI_##const] = MPI_##const; + +static int mpi_constants_initialized = 0; +static void* mpi_constants[LH_MPI_Constant_Invalid + 1]; // Local functions +void* +get_lh_mpi_constant(enum MPI_Constants constant) +{ + if (!mpi_constants_initialized) { + mpi_constants[LH_MPI_Constant_NULL] = NULL; + FOREACH_CONSTANT(INIT_CONST_MAP) + mpi_constants[LH_MPI_ERRORS_RETURN] = 0; + mpi_constants[LH_MPI_Constant_Invalid] = NULL; + mpi_constants_initialized = 1; + } + if (constant < LH_MPI_Constant_NULL || + constant > LH_MPI_Constant_Invalid) { + return NULL; + } + return mpi_constants[constant]; +} + LhCoreRegions_t* getLhRegionsList(int *num) { @@ -346,6 +367,7 @@ void first_constructor() lh_info.g_appContext = (void*)&g_appContext; lh_info.lh_dlsym = (void*)&mydlsym; lh_info.getRankFptr = (void*)&getRank; + lh_info.lh_mpi_constants = (void*)&get_lh_mpi_constant; #ifdef SINGLE_CART_REORDER lh_info.getCoordinatesFptr = (void*)&getCoordinates; diff --git a/mpi-proxy-split/lower-half/libproxy.h b/mpi-proxy-split/lower-half/libproxy.h index a45c41053..8647ee113 100644 --- a/mpi-proxy-split/lower-half/libproxy.h +++ b/mpi-proxy-split/lower-half/libproxy.h @@ -19,6 +19,9 @@ * . 
* ****************************************************************************/ +// Included to retrieve MPICH, OPEN_MPI, EXAMPI +#include <mpi.h> + #ifndef _LIBPROXY_H #define _LIBPROXY_H @@ -40,6 +43,7 @@ do { \ #define INFO 2 // Informational logs #define ERROR 1 // Highest error/exception level +#if defined(MPICH) #define FOREACH_FNC(MACRO) \ MACRO(Init), \ MACRO(Finalize), \ @@ -404,5 +408,356 @@ do { \ MACRO(Wtime), \ MACRO(MANA_Internal), \ MACRO(Aint_diff), +#elif defined(OPEN_MPI) +#define FOREACH_FNC(MACRO) \ + MACRO(Init), \ + MACRO(Finalize), \ + MACRO(Send), \ + MACRO(Recv), \ + MACRO(Type_size), \ + MACRO(Iprobe), \ + MACRO(Get_count), \ + MACRO(Isend), \ + MACRO(Irecv), \ + MACRO(Wait), \ + MACRO(Test), \ + MACRO(Bcast), \ + MACRO(Abort), \ + MACRO(Barrier), \ + MACRO(Reduce), \ + MACRO(Allreduce), \ + MACRO(Alltoall), \ + MACRO(Alltoallv), \ + MACRO(Comm_split), \ + MACRO(Add_error_class), \ + MACRO(Add_error_code), \ + MACRO(Add_error_string), \ + MACRO(Allgather), \ + MACRO(Iallgather), \ + MACRO(Allgatherv), \ + MACRO(Iallgatherv), \ + MACRO(Iallreduce), \ + MACRO(Ialltoall), \ + MACRO(Ialltoallv), \ + MACRO(Alltoallw), \ + MACRO(Ialltoallw), \ + MACRO(Bsend), \ + MACRO(Ibcast), \ + MACRO(Bsend_init), \ + MACRO(Cancel), \ + MACRO(Cart_coords), \ + MACRO(Cart_create), \ + MACRO(Cart_get), \ + MACRO(Cart_rank), \ + MACRO(Cart_shift), \ + MACRO(Cart_sub), \ + MACRO(Comm_compare), \ + MACRO(Comm_create_group), \ + MACRO(Comm_create), \ + MACRO(Comm_dup), \ + MACRO(Comm_free), \ + MACRO(Comm_get_name), \ + MACRO(Comm_group), \ + MACRO(Comm_rank), \ + MACRO(Comm_remote_group), \ + MACRO(Comm_remote_size), \ + MACRO(Comm_set_errhandler), \ + MACRO(Comm_set_name), \ + MACRO(Comm_size), \ + MACRO(Comm_split_type), \ + MACRO(Comm_test_inter), \ + MACRO(Error_class), \ + MACRO(Error_string), \ + MACRO(Exscan), \ + MACRO(Iexscan), \ + MACRO(Finalized), \ + MACRO(Gather), \ + MACRO(Igather), \ + MACRO(Gatherv), \ + MACRO(Igatherv), \ + MACRO(Get_address), \ + 
MACRO(Get_library_version), \ + MACRO(Get_processor_name), \ + MACRO(Get_version), \ + MACRO(Group_compare), \ + MACRO(Group_difference), \ + MACRO(Group_excl), \ + MACRO(Group_free), \ + MACRO(Group_incl), \ + MACRO(Group_intersection), \ + MACRO(Group_rank), \ + MACRO(Group_size), \ + MACRO(Group_translate_ranks), \ + MACRO(Group_union), \ + MACRO(Ibsend), \ + MACRO(Info_create), \ + MACRO(Info_delete), \ + MACRO(Info_dup), \ + MACRO(Info_free), \ + MACRO(Info_get), \ + MACRO(Info_get_nkeys), \ + MACRO(Info_get_nthkey), \ + MACRO(Info_get_valuelen), \ + MACRO(Info_set), \ + MACRO(Initialized), \ + MACRO(Init_thread), \ + MACRO(Irsend), \ + MACRO(Issend), \ + MACRO(Is_thread_main), \ + MACRO(Op_create), \ + MACRO(Op_free), \ + MACRO(Pack), \ + MACRO(Pack_size), \ + MACRO(Probe), \ + MACRO(Recv_init), \ + MACRO(Ireduce), \ + MACRO(Request_free), \ + MACRO(Rsend), \ + MACRO(Rsend_init), \ + MACRO(Scan), \ + MACRO(Iscan), \ + MACRO(Scatter), \ + MACRO(Iscatter), \ + MACRO(Scatterv), \ + MACRO(Iscatterv), \ + MACRO(Send_init), \ + MACRO(Sendrecv), \ + MACRO(Ssend_init), \ + MACRO(Ssend), \ + MACRO(Start), \ + MACRO(Startall), \ + MACRO(Status_set_cancelled), \ + MACRO(Testall), \ + MACRO(Testany), \ + MACRO(Test_cancelled), \ + MACRO(Testsome), \ + MACRO(Topo_test), \ + MACRO(Type_commit), \ + MACRO(Type_contiguous), \ + MACRO(Type_create_hvector), \ + MACRO(Type_create_indexed_block), \ + MACRO(Type_create_struct), \ + MACRO(Type_create_subarray), \ + MACRO(Type_create_resized), \ + MACRO(Type_free), \ + MACRO(Type_get_extent), \ + MACRO(Type_get_name), \ + MACRO(Type_indexed), \ + MACRO(Type_set_name), \ + MACRO(Type_size_x), \ + MACRO(Type_vector), \ + MACRO(Unpack), \ + MACRO(Waitall), \ + MACRO(Waitany), \ + MACRO(Waitsome), \ + MACRO(Wtick), \ + MACRO(Wtime), \ + MACRO(MANA_Internal), +#elif defined(EXAMPI) +#define FOREACH_FNC(MACRO) \ + MACRO(Init), \ + MACRO(Finalize), \ + MACRO(Send), \ + MACRO(Recv), \ + MACRO(Type_size), \ + MACRO(Iprobe), \ + 
MACRO(Get_count), \ + MACRO(Isend), \ + MACRO(Irecv), \ + MACRO(Wait), \ + MACRO(Test), \ + MACRO(Bcast), \ + MACRO(Abort), \ + MACRO(Barrier), \ + MACRO(Reduce), \ + MACRO(Allreduce), \ + MACRO(Alltoall), \ + MACRO(Alltoallv), \ + MACRO(Comm_split), \ + MACRO(Add_error_class), \ + MACRO(Add_error_code), \ + MACRO(Add_error_string), \ + MACRO(Allgather), \ + MACRO(Iallgather), \ + MACRO(Allgatherv), \ + MACRO(Iallgatherv), \ + MACRO(Iallreduce), \ + MACRO(Ialltoall), \ + MACRO(Ialltoallv), \ + MACRO(Alltoallw), \ + MACRO(Ialltoallw), \ + MACRO(Bsend), \ + MACRO(Ibcast), \ + MACRO(Bsend_init), \ + MACRO(Cancel), \ + MACRO(Cart_coords), \ + MACRO(Cart_create), \ + MACRO(Cart_get), \ + MACRO(Cart_rank), \ + MACRO(Cart_shift), \ + MACRO(Cart_sub), \ + MACRO(Comm_compare), \ + MACRO(Comm_create_group), \ + MACRO(Comm_create), \ + MACRO(Comm_dup), \ + MACRO(Comm_free), \ + MACRO(Comm_get_name), \ + MACRO(Comm_group), \ + MACRO(Comm_rank), \ + MACRO(Comm_remote_group), \ + MACRO(Comm_remote_size), \ + MACRO(Comm_set_errhandler), \ + MACRO(Comm_set_name), \ + MACRO(Comm_size), \ + MACRO(Comm_split_type), \ + MACRO(Comm_test_inter), \ + MACRO(Error_class), \ + MACRO(Error_string), \ + MACRO(Exscan), \ + MACRO(Iexscan), \ + MACRO(Finalized), \ + MACRO(Gather), \ + MACRO(Igather), \ + MACRO(Gatherv), \ + MACRO(Igatherv), \ + MACRO(Get_address), \ + MACRO(Get_library_version), \ + MACRO(Get_processor_name), \ + MACRO(Get_version), \ + MACRO(Group_compare), \ + MACRO(Group_difference), \ + MACRO(Group_excl), \ + MACRO(Group_free), \ + MACRO(Group_incl), \ + MACRO(Group_intersection), \ + MACRO(Group_rank), \ + MACRO(Group_size), \ + MACRO(Group_translate_ranks), \ + MACRO(Group_union), \ + MACRO(Ibsend), \ + MACRO(Info_create), \ + MACRO(Info_delete), \ + MACRO(Info_dup), \ + MACRO(Info_free), \ + MACRO(Info_get), \ + MACRO(Info_get_nkeys), \ + MACRO(Info_get_nthkey), \ + MACRO(Info_get_valuelen), \ + MACRO(Info_set), \ + MACRO(Initialized), \ + MACRO(Init_thread), \ + 
MACRO(Irsend), \ + MACRO(Issend), \ + MACRO(Is_thread_main), \ + MACRO(Op_create), \ + MACRO(Op_free), \ + MACRO(Pack), \ + MACRO(Pack_size), \ + MACRO(Probe), \ + MACRO(Recv_init), \ + MACRO(Ireduce), \ + MACRO(Request_free), \ + MACRO(Rsend), \ + MACRO(Rsend_init), \ + MACRO(Scan), \ + MACRO(Iscan), \ + MACRO(Scatter), \ + MACRO(Iscatter), \ + MACRO(Scatterv), \ + MACRO(Iscatterv), \ + MACRO(Send_init), \ + MACRO(Sendrecv), \ + MACRO(Ssend_init), \ + MACRO(Ssend), \ + MACRO(Start), \ + MACRO(Startall), \ + MACRO(Status_set_cancelled), \ + MACRO(Testall), \ + MACRO(Testany), \ + MACRO(Test_cancelled), \ + MACRO(Testsome), \ + MACRO(Topo_test), \ + MACRO(Type_commit), \ + MACRO(Type_contiguous), \ + MACRO(Type_create_hvector), \ + MACRO(Type_create_indexed_block), \ + MACRO(Type_create_struct), \ + MACRO(Type_create_subarray), \ + MACRO(Type_create_resized), \ + MACRO(Type_free), \ + MACRO(Type_get_extent), \ + MACRO(Type_get_name), \ + MACRO(Type_indexed), \ + MACRO(Type_set_name), \ + MACRO(Type_size_x), \ + MACRO(Type_vector), \ + MACRO(Unpack), \ + MACRO(Waitall), \ + MACRO(Waitany), \ + MACRO(Waitsome), \ + MACRO(Wtick), \ + MACRO(Wtime), \ + MACRO(MANA_Internal), +#else +#error "Could not find an MPI implementation" +#endif // ifdef MPICH elseif OPEN_MPI elseif EXAMPI + +#define FOREACH_CONSTANT(MACRO) \ + MACRO(GROUP_NULL) \ + MACRO(COMM_NULL) \ + MACRO(REQUEST_NULL) \ + MACRO(OP_NULL) \ + MACRO(INFO_NULL) \ + MACRO(COMM_WORLD) \ + MACRO(COMM_SELF) \ + MACRO(GROUP_EMPTY) \ + MACRO(MAX) \ + MACRO(MIN) \ + MACRO(SUM) \ + MACRO(PROD) \ + MACRO(BAND) \ + MACRO(LOR) \ + MACRO(BOR) \ + MACRO(MAXLOC) \ + MACRO(MINLOC) \ + MACRO(DATATYPE_NULL) \ + MACRO(BYTE) \ + MACRO(PACKED) \ + MACRO(CHAR) \ + MACRO(SHORT) \ + MACRO(INT) \ + MACRO(LONG) \ + MACRO(FLOAT) \ + MACRO(DOUBLE) \ + MACRO(LONG_DOUBLE) \ + MACRO(UNSIGNED_CHAR) \ + MACRO(SIGNED_CHAR) \ + MACRO(UNSIGNED_SHORT) \ + MACRO(UNSIGNED_LONG) \ + MACRO(UNSIGNED) \ + MACRO(FLOAT_INT) \ + MACRO(DOUBLE_INT) \ + 
MACRO(LONG_DOUBLE_INT) \ + MACRO(LONG_INT) \ + MACRO(SHORT_INT) \ + MACRO(2INT) \ + MACRO(WCHAR) \ + MACRO(LONG_LONG_INT) \ + MACRO(LONG_LONG) \ + MACRO(UNSIGNED_LONG_LONG) \ + MACRO(INT8_T) \ + MACRO(UINT8_T) \ + MACRO(INT16_T) \ + MACRO(UINT16_T) \ + MACRO(INT32_T) \ + MACRO(UINT32_T) \ + MACRO(INT64_T) \ + MACRO(UINT64_T) \ + MACRO(AINT) \ + MACRO(CXX_BOOL) \ + MACRO(CXX_FLOAT_COMPLEX) \ + MACRO(CXX_DOUBLE_COMPLEX) \ + MACRO(CXX_LONG_DOUBLE_COMPLEX) \ + MACRO(ERRORS_RETURN) \ #endif // define _LIBPROXY_H diff --git a/mpi-proxy-split/lower-half/lower_half_api.h b/mpi-proxy-split/lower-half/lower_half_api.h index af030eb97..240614dff 100644 --- a/mpi-proxy-split/lower-half/lower_half_api.h +++ b/mpi-proxy-split/lower-half/lower_half_api.h @@ -27,6 +27,7 @@ #include "libproxy.h" #define GENERATE_ENUM(ENUM) MPI_Fnc_##ENUM +#define GENERATE_CONSTANT_ENUM(ENUM) LH_MPI_##ENUM, #define GENERATE_FNC_PTR(FNC) &MPI_##FNC #define GENERATE_FNC_STRING(FNC) "MPI_" #FNC #define PAGE_SIZE 0x1000 @@ -93,6 +94,9 @@ typedef struct _LowerHalfInfo void *g_appContext; // Pointer to ucontext_t of upper half application (defined in the lower half) void *lh_dlsym; // Pointer to mydlsym() function in the lower half void *getRankFptr; // Pointer to getRank() function in the lower half + void *lh_mpi_constants; // Open MPI can save its MPI constants at a different + // address each time. Copy to uh. 
+ #ifdef SINGLE_CART_REORDER void *getCoordinatesFptr; // Pointer to getCoordinates() function in the lower half void *getCartesianCommunicatorFptr; // Pointer to getCartesianCommunicator() function in the lower half @@ -113,6 +117,13 @@ enum MPI_Fncs { MPI_Fnc_Invalid, }; +enum MPI_Constants { + LH_MPI_Constant_NULL, + FOREACH_CONSTANT(GENERATE_CONSTANT_ENUM) + // LH_MPI_ERRORS_RETURN, + LH_MPI_Constant_Invalid, +}; + __attribute__ ((unused)) static const char *MPI_Fnc_strings[] = { "MPI_Fnc_NULL", @@ -134,6 +145,7 @@ typedef int (*libcFptr_t) (int (*main) (int, char **, char ** MAIN_AUXVEC_DECL), void *); typedef void* (*proxyDlsym_t)(enum MPI_Fncs fnc); +typedef void* (*lh_constant_t)(enum MPI_Constants constant); typedef void* (*updateEnviron_t)(char **environ); typedef void (*resetMmappedList_t)(); typedef MmapInfo_t* (*getMmappedList_t)(int **num); @@ -148,6 +160,7 @@ extern LowerHalfInfo_t lh_info; // the transient lh_proxy process in DMTCP_EVENT_INIT. // initializeLowerHalf() will initialize this to: (proxyDlsym_t)lh_info.lh_dlsym extern proxyDlsym_t pdlsym; +extern lh_constant_t lh_mpi_constants; extern LhCoreRegions_t lh_regions_list[MAX_LH_REGIONS]; // API @@ -156,6 +169,9 @@ extern LhCoreRegions_t lh_regions_list[MAX_LH_REGIONS]; // the given enum value extern void *mydlsym(enum MPI_Fncs fnc); +// Gets the value of an MPI constant in the lower half, for these can differ. 
+extern void *get_lh_mpi_constant(enum MPI_Constants constant); + // Initializes the MPI library in the lower half (by calling MPI_Init()) and // returns the MPI rank of the current process extern int getRank(); diff --git a/mpi-proxy-split/mpi-wrappers/Makefile b/mpi-proxy-split/mpi-wrappers/Makefile index 57a5dea0d..6afb5747c 100644 --- a/mpi-proxy-split/mpi-wrappers/Makefile +++ b/mpi-proxy-split/mpi-wrappers/Makefile @@ -52,8 +52,19 @@ ${LIBNAME}.a: ${LIBWRAPPER_OBJS} ar cr $@ $^ mpi_unimplemented_wrappers.cpp: generate-mpi-unimplemented-wrappers.py \ - mpi_unimplemented_wrappers.txt - python3 $^ > $@ + mpi_unimplemented_wrappers_mpich.txt \ + mpi_unimplemented_wrappers_openmpi.txt \ + mpi_unimplemented_wrappers_exampi.txt + if mpiexec -h | grep -q 'mpich'; then \ + python3 generate-mpi-unimplemented-wrappers.py mpi_unimplemented_wrappers_mpich.txt > $@; \ + elif mpiexec -h | grep -q 'open-mpi'; then \ + python3 generate-mpi-unimplemented-wrappers.py mpi_unimplemented_wrappers_openmpi.txt > $@; \ + elif mpiexec -h | grep -q 'ExaMPI'; then \ + python3 generate-mpi-unimplemented-wrappers.py mpi_unimplemented_wrappers_exampi.txt > $@; \ + else \ + echo 'Could not identify the MPI flavor'; \ + exit 1; \ + fi .c.o: ${MPICC} ${CFLAGS} -c -o $@ $< @@ -101,12 +112,23 @@ mpi_stub_wrappers.c: generate-mpi-stub-wrappers.py mpi_stub_wrappers.txt python3 $^ >> $$tmp && mv -f $$tmp $@ || (rm -f $$tmp && false) mpi_fortran_wrappers.cpp: generate-mpi-fortran-wrappers.py \ - mpi_fortran_wrappers.txt + mpi_fortran_wrappers_openmpi.txt \ + mpi_fortran_wrappers_mpich.txt rm -f $@ tmp=$@.tmp.$$$$ ; \ printf "%s\n\n" \ "// *** THIS FILE IS AUTO-GENERATED! DO 'make' TO UPDATE. 
***" >$$tmp;\ - python3 $^ >> $$tmp && mv -f $$tmp $@ || (rm -f $$tmp && false) + if mpiexec -h | grep -q 'mpich'; then \ + python3 generate-mpi-fortran-wrappers.py mpi_fortran_wrappers_mpich.txt >> $$tmp && mv -f $$tmp $@; \ + elif mpiexec -h | grep -q 'open-mpi'; then \ + python3 generate-mpi-fortran-wrappers.py mpi_fortran_wrappers_openmpi.txt >> $$tmp && mv -f $$tmp $@; \ + elif mpiexec -h | grep -q 'ExaMPI'; then \ + echo 'ExaMPI has no fortran support.'; \ + exit 1; \ + else \ + echo 'The MPI flavor could not be identified.'; \ + exit 1; \ + fi mpi_stub_wrappers.o: mpi_stub_wrappers.c ${MPICC} ${CFLAGS} -c -o $@ $< diff --git a/mpi-proxy-split/mpi-wrappers/mpi_cart_wrappers.cpp b/mpi-proxy-split/mpi-wrappers/mpi_cart_wrappers.cpp index 082c7ea22..20909b87a 100644 --- a/mpi-proxy-split/mpi-wrappers/mpi_cart_wrappers.cpp +++ b/mpi-proxy-split/mpi-wrappers/mpi_cart_wrappers.cpp @@ -28,7 +28,6 @@ #include "protectedfds.h" #include "mpi_nextfunc.h" -#include "record-replay.h" #include "virtual-ids.h" #ifdef SINGLE_CART_REORDER #include "two-phase-algo.h" @@ -37,8 +36,6 @@ #endif #include "p2p_drain_send_recv.h" -using namespace dmtcp_mpi; - USER_DEFINED_WRAPPER(int, Cart_coords, (MPI_Comm) comm, (int) rank, (int) maxdims, (int*) coords) { @@ -65,6 +62,7 @@ USER_DEFINED_WRAPPER(int, Cart_get, (MPI_Comm) comm, (int) maxdims, return retval; } +#if defined(MPICH) USER_DEFINED_WRAPPER(int, Cart_map, (MPI_Comm) comm, (int) ndims, (const int*) dims, (const int*) periods, (int *) newrank) { @@ -75,14 +73,14 @@ USER_DEFINED_WRAPPER(int, Cart_map, (MPI_Comm) comm, (int) ndims, // FIXME: Need to virtualize this newrank??
retval = NEXT_FUNC(Cart_map)(realComm, ndims, dims, periods, newrank); RETURN_TO_UPPER_HALF(); - if (retval == MPI_SUCCESS && MPI_LOGGING()) { - FncArg ds = CREATE_LOG_BUF(dims, ndims * sizeof(int)); - FncArg ps = CREATE_LOG_BUF(periods, ndims * sizeof(int)); - LOG_CALL(restoreCarts, Cart_map, comm, ndims, ds, ps, newrank); + if (retval == MPI_SUCCESS && mana_state != RESTART_REPLAY) { + // FncArg ds = CREATE_LOG_BUF(dims, ndims * sizeof(int)); + // FncArg ps = CREATE_LOG_BUF(periods, ndims * sizeof(int)); } DMTCP_PLUGIN_ENABLE_CKPT(); return retval; } +#endif // defined(MPICH) USER_DEFINED_WRAPPER(int, Cart_rank, (MPI_Comm) comm, (const int*) coords, (int *) rank) @@ -107,9 +105,7 @@ USER_DEFINED_WRAPPER(int, Cart_shift, (MPI_Comm) comm, (int) direction, retval = NEXT_FUNC(Cart_shift)(realComm, direction, disp, rank_source, rank_dest); RETURN_TO_UPPER_HALF(); - if (retval == MPI_SUCCESS && MPI_LOGGING()) { - LOG_CALL(restoreCarts, Cart_shift, comm, direction, - disp, *rank_source, *rank_dest); + if (retval == MPI_SUCCESS && mana_state != RESTART_REPLAY) { } DMTCP_PLUGIN_ENABLE_CKPT(); return retval; @@ -125,20 +121,18 @@ USER_DEFINED_WRAPPER(int, Cart_sub, (MPI_Comm) comm, JUMP_TO_LOWER_HALF(lh_info.fsaddr); retval = NEXT_FUNC(Cart_sub)(realComm, remain_dims, new_comm); RETURN_TO_UPPER_HALF(); - if (retval == MPI_SUCCESS && MPI_LOGGING()) { - int ndims = 0; - MPI_Cartdim_get(comm, &ndims); + if (retval == MPI_SUCCESS && mana_state != RESTART_REPLAY) { MPI_Comm virtComm = ADD_NEW_COMM(*new_comm); - VirtualGlobalCommId::instance().createGlobalId(virtComm); + grant_ggid(virtComm); *new_comm = virtComm; active_comms.insert(virtComm); - FncArg rs = CREATE_LOG_BUF(remain_dims, ndims * sizeof(int)); - LOG_CALL(restoreCarts, Cart_sub, comm, ndims, rs, virtComm); + // FncArg rs = CREATE_LOG_BUF(remain_dims, ndims * sizeof(int)); } DMTCP_PLUGIN_ENABLE_CKPT(); return retval; } +#if defined(MPICH) USER_DEFINED_WRAPPER(int, Cartdim_get, (MPI_Comm) comm, (int *) ndims) { int 
retval; @@ -161,6 +155,7 @@ USER_DEFINED_WRAPPER(int, Dims_create, (int)nnodes, (int)ndims, (int *)dims) DMTCP_PLUGIN_ENABLE_CKPT(); return retval; } +#endif // defined(MPICH) #ifdef SINGLE_CART_REORDER // This variable holds the cartesian properties and is only used at the time of @@ -200,16 +195,15 @@ USER_DEFINED_WRAPPER(int, Cart_create, (MPI_Comm)old_comm, (int)ndims, g_cartesian_properties.ndims, g_cartesian_properties.coordinates); - if (retval == MPI_SUCCESS && MPI_LOGGING()) { + if (retval == MPI_SUCCESS && mana_state != RESTART_REPLAY) { MPI_Comm virtComm = ADD_NEW_COMM(*comm_cart); - VirtualGlobalCommId::instance().createGlobalId(virtComm); + grant_ggid(virtComm); *comm_cart = virtComm; active_comms.insert(virtComm); - FncArg ds = CREATE_LOG_BUF(dims, ndims * sizeof(int)); - FncArg ps = CREATE_LOG_BUF(periods, ndims * sizeof(int)); - LOG_CALL(restoreCarts, Cart_create, old_comm, ndims, ds, ps, reorder, - virtComm); + // FncArg ds = CREATE_LOG_BUF(dims, ndims * sizeof(int)); + // FncArg ps = CREATE_LOG_BUF(periods, ndims * sizeof(int)); + // virtComm); } DMTCP_PLUGIN_ENABLE_CKPT(); return retval; @@ -249,15 +243,14 @@ USER_DEFINED_WRAPPER(int, Cart_create, (MPI_Comm) old_comm, (int) ndims, retval = NEXT_FUNC(Cart_create)(realComm, ndims, dims, periods, reorder, comm_cart); RETURN_TO_UPPER_HALF(); - if (retval == MPI_SUCCESS && MPI_LOGGING()) { + if (retval == MPI_SUCCESS && mana_state != RESTART_REPLAY) { MPI_Comm virtComm = ADD_NEW_COMM(*comm_cart); - VirtualGlobalCommId::instance().createGlobalId(virtComm); + grant_ggid(virtComm); *comm_cart = virtComm; active_comms.insert(virtComm); - FncArg ds = CREATE_LOG_BUF(dims, ndims * sizeof(int)); - FncArg ps = CREATE_LOG_BUF(periods, ndims * sizeof(int)); - LOG_CALL(restoreCarts, Cart_create, old_comm, ndims, - ds, ps, reorder, virtComm); + // FncArg ds = CREATE_LOG_BUF(dims, ndims * sizeof(int)); + // FncArg ps = CREATE_LOG_BUF(periods, ndims * sizeof(int)); + // ds, ps, reorder, virtComm); } 
DMTCP_PLUGIN_ENABLE_CKPT(); return retval; @@ -272,13 +265,15 @@ PMPI_IMPL(int, MPI_Cart_create, MPI_Comm old_comm, int ndims, MPI_Comm *comm_cart) PMPI_IMPL(int, MPI_Cart_get, MPI_Comm comm, int maxdims, int dims[], int periods[], int coords[]) -PMPI_IMPL(int, MPI_Cart_map, MPI_Comm comm, int ndims, - const int dims[], const int periods[], int *newrank) PMPI_IMPL(int, MPI_Cart_rank, MPI_Comm comm, const int coords[], int *rank) PMPI_IMPL(int, MPI_Cart_shift, MPI_Comm comm, int direction, int disp, int *rank_source, int *rank_dest) PMPI_IMPL(int, MPI_Cart_sub, MPI_Comm comm, const int remain_dims[], MPI_Comm *new_comm) + +#if defined(MPICH) PMPI_IMPL(int, MPI_Cartdim_get, MPI_Comm comm, int *ndims) PMPI_IMPL(int, MPI_Dims_create, int nnodes, int ndims, int *dims) - +PMPI_IMPL(int, MPI_Cart_map, MPI_Comm comm, int ndims, + const int dims[], const int periods[], int *newrank) +#endif // defined(MPICH) diff --git a/mpi-proxy-split/mpi-wrappers/mpi_collective_wrappers.cpp b/mpi-proxy-split/mpi-wrappers/mpi_collective_wrappers.cpp index eb1026946..0f8526e62 100644 --- a/mpi-proxy-split/mpi-wrappers/mpi_collective_wrappers.cpp +++ b/mpi-proxy-split/mpi-wrappers/mpi_collective_wrappers.cpp @@ -27,7 +27,6 @@ #include "protectedfds.h" #include "mpi_plugin.h" -#include "record-replay.h" #include "mpi_nextfunc.h" #include "seq_num.h" #include "virtual-ids.h" @@ -50,7 +49,6 @@ isUsingCollectiveToP2p() { #endif } -using namespace dmtcp_mpi; #ifndef MPI_COLLECTIVE_P2P #ifdef NO_BARRIER_BCAST @@ -121,11 +119,9 @@ USER_DEFINED_WRAPPER(int, Ibcast, retval = NEXT_FUNC(Ibcast)(buffer, count, realType, root, realComm, request); RETURN_TO_UPPER_HALF(); - if (retval == MPI_SUCCESS && MPI_LOGGING()) { + if (retval == MPI_SUCCESS && mana_state != RESTART_REPLAY) { MPI_Request virtRequest = ADD_NEW_REQUEST(*request); *request = virtRequest; - LOG_CALL(restoreRequests, Ibcast, buffer, count, datatype, - root, comm, *request); #ifdef USE_REQUEST_LOG logRequestInfo(*request, IBCAST_REQUEST); 
#endif @@ -149,6 +145,7 @@ USER_DEFINED_WRAPPER(int, Barrier, (MPI_Comm) comm) return retval; } +#if defined(MPICH) EXTERNC USER_DEFINED_WRAPPER(int, Ibarrier, (MPI_Comm) comm, (MPI_Request *) request) { @@ -158,10 +155,9 @@ USER_DEFINED_WRAPPER(int, Ibarrier, (MPI_Comm) comm, (MPI_Request *) request) JUMP_TO_LOWER_HALF(lh_info.fsaddr); retval = NEXT_FUNC(Ibarrier)(realComm, request); RETURN_TO_UPPER_HALF(); - if (retval == MPI_SUCCESS && MPI_LOGGING()) { + if (retval == MPI_SUCCESS && mana_state != RESTART_REPLAY) { MPI_Request virtRequest = ADD_NEW_REQUEST(*request); *request = virtRequest; - LOG_CALL(restoreRequests, Ibarrier, comm, *request); #ifdef USE_REQUEST_LOG logRequestInfo(*request, IBARRIER_REQUEST); #endif @@ -169,6 +165,7 @@ USER_DEFINED_WRAPPER(int, Ibarrier, (MPI_Comm) comm, (MPI_Request *) request) DMTCP_PLUGIN_ENABLE_CKPT(); return retval; } +#endif // defined(MPICH) /******************************************************************************* * This version, MPI_Allreduce_reproducible, can be called from @@ -201,6 +198,7 @@ USER_DEFINED_WRAPPER(int, Ibarrier, (MPI_Comm) comm, (MPI_Request *) request) * non-deterministically. 
******************************************************************************/ +#if defined(MPICH) int MPI_Allreduce_reproducible(const void *sendbuf, void *recvbuf, @@ -226,8 +224,13 @@ MPI_Allreduce_reproducible(const void *sendbuf, MPI_Type_size(datatype, &type_size); JASSERT(count * comm_size * type_size <= MAX_ALL_SENDBUF_SIZE); +#ifndef EXAMPI JASSERT(sendbuf != FORTRAN_MPI_IN_PLACE && sendbuf != MPI_IN_PLACE) .Text("MANA: MPI_Allreduce_reproducible: MPI_IN_PLACE not yet supported."); +#else + JASSERT(sendbuf != MPI_IN_PLACE) + .Text("MANA: MPI_Allreduce_reproducible: MPI_IN_PLACE not yet supported."); +#endif // Gather the operands from all ranks in the comm MPI_Gather(sendbuf, count, datatype, tmpbuf, count, datatype, 0, comm); @@ -247,20 +250,28 @@ MPI_Allreduce_reproducible(const void *sendbuf, return rc; } +#endif // defined(MPICH) USER_DEFINED_WRAPPER(int, Allreduce, (const void *) sendbuf, (void *) recvbuf, (int) count, (MPI_Datatype) datatype, (MPI_Op) op, (MPI_Comm) comm) { +#if defined(MPICH) char *s = getenv("MANA_USE_ALLREDUCE_REPRODUCIBLE"); int use_allreduce_reproducible = (s != NULL) ? atoi(s) : 0; +#endif // defined(MPICH) bool passthrough = false; commit_begin(comm, passthrough); int retval; DMTCP_PLUGIN_DISABLE_CKPT(); + +#ifndef EXAMPI get_fortran_constants(); +#endif + +#if defined(MPICH) if (use_allreduce_reproducible) retval = MPI_Allreduce_reproducible(sendbuf, recvbuf, count, datatype, op, comm, 0); @@ -269,14 +280,32 @@ USER_DEFINED_WRAPPER(int, Allreduce, MPI_Datatype realType = VIRTUAL_TO_REAL_TYPE(datatype); MPI_Op realOp = VIRTUAL_TO_REAL_OP(op); // FIXME: Ideally, check FORTRAN_MPI_IN_PLACE only in the Fortran wrapper. 
+#ifndef EXAMPI if (sendbuf == FORTRAN_MPI_IN_PLACE) { sendbuf = MPI_IN_PLACE; } +#endif JUMP_TO_LOWER_HALF(lh_info.fsaddr); retval = NEXT_FUNC(Allreduce)(sendbuf, recvbuf, count, realType, realOp, realComm); RETURN_TO_UPPER_HALF(); } +#else // defined(MPICH) + MPI_Comm realComm = VIRTUAL_TO_REAL_COMM(comm); + MPI_Datatype realType = VIRTUAL_TO_REAL_TYPE(datatype); + MPI_Op realOp = VIRTUAL_TO_REAL_OP(op); + // FIXME: Ideally, check FORTRAN_MPI_IN_PLACE only in the Fortran wrapper. +#ifndef EXAMPI + if (sendbuf == FORTRAN_MPI_IN_PLACE) { + sendbuf = MPI_IN_PLACE; + } +#endif + JUMP_TO_LOWER_HALF(lh_info.fsaddr); + retval = + NEXT_FUNC(Allreduce)(sendbuf, recvbuf, count, realType, realOp, realComm); + RETURN_TO_UPPER_HALF(); + +#endif // defined(MPICH) DMTCP_PLUGIN_ENABLE_CKPT(); commit_finish(comm, passthrough); return retval; @@ -295,9 +324,11 @@ USER_DEFINED_WRAPPER(int, Reduce, MPI_Datatype realType = VIRTUAL_TO_REAL_TYPE(datatype); MPI_Op realOp = VIRTUAL_TO_REAL_OP(op); // FIXME: Ideally, check FORTRAN_MPI_IN_PLACE only in the Fortran wrapper. +#ifndef EXAMPI if (sendbuf == FORTRAN_MPI_IN_PLACE) { sendbuf = MPI_IN_PLACE; } +#endif JUMP_TO_LOWER_HALF(lh_info.fsaddr); retval = NEXT_FUNC(Reduce)(sendbuf, recvbuf, count, realType, realOp, root, realComm); @@ -318,18 +349,18 @@ USER_DEFINED_WRAPPER(int, Ireduce, MPI_Datatype realType = VIRTUAL_TO_REAL_TYPE(datatype); MPI_Op realOp = VIRTUAL_TO_REAL_OP(op); // FIXME: Ideally, check FORTRAN_MPI_IN_PLACE only in the Fortran wrapper. 
+#ifndef EXAMPI if (sendbuf == FORTRAN_MPI_IN_PLACE) { sendbuf = MPI_IN_PLACE; } +#endif JUMP_TO_LOWER_HALF(lh_info.fsaddr); retval = NEXT_FUNC(Ireduce)(sendbuf, recvbuf, count, realType, realOp, root, realComm, request); RETURN_TO_UPPER_HALF(); - if (retval == MPI_SUCCESS && MPI_LOGGING()) { + if (retval == MPI_SUCCESS && mana_state != RESTART_REPLAY) { MPI_Request virtRequest = ADD_NEW_REQUEST(*request); *request = virtRequest; - LOG_CALL(restoreRequests, Ireduce, sendbuf, recvbuf, - count, datatype, op, root, comm, *request); #ifdef USE_REQUEST_LOG logRequestInfo(*request, IREDUCE_REQUEST); #endif @@ -338,6 +369,7 @@ USER_DEFINED_WRAPPER(int, Ireduce, return retval; } +#if defined(MPICH) USER_DEFINED_WRAPPER(int, Reduce_scatter, (const void *) sendbuf, (void *) recvbuf, (const int) recvcounts[], (MPI_Datatype) datatype, @@ -351,9 +383,11 @@ USER_DEFINED_WRAPPER(int, Reduce_scatter, MPI_Datatype realType = VIRTUAL_TO_REAL_TYPE(datatype); MPI_Op realOp = VIRTUAL_TO_REAL_OP(op); // FIXME: Ideally, check FORTRAN_MPI_IN_PLACE only in the Fortran wrapper. +#ifndef MANA_EXAMPI if (sendbuf == FORTRAN_MPI_IN_PLACE) { sendbuf = MPI_IN_PLACE; } +#endif JUMP_TO_LOWER_HALF(lh_info.fsaddr); retval = NEXT_FUNC(Reduce_scatter)(sendbuf, recvbuf, recvcounts, realType, realOp, realComm); @@ -362,6 +396,7 @@ USER_DEFINED_WRAPPER(int, Reduce_scatter, commit_finish(comm, passthrough); return retval; } +#endif // defined(MPICH) #endif // #ifndef MPI_COLLECTIVE_P2P // NOTE: This C++ function in needed by p2p_drain_send_recv.cpp @@ -382,9 +417,11 @@ MPI_Alltoall_internal(const void *sendbuf, int sendcount, MPI_Datatype realSendType = VIRTUAL_TO_REAL_TYPE(sendtype); MPI_Datatype realRecvType = VIRTUAL_TO_REAL_TYPE(recvtype); // FIXME: Ideally, check FORTRAN_MPI_IN_PLACE only in the Fortran wrapper. 
+#ifndef EXAMPI if (sendbuf == FORTRAN_MPI_IN_PLACE) { sendbuf = MPI_IN_PLACE; } +#endif JUMP_TO_LOWER_HALF(lh_info.fsaddr); retval = NEXT_FUNC(Alltoall)(sendbuf, sendcount, realSendType, recvbuf, recvcount, realRecvType, realComm); @@ -420,9 +457,11 @@ USER_DEFINED_WRAPPER(int, Alltoallv, commit_begin(comm, passthrough); int retval; // FIXME: Ideally, check FORTRAN_MPI_IN_PLACE only in the Fortran wrapper. +#ifndef EXAMPI if (sendbuf == FORTRAN_MPI_IN_PLACE) { sendbuf = MPI_IN_PLACE; } +#endif DMTCP_PLUGIN_DISABLE_CKPT(); MPI_Comm realComm = VIRTUAL_TO_REAL_COMM(comm); MPI_Datatype realSendType = VIRTUAL_TO_REAL_TYPE(sendtype); @@ -445,17 +484,21 @@ USER_DEFINED_WRAPPER(int, Gather, (const void *) sendbuf, (int) sendcount, commit_begin(comm, passthrough); int retval; // FIXME: Ideally, check FORTRAN_MPI_IN_PLACE only in the Fortran wrapper. +#ifndef EXAMPI if (sendbuf == FORTRAN_MPI_IN_PLACE) { sendbuf = MPI_IN_PLACE; } +#endif DMTCP_PLUGIN_DISABLE_CKPT(); MPI_Comm realComm = VIRTUAL_TO_REAL_COMM(comm); MPI_Datatype realSendType = VIRTUAL_TO_REAL_TYPE(sendtype); MPI_Datatype realRecvType = VIRTUAL_TO_REAL_TYPE(recvtype); // FIXME: Ideally, check FORTRAN_MPI_IN_PLACE only in the Fortran wrapper. +#ifndef EXAMPI if (sendbuf == FORTRAN_MPI_IN_PLACE) { sendbuf = MPI_IN_PLACE; } +#endif JUMP_TO_LOWER_HALF(lh_info.fsaddr); retval = NEXT_FUNC(Gather)(sendbuf, sendcount, realSendType, recvbuf, recvcount, realRecvType, @@ -475,9 +518,11 @@ USER_DEFINED_WRAPPER(int, Gatherv, (const void *) sendbuf, (int) sendcount, commit_begin(comm, passthrough); int retval; // FIXME: Ideally, check FORTRAN_MPI_IN_PLACE only in the Fortran wrapper. 
+#ifndef EXAMPI if (sendbuf == FORTRAN_MPI_IN_PLACE) { sendbuf = MPI_IN_PLACE; } +#endif DMTCP_PLUGIN_DISABLE_CKPT(); MPI_Comm realComm = VIRTUAL_TO_REAL_COMM(comm); MPI_Datatype realSendType = VIRTUAL_TO_REAL_TYPE(sendtype); @@ -500,9 +545,11 @@ USER_DEFINED_WRAPPER(int, Scatter, (const void *) sendbuf, (int) sendcount, commit_begin(comm, passthrough); int retval; // FIXME: Ideally, check FORTRAN_MPI_IN_PLACE only in the Fortran wrapper. +#ifndef EXAMPI if (recvbuf == FORTRAN_MPI_IN_PLACE) { recvbuf = MPI_IN_PLACE; } +#endif DMTCP_PLUGIN_DISABLE_CKPT(); MPI_Comm realComm = VIRTUAL_TO_REAL_COMM(comm); MPI_Datatype realSendType = VIRTUAL_TO_REAL_TYPE(sendtype); @@ -526,9 +573,11 @@ USER_DEFINED_WRAPPER(int, Scatterv, (const void *) sendbuf, commit_begin(comm, passthrough); int retval; // FIXME: Ideally, check FORTRAN_MPI_IN_PLACE only in the Fortran wrapper. +#ifndef EXAMPI if (recvbuf == FORTRAN_MPI_IN_PLACE) { recvbuf = MPI_IN_PLACE; } +#endif DMTCP_PLUGIN_DISABLE_CKPT(); MPI_Comm realComm = VIRTUAL_TO_REAL_COMM(comm); MPI_Datatype realSendType = VIRTUAL_TO_REAL_TYPE(sendtype); @@ -551,9 +600,11 @@ USER_DEFINED_WRAPPER(int, Allgather, (const void *) sendbuf, (int) sendcount, commit_begin(comm, passthrough); int retval; // FIXME: Ideally, check FORTRAN_MPI_IN_PLACE only in the Fortran wrapper. +#ifndef EXAMPI if (sendbuf == FORTRAN_MPI_IN_PLACE) { sendbuf = MPI_IN_PLACE; } +#endif DMTCP_PLUGIN_DISABLE_CKPT(); MPI_Comm realComm = VIRTUAL_TO_REAL_COMM(comm); MPI_Datatype realSendType = VIRTUAL_TO_REAL_TYPE(sendtype); @@ -577,9 +628,11 @@ USER_DEFINED_WRAPPER(int, Allgatherv, (const void *) sendbuf, (int) sendcount, commit_begin(comm, passthrough); int retval; // FIXME: Ideally, check FORTRAN_MPI_IN_PLACE only in the Fortran wrapper. 
+#ifndef EXAMPI if (sendbuf == FORTRAN_MPI_IN_PLACE) { sendbuf = MPI_IN_PLACE; } +#endif DMTCP_PLUGIN_DISABLE_CKPT(); MPI_Comm realComm = VIRTUAL_TO_REAL_COMM(comm); MPI_Datatype realSendType = VIRTUAL_TO_REAL_TYPE(sendtype); @@ -605,10 +658,12 @@ USER_DEFINED_WRAPPER(int, Scan, (const void *) sendbuf, (void *) recvbuf, MPI_Comm realComm = VIRTUAL_TO_REAL_COMM(comm); MPI_Datatype realType = VIRTUAL_TO_REAL_TYPE(datatype); // FIXME: Ideally, check FORTRAN_MPI_IN_PLACE only in the Fortran wrapper. +#ifndef EXAMPI if (sendbuf == FORTRAN_MPI_IN_PLACE) { sendbuf = MPI_IN_PLACE; } - MPI_Op realOp = VIRTUAL_TO_REAL_TYPE(op); +#endif + MPI_Op realOp = VIRTUAL_TO_REAL_OP(op); JUMP_TO_LOWER_HALF(lh_info.fsaddr); retval = NEXT_FUNC(Scan)(sendbuf, recvbuf, count, realType, realOp, realComm); @@ -631,12 +686,11 @@ USER_DEFINED_WRAPPER(int, Comm_split, (MPI_Comm) comm, (int) color, (int) key, JUMP_TO_LOWER_HALF(lh_info.fsaddr); retval = NEXT_FUNC(Comm_split)(realComm, color, key, newcomm); RETURN_TO_UPPER_HALF(); - if (retval == MPI_SUCCESS && MPI_LOGGING()) { + if (retval == MPI_SUCCESS && mana_state != RESTART_REPLAY) { MPI_Comm virtComm = ADD_NEW_COMM(*newcomm); - VirtualGlobalCommId::instance().createGlobalId(virtComm); + grant_ggid(virtComm); *newcomm = virtComm; active_comms.insert(virtComm); - LOG_CALL(restoreComms, Comm_split, comm, color, key, *newcomm); } DMTCP_PLUGIN_ENABLE_CKPT(); commit_finish(comm, passthrough); @@ -653,12 +707,11 @@ USER_DEFINED_WRAPPER(int, Comm_dup, (MPI_Comm) comm, (MPI_Comm *) newcomm) JUMP_TO_LOWER_HALF(lh_info.fsaddr); retval = NEXT_FUNC(Comm_dup)(realComm, newcomm); RETURN_TO_UPPER_HALF(); - if (retval == MPI_SUCCESS && MPI_LOGGING()) { + if (retval == MPI_SUCCESS && mana_state != RESTART_REPLAY) { MPI_Comm virtComm = ADD_NEW_COMM(*newcomm); - VirtualGlobalCommId::instance().createGlobalId(virtComm); + grant_ggid(virtComm); *newcomm = virtComm; active_comms.insert(virtComm); - LOG_CALL(restoreComms, Comm_dup, comm, *newcomm); } 
DMTCP_PLUGIN_ENABLE_CKPT(); commit_finish(comm, passthrough); @@ -672,7 +725,9 @@ PMPI_IMPL(int, MPI_Bcast, void *buffer, int count, MPI_Datatype datatype, PMPI_IMPL(int, MPI_Ibcast, void *buffer, int count, MPI_Datatype datatype, int root, MPI_Comm comm, MPI_Request *request) PMPI_IMPL(int, MPI_Barrier, MPI_Comm comm) +#if defined(MPICH) PMPI_IMPL(int, MPI_Ibarrier, MPI_Comm comm, MPI_Request * request) +#endif // defined(MPICH) PMPI_IMPL(int, MPI_Allreduce, const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op, MPI_Comm comm) PMPI_IMPL(int, MPI_Reduce, const void *sendbuf, void *recvbuf, int count, @@ -711,3 +766,5 @@ PMPI_IMPL(int, MPI_Scan, const void *sendbuf, void *recvbuf, int count, PMPI_IMPL(int, MPI_Comm_split, MPI_Comm comm, int color, int key, MPI_Comm *newcomm) PMPI_IMPL(int, MPI_Comm_dup, MPI_Comm comm, MPI_Comm *newcomm) + + diff --git a/mpi-proxy-split/mpi-wrappers/mpi_comm_wrappers.cpp b/mpi-proxy-split/mpi-wrappers/mpi_comm_wrappers.cpp index 9ae2a36d7..7ae6da072 100644 --- a/mpi-proxy-split/mpi-wrappers/mpi_comm_wrappers.cpp +++ b/mpi-proxy-split/mpi-wrappers/mpi_comm_wrappers.cpp @@ -34,12 +34,10 @@ #include "protectedfds.h" #include "mpi_nextfunc.h" -#include "record-replay.h" #include "virtual-ids.h" #include "seq_num.h" #include "p2p_drain_send_recv.h" -using namespace dmtcp_mpi; // TODO // - validate operation status (right now we assume them to be successful by @@ -60,6 +58,7 @@ using namespace dmtcp_mpi; // multiple communicators can use the same attribute key for different // attributes. 
Thus, we use a structure like the following: // [keyval -> {[communicator -> attribute], extra_state, copy_fn, delete_fn}] +#if defined(MPICH) struct KeyvalTuple { KeyvalTuple () = default; @@ -76,9 +75,12 @@ struct KeyvalTuple { MPI_Comm_delete_attr_function *_deleteFn; std::unordered_map _attributeMap; }; +#endif // defined(MPICH) static std::vector keyvalVec; +#if defined(MPICH) static std::unordered_map tupleMap; +#endif // defined(MPICH) // The following are prvalues. So we need an address to point them to. // On Cori, this is 2^22 - 1, but it just needs to be greater than 2^15 - 1. @@ -126,19 +128,11 @@ USER_DEFINED_WRAPPER(int, Comm_create, (MPI_Comm) comm, (MPI_Group) group, JUMP_TO_LOWER_HALF(lh_info.fsaddr); retval = NEXT_FUNC(Comm_create)(realComm, realGroup, newcomm); RETURN_TO_UPPER_HALF(); - if (retval == MPI_SUCCESS && MPI_LOGGING()) { + if (retval == MPI_SUCCESS && mana_state != RESTART_REPLAY) { MPI_Comm virtComm = ADD_NEW_COMM(*newcomm); - unsigned int gid = VirtualGlobalCommId::instance() - .createGlobalId(virtComm); + grant_ggid(virtComm); *newcomm = virtComm; active_comms.insert(virtComm); - std::map::iterator it = - seq_num.find(gid); - if (it == seq_num.end()) { - seq_num[gid] = 0; - target[gid] = 0; - } - LOG_CALL(restoreComms, Comm_create, comm, group, virtComm); } DMTCP_PLUGIN_ENABLE_CKPT(); return retval; @@ -190,6 +184,7 @@ USER_DEFINED_WRAPPER(int, Comm_free, (MPI_Comm *) comm) // attribute, but since our structure here is a bit different, we check, for // each key value, if a communicator/attribute value pairing exists, and if // it does then we call the callback function. 
+#if defined(MPICH) for (auto &tuplePair : tupleMap) { KeyvalTuple *tuple = &tuplePair.second; std::unordered_map *attributeMap = &tuple->_attributeMap; @@ -203,26 +198,24 @@ USER_DEFINED_WRAPPER(int, Comm_free, (MPI_Comm *) comm) attributeMap->erase(*comm); } } +#endif // defined(MPICH) DMTCP_PLUGIN_DISABLE_CKPT(); int retval = MPI_Comm_free_internal(comm); - if (retval == MPI_SUCCESS && MPI_LOGGING()) { + if (retval == MPI_SUCCESS && mana_state != RESTART_REPLAY) { // NOTE: We cannot remove the old comm from the map, since // we'll need to replay this call to reconstruct any other comms that // might have been created using this comm. // - // realComm = REMOVE_OLD_COMM(*comm); - // CLEAR_COMM_LOGS(*comm); + // + // FIXME: Now, we remove it. O(1) decode-recode changes this. + REMOVE_OLD_COMM(*comm); active_comms.erase(*comm); - unsigned int gid = VirtualGlobalCommId::instance().getGlobalId(*comm); -#if 0 - seq_num.erase(gid); -#endif - LOG_CALL(restoreComms, Comm_free, *comm); } DMTCP_PLUGIN_ENABLE_CKPT(); return retval; } +#if defined(MPICH) USER_DEFINED_WRAPPER(int, Comm_get_attr, (MPI_Comm) comm, (int) comm_keyval, (void *) attribute_val, (int *) flag) { @@ -263,7 +256,9 @@ USER_DEFINED_WRAPPER(int, Comm_get_attr, (MPI_Comm) comm, } return retval; } +#endif // defined(MPICH) +#if defined(MPICH) USER_DEFINED_WRAPPER(int, Comm_set_attr, (MPI_Comm) comm, (int) comm_keyval, (void *) attribute_val) { @@ -290,7 +285,9 @@ USER_DEFINED_WRAPPER(int, Comm_set_attr, (MPI_Comm) comm, attributeMap->emplace(comm, attribute_val); return retval; } +#endif // defined(MPICH) +#if defined(MPICH) USER_DEFINED_WRAPPER(int, Comm_delete_attr, (MPI_Comm) comm, (int) comm_keyval) { int retval = MPI_SUCCESS; @@ -314,6 +311,7 @@ USER_DEFINED_WRAPPER(int, Comm_delete_attr, (MPI_Comm) comm, (int) comm_keyval) } return retval; } +#endif // defined(MPICH) USER_DEFINED_WRAPPER(int, Comm_set_errhandler, (MPI_Comm) comm, (MPI_Errhandler) errhandler) @@ -324,8 +322,7 @@ USER_DEFINED_WRAPPER(int, 
Comm_set_errhandler, JUMP_TO_LOWER_HALF(lh_info.fsaddr); retval = NEXT_FUNC(Comm_set_errhandler)(realComm, errhandler); RETURN_TO_UPPER_HALF(); - if (retval == MPI_SUCCESS && MPI_LOGGING()) { - LOG_CALL(restoreComms, Comm_set_errhandler, comm, errhandler); + if (retval == MPI_SUCCESS && mana_state != RESTART_REPLAY) { } DMTCP_PLUGIN_ENABLE_CKPT(); return retval; @@ -353,18 +350,17 @@ USER_DEFINED_WRAPPER(int, Comm_split_type, (MPI_Comm) comm, (int) split_type, JUMP_TO_LOWER_HALF(lh_info.fsaddr); retval = NEXT_FUNC(Comm_split_type)(realComm, split_type, key, inf, newcomm); RETURN_TO_UPPER_HALF(); - if (retval == MPI_SUCCESS && MPI_LOGGING()) { + if (retval == MPI_SUCCESS && mana_state != RESTART_REPLAY) { MPI_Comm virtComm = ADD_NEW_COMM(*newcomm); - VirtualGlobalCommId::instance().createGlobalId(virtComm); + grant_ggid(virtComm); *newcomm = virtComm; active_comms.insert(virtComm); - LOG_CALL(restoreComms, Comm_split_type, comm, - split_type, key, inf, virtComm); } DMTCP_PLUGIN_ENABLE_CKPT(); return retval; } +#if defined(MPICH) USER_DEFINED_WRAPPER(int, Attr_get, (MPI_Comm) comm, (int) keyval, (void*) attribute_val, (int*) flag) { @@ -380,7 +376,9 @@ USER_DEFINED_WRAPPER(int, Attr_get, (MPI_Comm) comm, (int) keyval, DMTCP_PLUGIN_ENABLE_CKPT(); return retval; } +#endif // defined(MPICH) +#if defined(MPICH) USER_DEFINED_WRAPPER(int, Attr_delete, (MPI_Comm) comm, (int) keyval) { @@ -393,13 +391,14 @@ USER_DEFINED_WRAPPER(int, Attr_delete, (MPI_Comm) comm, (int) keyval) JUMP_TO_LOWER_HALF(lh_info.fsaddr); retval = NEXT_FUNC(Attr_delete)(realComm, realCommKeyval); RETURN_TO_UPPER_HALF(); - if (retval == MPI_SUCCESS && MPI_LOGGING()) { - LOG_CALL(restoreComms, Attr_delete, comm, keyval); + if (retval == MPI_SUCCESS && mana_state != RESTART_REPLAY) { } DMTCP_PLUGIN_ENABLE_CKPT(); return retval; } +#endif // defined(MPICH) +#if defined(MPICH) USER_DEFINED_WRAPPER(int, Attr_put, (MPI_Comm) comm, (int) keyval, (void*) attribute_val) { @@ -412,13 +411,14 @@ 
USER_DEFINED_WRAPPER(int, Attr_put, (MPI_Comm) comm, JUMP_TO_LOWER_HALF(lh_info.fsaddr); retval = NEXT_FUNC(Attr_put)(realComm, realCommKeyval, attribute_val); RETURN_TO_UPPER_HALF(); - if (retval == MPI_SUCCESS && MPI_LOGGING()) { - LOG_CALL(restoreComms, Attr_put, comm, keyval, attribute_val); + if (retval == MPI_SUCCESS && mana_state != RESTART_REPLAY) { } DMTCP_PLUGIN_ENABLE_CKPT(); return retval; } +#endif // defined(MPICH) +#if defined(MPICH) USER_DEFINED_WRAPPER(int, Comm_create_keyval, (MPI_Comm_copy_attr_function *) comm_copy_attr_fn, (MPI_Comm_delete_attr_function *) comm_delete_attr_fn, @@ -450,7 +450,9 @@ USER_DEFINED_WRAPPER(int, Comm_create_keyval, std::unordered_map())); return retval; } +#endif // defined(MPICH) +#if defined(MPICH) USER_DEFINED_WRAPPER(int, Comm_free_keyval, (int *) comm_keyval) { int retval = MPI_SUCCESS; @@ -463,6 +465,7 @@ USER_DEFINED_WRAPPER(int, Comm_free_keyval, (int *) comm_keyval) } return retval; } +#endif // defined(MPICH) int MPI_Comm_create_group_internal(MPI_Comm comm, MPI_Group group, int tag, @@ -484,12 +487,11 @@ USER_DEFINED_WRAPPER(int, Comm_create_group, (MPI_Comm) comm, { std::function realBarrierCb = [=]() { int retval = MPI_Comm_create_group_internal(comm, group, tag, newcomm); - if (retval == MPI_SUCCESS && MPI_LOGGING()) { + if (retval == MPI_SUCCESS && mana_state != RESTART_REPLAY) { MPI_Comm virtComm = ADD_NEW_COMM(*newcomm); - VirtualGlobalCommId::instance().createGlobalId(virtComm); + grant_ggid(virtComm); *newcomm = virtComm; active_comms.insert(virtComm); - LOG_CALL(restoreComms, Comm_create_group, comm, group, tag, virtComm); } return retval; }; @@ -503,15 +505,18 @@ PMPI_IMPL(int, MPI_Comm_create, MPI_Comm comm, MPI_Group group, MPI_Comm *newcomm) PMPI_IMPL(int, MPI_Comm_compare, MPI_Comm comm1, MPI_Comm comm2, int *result) PMPI_IMPL(int, MPI_Comm_free, MPI_Comm *comm) +#if defined(MPICH) PMPI_IMPL(int, MPI_Comm_get_attr, MPI_Comm comm, int comm_keyval, void *attribute_val, int *flag) PMPI_IMPL(int, 
MPI_Comm_set_attr, MPI_Comm comm, int comm_keyval, void *attribute_val) +#endif // defined(MPICH) PMPI_IMPL(int, MPI_Comm_set_errhandler, MPI_Comm comm, MPI_Errhandler errhandler) PMPI_IMPL(int, MPI_Topo_test, MPI_Comm comm, int* status) PMPI_IMPL(int, MPI_Comm_split_type, MPI_Comm comm, int split_type, int key, MPI_Info info, MPI_Comm *newcomm) +#if defined(MPICH) PMPI_IMPL(int, MPI_Attr_get, MPI_Comm comm, int keyval, void *attribute_val, int *flag) PMPI_IMPL(int, MPI_Attr_delete, MPI_Comm comm, int keyval) @@ -521,5 +526,6 @@ PMPI_IMPL(int, MPI_Comm_create_keyval, MPI_Comm_delete_attr_function * comm_delete_attr_fn, int *comm_keyval, void *extra_state) PMPI_IMPL(int, MPI_Comm_free_keyval, int *comm_keyval) +#endif // defined(MPICH) PMPI_IMPL(int, MPI_Comm_create_group, MPI_Comm comm, MPI_Group group, int tag, MPI_Comm *newcomm) diff --git a/mpi-proxy-split/mpi-wrappers/mpi_error_wrappers.cpp b/mpi-proxy-split/mpi-wrappers/mpi_error_wrappers.cpp index 6d23cd29b..1b2405e70 100644 --- a/mpi-proxy-split/mpi-wrappers/mpi_error_wrappers.cpp +++ b/mpi-proxy-split/mpi-wrappers/mpi_error_wrappers.cpp @@ -28,7 +28,6 @@ #include "protectedfds.h" #include "mpi_nextfunc.h" -#include "record-replay.h" #include "virtual-ids.h" DEFINE_FNC(int, Error_class, (int) errorcode, (int *) errorclass); diff --git a/mpi-proxy-split/mpi-wrappers/mpi_file_wrappers.cpp b/mpi-proxy-split/mpi-wrappers/mpi_file_wrappers.cpp index f3ff2abac..35175db30 100644 --- a/mpi-proxy-split/mpi-wrappers/mpi_file_wrappers.cpp +++ b/mpi-proxy-split/mpi-wrappers/mpi_file_wrappers.cpp @@ -29,11 +29,9 @@ #include "protectedfds.h" #include "mpi_nextfunc.h" -#include "record-replay.h" #include "virtual-ids.h" #include "seq_num.h" -using namespace dmtcp_mpi; std::unordered_map g_params_map; @@ -150,6 +148,10 @@ USER_DEFINED_WRAPPER(int, File_set_view, (MPI_File) fh, (MPI_Offset) disp, return retval; } +// FIXME: This is the only wrapper function in the entirety of MANA that uses +// REAL_TO_VIRTUAL. 
O(1) real-to-virtual translation was a goal of the vid +// refactoring, but we need to think more carefully about this. Maybe +// REAL_TO_VIRTUAL could be eliminated, if it's only used once. USER_DEFINED_WRAPPER(int, File_get_view, (MPI_File) fh, (MPI_Offset*) disp, (MPI_Datatype*) etype, (MPI_Datatype*) filetype, (char*) datarep) @@ -163,8 +165,8 @@ USER_DEFINED_WRAPPER(int, File_get_view, (MPI_File) fh, (MPI_Offset*) disp, retval = NEXT_FUNC(File_get_view)(realFile, disp, &realEtype, &realFtype, datarep); RETURN_TO_UPPER_HALF(); - *etype = REAL_TO_VIRTUAL_TYPE(realEtype); - *filetype = REAL_TO_VIRTUAL_TYPE(realFtype); + // *etype = REAL_TO_VIRTUAL_TYPE(realEtype); + // *filetype = REAL_TO_VIRTUAL_TYPE(realFtype); DMTCP_PLUGIN_ENABLE_CKPT(); return retval; } @@ -442,4 +444,4 @@ PMPI_IMPL(int, MPI_File_set_errhandler, MPI_File file, MPI_Errhandler errhandler) PMPI_IMPL(int, MPI_File_get_errhandler, MPI_File file, MPI_Errhandler *errhandler) -PMPI_IMPL(int, MPI_File_delete, const char *filename, MPI_Info info) \ No newline at end of file +PMPI_IMPL(int, MPI_File_delete, const char *filename, MPI_Info info) diff --git a/mpi-proxy-split/mpi-wrappers/mpi_fortran_wrappers.txt b/mpi-proxy-split/mpi-wrappers/mpi_fortran_wrappers_mpich.txt similarity index 100% rename from mpi-proxy-split/mpi-wrappers/mpi_fortran_wrappers.txt rename to mpi-proxy-split/mpi-wrappers/mpi_fortran_wrappers_mpich.txt diff --git a/mpi-proxy-split/mpi-wrappers/mpi_fortran_wrappers_openmpi.txt b/mpi-proxy-split/mpi-wrappers/mpi_fortran_wrappers_openmpi.txt new file mode 100644 index 000000000..841eee847 --- /dev/null +++ b/mpi-proxy-split/mpi-wrappers/mpi_fortran_wrappers_openmpi.txt @@ -0,0 +1,127 @@ +#include <mpi.h>; + +#include "dmtcp.h"; +#include "jassert.h"; + +int MPI_Finalize(); +int MPI_Finalized(int* flag); +int MPI_Get_processor_name(char* name, int* resultlen); +double MPI_Wtime(); +int MPI_Initialized(int* flag); +int MPI_Get_count(const MPI_Status* status, MPI_Datatype datatype, int*
count); + +int MPI_Bcast(void* buffer, int count, MPI_Datatype datatype, int root, MPI_Comm comm); +int MPI_Barrier(MPI_Comm comm); +int MPI_Allreduce(const void* sendbuf, void* recvbuf, int count, MPI_Datatype datatype, MPI_Op op, MPI_Comm comm); +int MPI_Reduce(const void* sendbuf, void* recvbuf, int count, MPI_Datatype datatype, MPI_Op op, int root, MPI_Comm comm); +int MPI_Reduce_local(const void* inbuf, void* inoutbuf, int count, MPI_Datatype datatype, MPI_Op op); +int MPI_Reduce_scatter(const void* sendbuf, void* recvbuf, const int* recvcounts, MPI_Datatype datatype, MPI_Op op, MPI_Comm comm); +int MPI_Alltoall(const void* sendbuf, int sendcount, MPI_Datatype sendtype, void* recvbuf, int recvcount, MPI_Datatype recvtype, MPI_Comm comm); +int MPI_Alltoallv(const void* sendbuf, const int* sendcounts, const int* sdispls, MPI_Datatype sendtype, void* recvbuf, const int* recvcounts, const int* rdispls, MPI_Datatype recvtype, MPI_Comm comm); +int MPI_Allgather(const void* sendbuf, int sendcount, MPI_Datatype sendtype, void* recvbuf, int recvcount, MPI_Datatype recvtype, MPI_Comm comm); +int MPI_Allgatherv(const void* sendbuf, int sendcount, MPI_Datatype sendtype, void* recvbuf, const int* recvcount, const int* displs, MPI_Datatype recvtype, MPI_Comm comm); +int MPI_Gather(const void* sendbuf, int sendcount, MPI_Datatype sendtype, void* recvbuf, int recvcount, MPI_Datatype recvtype, int root, MPI_Comm comm); +int MPI_Gatherv(const void* sendbuf, int sendcount, MPI_Datatype sendtype, void* recvbuf, const int* recvcounts, const int* displs, MPI_Datatype recvtype, int root, MPI_Comm comm); +int MPI_Scatter(const void* sendbuf, int sendcount, MPI_Datatype sendtype, void* recvbuf, int recvcount, MPI_Datatype recvtype, int root, MPI_Comm comm); +int MPI_Scatterv(const void* sendbuf, const int* sendcounts, const int* displs, MPI_Datatype sendtype, void* recvbuf, int recvcount, MPI_Datatype recvtype, int root, MPI_Comm comm); +int MPI_Scan(const void* sendbuf, void* 
recvbuf, int count, MPI_Datatype datatype, MPI_Op op, MPI_Comm comm); + +int MPI_Comm_size(MPI_Comm comm, int* world_size); +int MPI_Comm_rank(MPI_Comm comm, int* world_rank); +int MPI_Abort(MPI_Comm comm, int errorcode); +int MPI_Comm_split(MPI_Comm comm, int color, int key, MPI_Comm* newcomm); +int MPI_Comm_dup(MPI_Comm comm, MPI_Comm* newcomm); +int MPI_Comm_create(MPI_Comm comm, MPI_Group group, MPI_Comm* newcomm); +int MPI_Comm_compare(MPI_Comm comm1, MPI_Comm comm2, int* result); +int MPI_Comm_free(MPI_Comm* comm); +int MPI_Comm_set_errhandler(MPI_Comm comm, MPI_Errhandler errhandler); +int MPI_Topo_test(MPI_Comm comm, int* status); +int MPI_Comm_split_type(MPI_Comm comm, int split_type, int key, MPI_Info info, MPI_Comm* newcomm); +int MPI_Comm_create_keyval(MPI_Comm_copy_attr_function* comm_copy_attr_fn, MPI_Comm_delete_attr_function* comm_delete_attr_fn, int* comm_keyval, void* extra_state); +int MPI_Comm_free_keyval(int* comm_keyval); +int MPI_Comm_create_group(MPI_Comm comm, MPI_Group group, int tag, MPI_Comm* newcomm); + +int MPI_Cart_coords(MPI_Comm comm, int rank, int maxdims, int* coords); +int MPI_Cart_create(MPI_Comm old_comm, int ndims, const int* dims, const int* periods, int reorder, MPI_Comm* comm_cart); +int MPI_Cart_get(MPI_Comm comm, int maxdims, int* dims, int* periods, int* coords); +int MPI_Cart_map(MPI_Comm comm, int ndims, const int* dims, const int* periods, int* newrank); +int MPI_Cart_rank(MPI_Comm comm, const int* coords, int* rank); +int MPI_Cart_shift(MPI_Comm comm, int direction, int disp, int* rank_source, int* rank_dest); +int MPI_Cart_sub(MPI_Comm comm, const int* remain_dims, MPI_Comm* new_comm); +int MPI_Cartdim_get(MPI_Comm comm, int* ndims); +int MPI_Dims_create(int nnodes, int ndims, int* dims); + +int MPI_Test(MPI_Request* request, int* flag, MPI_Status* status); +int MPI_Wait(MPI_Request* request, MPI_Status* status); +int MPI_Iprobe(int source, int tag, MPI_Comm comm, int* flag, MPI_Status* status); +int MPI_Probe(int 
source, int tag, MPI_Comm comm, MPI_Status* status); +int MPI_Waitall(int count, MPI_Request* array_of_requests, MPI_Status* array_of_statuses); +int MPI_Waitany(int count, MPI_Request* array_of_requests, int* index, MPI_Status* status); +int MPI_Testall(int count, MPI_Request* array_of_requests, int* flag, MPI_Status* array_of_statuses); +int MPI_Testany(int count, MPI_Request* array_of_requests, int* index, int* flag, MPI_Status* status); + +int MPI_Comm_group(MPI_Comm comm, MPI_Group* group); +int MPI_Group_size(MPI_Group group, int* size); +int MPI_Group_free(MPI_Group* group); +int MPI_Group_compare(MPI_Group group1, MPI_Group group2, int* result); +int MPI_Group_rank(MPI_Group group, int* rank); +int MPI_Group_incl(MPI_Group group, int n, const int* ranks, MPI_Group* newgroup); + +int MPI_Type_size(MPI_Datatype datatype, int* size); +int MPI_Type_commit(MPI_Datatype* type); +int MPI_Type_contiguous(int count, MPI_Datatype oldtype, MPI_Datatype* newtype); +int MPI_Type_free(MPI_Datatype* type); +int MPI_Type_vector(int count, int blocklength, int stride, MPI_Datatype oldtype, MPI_Datatype* newtype); +int MPI_Type_create_struct(int count, const int* array_of_blocklengths, const MPI_Aint* array_of_displacements, MPI_Datatype* array_of_types, MPI_Datatype* newtype); +int MPI_Type_indexed(int count, const int* array_of_blocklengths, const int* array_of_displacements, MPI_Datatype oldtype, MPI_Datatype* newtype); +int MPI_Type_get_extent(MPI_Datatype type, MPI_Aint* lb, MPI_Aint* extent); +int MPI_Type_create_hvector(int count, int blocklength, MPI_Aint stride, MPI_Datatype oldtype, MPI_Datatype* newtype); +int MPI_Type_create_hindexed(int count, const int* array_of_blocklengths, const MPI_Aint* array_of_displacements, MPI_Datatype oldtype, MPI_Datatype* newtype); +int MPI_Type_create_hindexed_block(int count, int blocklength, const MPI_Aint* array_of_displacements, MPI_Datatype oldtype, MPI_Datatype* newtype); +int MPI_Type_create_resized(MPI_Datatype oldtype, 
MPI_Aint lb, MPI_Aint extent, MPI_Datatype* newtype); +int MPI_Type_dup(MPI_Datatype type, MPI_Datatype* newtype); +int MPI_Pack_size(int incount, MPI_Datatype datatype, MPI_Comm comm, int* size); +int MPI_Pack(const void* inbuf, int incount, MPI_Datatype datatype, void* outbuf, int outsize, int* position, MPI_Comm comm); + +int MPI_Op_create(MPI_User_function* user_fn, int commute, MPI_Op* op); +int MPI_Op_free(MPI_Op* op); + +int MPI_Send(const void* buf, int count, MPI_Datatype datatype, int dest, int tag, MPI_Comm comm); +int MPI_Isend(const void* buf, int count, MPI_Datatype datatype, int dest, int tag, MPI_Comm comm, MPI_Request* request); +int MPI_Recv(void* buf, int count, MPI_Datatype datatype, int source, int tag, MPI_Comm comm, MPI_Status* status); +int MPI_Irecv(void* buf, int count, MPI_Datatype datatype, int source, int tag, MPI_Comm comm, MPI_Request* request); +int MPI_Sendrecv(const void* sendbuf, int sendcount, MPI_Datatype sendtype, int dest, int sendtag, void* recvbuf, int recvcount, MPI_Datatype recvtype, int source, int recvtag, MPI_Comm comm, MPI_Status* status); +int MPI_Sendrecv_replace(void* buf, int count, MPI_Datatype datatype, int dest, int sendtag, int source, int recvtag, MPI_Comm comm, MPI_Status* status); +int MPI_Rsend(const void* ibuf, int count, MPI_Datatype datatype, int dest, int tag, MPI_Comm comm); +int MPI_Ibarrier(MPI_Comm comm, MPI_Request* request); +int MPI_Ibcast(void* buffer, int count, MPI_Datatype datatype, int root, MPI_Comm comm, MPI_Request* request); +int MPI_Ireduce(const void* sendbuf, void* recvbuf, int count, MPI_Datatype datatype, MPI_Op op, int root, MPI_Comm comm, MPI_Request* request); + +int MPI_Group_translate_ranks(MPI_Group group1, int n, const int* ranks1, MPI_Group group2, int* ranks2); +int MPI_Alloc_mem(MPI_Aint size, MPI_Info info, void* baseptr); +int MPI_Free_mem(void* base); + +int MPI_Error_string(int errorcode, char* string, int* resultlen); + +int MPI_File_open(MPI_Comm comm, const char* 
filename, int amode, MPI_Info info, MPI_File* fh); +int MPI_File_get_atomicity(MPI_File fh, int* flag); +int MPI_File_set_atomicity(MPI_File fh, int flag); +int MPI_File_set_size(MPI_File fh, MPI_Offset size); +int MPI_File_get_size(MPI_File fh, MPI_Offset* size); +int MPI_File_set_view(MPI_File fh, MPI_Offset disp, MPI_Datatype etype, MPI_Datatype filetype, const char* datarep, MPI_Info info); +int MPI_File_get_view(MPI_File fh, MPI_Offset* disp, MPI_Datatype* etype, MPI_Datatype* filetype, char* datarep); +int MPI_File_read(MPI_File fh, void* buf, int count, MPI_Datatype datatype, MPI_Status* status); +int MPI_File_read_at(MPI_File fh, MPI_Offset offset, void* buf, int count, MPI_Datatype datatype, MPI_Status* status); +int MPI_File_read_at_all(MPI_File fh, MPI_Offset offset, void* buf, int count, MPI_Datatype datatype, MPI_Status* status); +int MPI_File_write(MPI_File fh, const void* buf, int count, MPI_Datatype datatype, MPI_Status* status); +int MPI_File_write_at(MPI_File fh, MPI_Offset offset, const void* buf, int count, MPI_Datatype datatype, MPI_Status* status); +int MPI_File_write_at_all(MPI_File fh, MPI_Offset offset, const void* buf, int count, MPI_Datatype datatype, MPI_Status* status); +int MPI_File_sync(MPI_File fh); +int MPI_File_get_position(MPI_File fh, MPI_Offset* offset); +int MPI_File_seek(MPI_File fh, MPI_Offset offset, int whence); +int MPI_File_close(MPI_File* fh); +int MPI_File_set_errhandler(MPI_File fh, MPI_Errhandler errhandler); +int MPI_File_get_errhandler(MPI_File fh, MPI_Errhandler* errhandler); +int MPI_File_delete(const char* filename, MPI_Info info); +int MPI_Get_library_version(char* version, int* resultlen); +int MPI_Get_address(const void* location, MPI_Aint* address); diff --git a/mpi-proxy-split/mpi-wrappers/mpi_group_wrappers.cpp b/mpi-proxy-split/mpi-wrappers/mpi_group_wrappers.cpp index 2ceb0fcca..3cf868e73 100644 --- a/mpi-proxy-split/mpi-wrappers/mpi_group_wrappers.cpp +++ 
b/mpi-proxy-split/mpi-wrappers/mpi_group_wrappers.cpp @@ -28,11 +28,8 @@ #include "protectedfds.h" #include "mpi_nextfunc.h" -#include "record-replay.h" #include "virtual-ids.h" -using namespace dmtcp_mpi; - USER_DEFINED_WRAPPER(int, Comm_group, (MPI_Comm) comm, (MPI_Group *) group) { int retval; @@ -41,10 +38,9 @@ USER_DEFINED_WRAPPER(int, Comm_group, (MPI_Comm) comm, (MPI_Group *) group) JUMP_TO_LOWER_HALF(lh_info.fsaddr); retval = NEXT_FUNC(Comm_group)(realComm, group); RETURN_TO_UPPER_HALF(); - if (retval == MPI_SUCCESS && MPI_LOGGING()) { + if (retval == MPI_SUCCESS && mana_state != RESTART_REPLAY) { MPI_Group virtGroup = ADD_NEW_GROUP(*group); *group = virtGroup; - LOG_CALL(restoreGroups, Comm_group, comm, *group); } DMTCP_PLUGIN_ENABLE_CKPT(); return retval; @@ -96,14 +92,13 @@ USER_DEFINED_WRAPPER(int, Group_free, (MPI_Group *) group) { DMTCP_PLUGIN_DISABLE_CKPT(); int retval = MPI_Group_free_internal(group); - if (retval == MPI_SUCCESS && MPI_LOGGING()) { + if (retval == MPI_SUCCESS && mana_state != RESTART_REPLAY) { // NOTE: We cannot remove the old group, since we'll need // to replay this call to reconstruct any comms that might // have been created using this group. // - // realGroup = REMOVE_OLD_GROUP(*group); - // CLEAR_GROUP_LOGS(*group); - LOG_CALL(restoreGroups, Group_free, *group); + // FIXME: See comment in Comm_free wrapper. 
+ REMOVE_OLD_GROUP(*group); } DMTCP_PLUGIN_ENABLE_CKPT(); return retval; @@ -144,11 +139,10 @@ USER_DEFINED_WRAPPER(int, Group_incl, (MPI_Group) group, (int) n, JUMP_TO_LOWER_HALF(lh_info.fsaddr); retval = NEXT_FUNC(Group_incl)(realGroup, n, ranks, newgroup); RETURN_TO_UPPER_HALF(); - if (retval == MPI_SUCCESS && MPI_LOGGING()) { + if (retval == MPI_SUCCESS && mana_state != RESTART_REPLAY) { MPI_Group virtGroup = ADD_NEW_GROUP(*newgroup); *newgroup = virtGroup; - FncArg rs = CREATE_LOG_BUF(ranks, n * sizeof(int)); - LOG_CALL(restoreGroups, Group_incl, group, n, rs, *newgroup); + // FncArg rs = CREATE_LOG_BUF(ranks, n * sizeof(int)); } DMTCP_PLUGIN_ENABLE_CKPT(); return retval; diff --git a/mpi-proxy-split/mpi-wrappers/mpi_op_wrappers.cpp b/mpi-proxy-split/mpi-wrappers/mpi_op_wrappers.cpp index 540ca431c..b29dac79b 100644 --- a/mpi-proxy-split/mpi-wrappers/mpi_op_wrappers.cpp +++ b/mpi-proxy-split/mpi-wrappers/mpi_op_wrappers.cpp @@ -27,12 +27,8 @@ #include "jfilesystem.h" #include "protectedfds.h" #include "mpi_nextfunc.h" -#include "record-replay.h" #include "virtual-ids.h" -using namespace dmtcp_mpi; - - USER_DEFINED_WRAPPER(int, Op_create, (MPI_User_function *) user_fn, (int) commute, (MPI_Op *) op) @@ -42,10 +38,15 @@ USER_DEFINED_WRAPPER(int, Op_create, JUMP_TO_LOWER_HALF(lh_info.fsaddr); retval = NEXT_FUNC(Op_create)(user_fn, commute, op); RETURN_TO_UPPER_HALF(); - if (retval == MPI_SUCCESS && MPI_LOGGING()) { + if (retval == MPI_SUCCESS && mana_state != RESTART_REPLAY) { MPI_Op virtOp = ADD_NEW_OP(*op); + // FIXME HACK: Since MPI does not provide any functions To deserialize an + // operator, we get the data at creation time. + // + // FIXME: Do we also have to reconstruct the MPI_User_function? 
+ op_desc_t* desc = VIRTUAL_TO_DESC_OP(virtOp); + update_op_desc_t(desc, user_fn, commute); *op = virtOp; - LOG_CALL(restoreOps, Op_create, user_fn, commute, virtOp); } DMTCP_PLUGIN_ENABLE_CKPT(); return retval; @@ -62,18 +63,19 @@ USER_DEFINED_WRAPPER(int, Op_free, (MPI_Op*) op) JUMP_TO_LOWER_HALF(lh_info.fsaddr); retval = NEXT_FUNC(Op_free)(&realOp); RETURN_TO_UPPER_HALF(); - if (retval == MPI_SUCCESS && MPI_LOGGING()) { + if (retval == MPI_SUCCESS && mana_state != RESTART_REPLAY) { // NOTE: We cannot remove the old op, since we'll need // to replay this call to reconstruct any new op that might // have been created using this op. // - // realOp = REMOVE_OLD_OP(*op); - LOG_CALL(restoreOps, Op_free, *op); + // FIXME: See comment in Comm_free wrapper. + REMOVE_OLD_OP(*op); } DMTCP_PLUGIN_ENABLE_CKPT(); return retval; } +#if defined(MPICH) USER_DEFINED_WRAPPER(int, Reduce_local, (const void *) inbuf, (void *) inoutbuf, (int) count, (MPI_Datatype) datatype, (MPI_Op) op) @@ -89,10 +91,14 @@ USER_DEFINED_WRAPPER(int, Reduce_local, DMTCP_PLUGIN_ENABLE_CKPT(); return retval; } +#endif // defined(MPICH) + PMPI_IMPL(int, MPI_Op_create, MPI_User_function *user_fn, int commute, MPI_Op *op) PMPI_IMPL(int, MPI_Op_free, MPI_Op *op) +#if defined(MPICH) PMPI_IMPL(int, MPI_Reduce_local, const void *inbuf, void *inoutbuf, int count, MPI_Datatype datatype, MPI_Op op) +#endif // defined(MPICH) diff --git a/mpi-proxy-split/mpi-wrappers/mpi_p2p_wrappers.cpp b/mpi-proxy-split/mpi-wrappers/mpi_p2p_wrappers.cpp index f1a0c5ef8..4121a0d35 100644 --- a/mpi-proxy-split/mpi-wrappers/mpi_p2p_wrappers.cpp +++ b/mpi-proxy-split/mpi-wrappers/mpi_p2p_wrappers.cpp @@ -31,9 +31,11 @@ #include "protectedfds.h" #include "mpi_nextfunc.h" #include "virtual-ids.h" -#include "record-replay.h" // To support MANA_P2P_LOG and MANA_P2P_REPLAY: + +#if defined(MPICH) #include "p2p-deterministic.h" +#endif // defined(MPICH) extern int p2p_deterministic_skip_save_request; @@ -58,9 +60,15 @@ 
USER_DEFINED_WRAPPER(int, Send, if (retval != MPI_SUCCESS) { return retval; } + +#if defined(MPICH) p2p_deterministic_skip_save_request = 1; retval = MPI_Wait(&req, &st); p2p_deterministic_skip_save_request = 0; +#else + retval = MPI_Wait(&req, &st); +#endif // defined(MPICH) + #endif return retval; } @@ -149,7 +157,9 @@ USER_DEFINED_WRAPPER(int, Recv, if (retval != MPI_SUCCESS) { return retval; } +#if defined(MPICH) p2p_deterministic_skip_save_request = 0; +#endif // defined(MPICH) retval = MPI_Wait(&req, status); #endif // updateLocalRecvs(); @@ -210,8 +220,8 @@ USER_DEFINED_WRAPPER(int, Irecv, DMTCP_PLUGIN_ENABLE_CKPT(); return retval; } - LOG_PRE_Irecv(&status); - REPLAY_PRE_Irecv(count,datatype,source,tag,comm); + // LOG_PRE_Irecv(&status); + // REPLAY_PRE_Irecv(count,datatype,source,tag,comm); MPI_Comm realComm = VIRTUAL_TO_REAL_COMM(comm); MPI_Datatype realType = VIRTUAL_TO_REAL_TYPE(datatype); @@ -228,7 +238,7 @@ USER_DEFINED_WRAPPER(int, Irecv, logRequestInfo(*request, IRECV_REQUEST); #endif } - LOG_POST_Irecv(source,tag,comm,&status,request,buf); + // LOG_POST_Irecv(source,tag,comm,&status,request,buf); DMTCP_PLUGIN_ENABLE_CKPT(); return retval; } @@ -251,7 +261,11 @@ USER_DEFINED_WRAPPER(int, Sendrecv, (const void *) sendbuf, (int) sendcount, RETURN_TO_UPPER_HALF(); DMTCP_PLUGIN_ENABLE_CKPT(); #else + + #ifndef EXAMPI get_fortran_constants(); + #endif + MPI_Request reqs[2]; MPI_Status sts[2]; // FIXME: The send and receive need to be atomic @@ -268,9 +282,17 @@ USER_DEFINED_WRAPPER(int, Sendrecv, (const void *) sendbuf, (int) sendcount, retval = MPI_Waitall(2, reqs, sts); // Set status only when the status is neither MPI_STATUS_IGNORE nor // FORTRAN_MPI_STATUS_IGNORE + + #ifdef EXAMPI + if (status != MPI_STATUS_IGNORE) { + *status = sts[1]; + } + #else if (status != MPI_STATUS_IGNORE && status != FORTRAN_MPI_STATUS_IGNORE) { *status = sts[1]; } + #endif + if (retval == MPI_SUCCESS) { // updateLocalRecvs(); } @@ -310,9 +332,15 @@ USER_DEFINED_WRAPPER(int, 
Sendrecv_replace, (void *) buf, (int) count, memcpy(buf, tmpbuf, count * type_size); // Set status, free buffer, and return +#ifndef EXAMPI if (status != MPI_STATUS_IGNORE && status != FORTRAN_MPI_STATUS_IGNORE) { *status = sts[0]; } +#else + if (status != MPI_STATUS_IGNORE) { *status = sts[0]; } +#endif // ifndef EXAMPI free(tmpbuf); return retval; diff --git a/mpi-proxy-split/mpi-wrappers/mpi_request_wrappers.cpp b/mpi-proxy-split/mpi-wrappers/mpi_request_wrappers.cpp index bb683af51..2d438b248 100644 --- a/mpi-proxy-split/mpi-wrappers/mpi_request_wrappers.cpp +++ b/mpi-proxy-split/mpi-wrappers/mpi_request_wrappers.cpp @@ -28,16 +28,15 @@ #include "jfilesystem.h" #include "protectedfds.h" -#include "record-replay.h" #include "p2p_log_replay.h" #include "p2p_drain_send_recv.h" #include "mpi_plugin.h" #include "mpi_nextfunc.h" #include "virtual-ids.h" // To support MANA_P2P_LOG and MANA_P2P_REPLAY: -#include "p2p-deterministic.h" +// #include "p2p-deterministic.h" -extern int p2p_deterministic_skip_save_request; +// extern int p2p_deterministic_skip_save_request; int MPI_Test_internal(MPI_Request *request, int *flag, MPI_Status *status, bool isRealRequest) @@ -62,26 +61,32 @@ USER_DEFINED_WRAPPER(int, Test, (MPI_Request*) request, (int*) flag, (MPI_Status*) status) { int retval; - if (*request == MPI_REQUEST_NULL) { + if (*request == REAL_CONSTANT(REQUEST_NULL)) { // *request might be in read-only memory. So we can't overwrite it with - // MPI_REQUEST_NULL later. + // REAL_CONSTANT(REQUEST_NULL) later.
*flag = true; return MPI_SUCCESS; } - LOG_PRE_Test(status); + // LOG_PRE_Test(status); DMTCP_PLUGIN_DISABLE_CKPT(); MPI_Status statusBuffer; MPI_Status *statusPtr = status; - if (statusPtr == MPI_STATUS_IGNORE || +#ifdef EXAMPI + if (statusPtr == MPI_STATUS_IGNORE) { + statusPtr = &statusBuffer; + } +#else +if (statusPtr == MPI_STATUS_IGNORE || statusPtr == FORTRAN_MPI_STATUS_IGNORE) { statusPtr = &statusBuffer; } +#endif // defined(EXAMPI) MPI_Request realRequest; realRequest = VIRTUAL_TO_REAL_REQUEST(*request); - if (*request != MPI_REQUEST_NULL && realRequest == MPI_REQUEST_NULL) { + if (*request != REAL_CONSTANT(REQUEST_NULL) && realRequest == REAL_CONSTANT(REQUEST_NULL)) { *flag = 1; REMOVE_OLD_REQUEST(*request); - *request = MPI_REQUEST_NULL; + *request = REAL_CONSTANT(REQUEST_NULL); DMTCP_PLUGIN_ENABLE_CKPT(); // FIXME: We should also fill in the status return MPI_SUCCESS; @@ -91,7 +96,7 @@ USER_DEFINED_WRAPPER(int, Test, (MPI_Request*) request, // Updating global counter of recv bytes // FIXME: This if statement should be merged into // clearPendingRequestFromLog() - if (*flag && *request != MPI_REQUEST_NULL + if (*flag && *request != REAL_CONSTANT(REQUEST_NULL) && g_nonblocking_calls.find(*request) != g_nonblocking_calls.end() && g_nonblocking_calls[*request]->type == IRECV_REQUEST) { int count = 0; @@ -108,12 +113,11 @@ USER_DEFINED_WRAPPER(int, Test, (MPI_Request*) request, fflush(stdout); #endif } - LOG_POST_Test(request, statusPtr); - if (retval == MPI_SUCCESS && *flag && MPI_LOGGING()) { + // LOG_POST_Test(request, statusPtr); + if (retval == MPI_SUCCESS && *flag && mana_state != RESTART_REPLAY) { clearPendingRequestFromLog(*request); REMOVE_OLD_REQUEST(*request); - LOG_REMOVE_REQUEST(*request); // remove from record-replay log - *request = MPI_REQUEST_NULL; + *request = REAL_CONSTANT(REQUEST_NULL); } DMTCP_PLUGIN_ENABLE_CKPT(); return retval; @@ -136,6 +140,15 @@ USER_DEFINED_WRAPPER(int, Testall, (int) count, for (int i = 0; i < count; i++) { // 
FIXME: Ideally, we should only check FORTRAN_MPI_STATUS_IGNORE // in the Fortran wrapper. +#ifdef EXAMPI + if (local_array_of_statuses != MPI_STATUSES_IGNORE) { + retval = MPI_Test(&local_array_of_requests[i], local_flag, + &local_array_of_statuses[i]); + } else { + retval = MPI_Test(&local_array_of_requests[i], local_flag, + MPI_STATUS_IGNORE); + } +#else if (local_array_of_statuses != MPI_STATUSES_IGNORE && local_array_of_statuses != FORTRAN_MPI_STATUSES_IGNORE) { retval = MPI_Test(&local_array_of_requests[i], local_flag, @@ -144,6 +157,7 @@ USER_DEFINED_WRAPPER(int, Testall, (int) count, retval = MPI_Test(&local_array_of_requests[i], local_flag, MPI_STATUS_IGNORE); } +#endif // defined(EXAMPI) if (retval != MPI_SUCCESS) { *local_flag = 0; break; @@ -181,7 +195,7 @@ USER_DEFINED_WRAPPER(int, Testany, (int) count, *local_flag = 1; *local_index = MPI_UNDEFINED; for (int i = 0; i < local_count; i++) { - if (local_array_of_requests[i] == MPI_REQUEST_NULL) { + if (local_array_of_requests[i] == REAL_CONSTANT(REQUEST_NULL)) { continue; } retval = MPI_Test(&local_array_of_requests[i], local_flag, local_status); @@ -219,12 +233,17 @@ USER_DEFINED_WRAPPER(int, Waitall, (int) count, MPI_Request *local_array_of_requests = array_of_requests; MPI_Status *local_array_of_statuses = array_of_statuses; + #ifndef EXAMPI get_fortran_constants(); + #endif + for (int i = 0; i < count; i++) { /* FIXME: Is there a chance it gets a valid C address, which we shouldn't * ignore? Ideally, we should only check FORTRAN_MPI_STATUSES_IGNORE * in the Fortran wrapper. 
*/ + + #ifndef EXAMPI if (local_array_of_statuses != MPI_STATUSES_IGNORE && local_array_of_statuses != FORTRAN_MPI_STATUSES_IGNORE) { retval = MPI_Wait(&local_array_of_requests[i], @@ -232,9 +251,21 @@ USER_DEFINED_WRAPPER(int, Waitall, (int) count, } else { retval = MPI_Wait(&local_array_of_requests[i], MPI_STATUS_IGNORE); } + #else + if (local_array_of_statuses != MPI_STATUSES_IGNORE) { + retval = MPI_Wait(&local_array_of_requests[i], + &local_array_of_statuses[i]); + } else { + retval = MPI_Wait(&local_array_of_requests[i], MPI_STATUS_IGNORE); + } + #endif + if (retval != MPI_SUCCESS) { break; } + + + } #endif return retval; @@ -256,11 +287,11 @@ USER_DEFINED_WRAPPER(int, Waitany, (int) count, *local_index = MPI_UNDEFINED; int was_null[count] = {0}; for (int i = 0; i < count; i++) { - was_null[i] = local_array_of_requests[i] == MPI_REQUEST_NULL ? 1 : 0; + was_null[i] = local_array_of_requests[i] == REAL_CONSTANT(REQUEST_NULL) ? 1 : 0; } while (1) { for (int i = 0; i < count; i++) { - if (local_array_of_requests[i] == MPI_REQUEST_NULL) { + if (local_array_of_requests[i] == REAL_CONSTANT(REQUEST_NULL)) { if (was_null[i]) { continue; } else { @@ -278,7 +309,7 @@ USER_DEFINED_WRAPPER(int, Waitany, (int) count, } if (flag) { MPI_Request *request = &local_array_of_requests[i]; - if (*request != MPI_REQUEST_NULL + if (*request != REAL_CONSTANT(REQUEST_NULL) && g_nonblocking_calls.find(*request) != g_nonblocking_calls.end() && g_nonblocking_calls[*request]->type == IRECV_REQUEST) { int count = 0; @@ -289,17 +320,17 @@ USER_DEFINED_WRAPPER(int, Waitany, (int) count, MPI_Comm comm = g_nonblocking_calls[*request]->comm; int worldRank = localRankToGlobalRank(local_status->MPI_SOURCE, comm); g_recvBytesByRank[worldRank] += count * size; - } else if (*request == MPI_REQUEST_NULL) { + } else if (*request == REAL_CONSTANT(REQUEST_NULL)) { if (!was_null[i]) { *local_index = i; return retval; } } - if (MPI_LOGGING()) { + if (mana_state != RESTART_REPLAY) { 
clearPendingRequestFromLog(local_array_of_requests[i]); REMOVE_OLD_REQUEST(local_array_of_requests[i]); - local_array_of_requests[i] = MPI_REQUEST_NULL; + local_array_of_requests[i] = REAL_CONSTANT(REQUEST_NULL); } *local_index = i; @@ -319,9 +350,9 @@ USER_DEFINED_WRAPPER(int, Waitany, (int) count, USER_DEFINED_WRAPPER(int, Wait, (MPI_Request*) request, (MPI_Status*) status) { int retval; - if (*request == MPI_REQUEST_NULL) { + if (*request == REAL_CONSTANT(REQUEST_NULL)) { // *request might be in read-only memory. So we can't overwrite it with - // MPI_REQUEST_NULL later. + // REAL_CONSTANT(REQUEST_NULL) later. return MPI_SUCCESS; } int flag = 0; @@ -329,10 +360,16 @@ USER_DEFINED_WRAPPER(int, Wait, (MPI_Request*) request, (MPI_Status*) status) MPI_Status *statusPtr = status; // FIXME: Ideally, we should only check FORTRAN_MPI_STATUS_IGNORE // in the Fortran wrapper. +#ifndef EXAMPI if (statusPtr == MPI_STATUS_IGNORE || statusPtr == FORTRAN_MPI_STATUS_IGNORE) { statusPtr = &statusBuffer; } +#else + if (statusPtr == MPI_STATUS_IGNORE) { + statusPtr = &statusBuffer; + } +#endif // ifndef EXAMPI // FIXME: We translate the virtual request in every iteration. // We want to translate it only once, and update the real request // after restart if we checkpoint in the while loop. 
@@ -343,7 +380,7 @@ USER_DEFINED_WRAPPER(int, Wait, (MPI_Request*) request, (MPI_Status*) status) // Updating global counter of recv bytes // FIXME: This if statement should be merged into // clearPendingRequestFromLog() - if (flag && *request != MPI_REQUEST_NULL + if (flag && *request != REAL_CONSTANT(REQUEST_NULL) && g_nonblocking_calls.find(*request) != g_nonblocking_calls.end() && g_nonblocking_calls[*request]->type == IRECV_REQUEST) { int count = 0; @@ -363,11 +400,10 @@ USER_DEFINED_WRAPPER(int, Wait, (MPI_Request*) request, (MPI_Status*) status) if (p2p_deterministic_skip_save_request == 0) { if (flag) LOG_POST_Wait(request, statusPtr); } - if (flag && MPI_LOGGING()) { + if (flag && mana_state != RESTART_REPLAY) { clearPendingRequestFromLog(*request); // Remove from g_nonblocking_calls REMOVE_OLD_REQUEST(*request); // Remove from virtual id - LOG_REMOVE_REQUEST(*request); // Remove from record-replay log - *request = MPI_REQUEST_NULL; + *request = REAL_CONSTANT(REQUEST_NULL); } DMTCP_PLUGIN_ENABLE_CKPT(); } @@ -402,6 +438,7 @@ USER_DEFINED_WRAPPER(int, Iprobe, (int) source, (int) tag, (MPI_Comm) comm, return retval; } +#if defined(MPICH) USER_DEFINED_WRAPPER(int, Request_get_status, (MPI_Request) request, (int *) flag, (MPI_Status *) status) { @@ -414,11 +451,14 @@ USER_DEFINED_WRAPPER(int, Request_get_status, (MPI_Request) request, DMTCP_PLUGIN_ENABLE_CKPT(); return retval; } +#endif // defined(MPICH) +#if defined(MPICH) DEFINE_FNC(int, Get_elements, (const MPI_Status *) status, (MPI_Datatype) datatype, (int *) count); DEFINE_FNC(int, Get_elements_x, (const MPI_Status *) status, (MPI_Datatype) datatype, (MPI_Count *) count); +#endif // defined(MPICH) PMPI_IMPL(int, MPI_Test, MPI_Request* request, int* flag, MPI_Status* status) PMPI_IMPL(int, MPI_Wait, MPI_Request* request, MPI_Status* status) @@ -434,10 +474,11 @@ PMPI_IMPL(int, MPI_Testall, int count, MPI_Request array_of_requests[], int *flag, MPI_Status *array_of_statuses) PMPI_IMPL(int, MPI_Testany, int 
count, MPI_Request array_of_requests[], int *index, int *flag, MPI_Status *status); +#if defined(MPICH) PMPI_IMPL(int, MPI_Get_elements, const MPI_Status *status, MPI_Datatype datatype, int *count) PMPI_IMPL(int, MPI_Get_elements_x, const MPI_Status *status, MPI_Datatype datatype, MPI_Count *count) PMPI_IMPL(int, MPI_Request_get_status, MPI_Request request, int* flag, MPI_Status *status) - +#endif // defined(MPICH) diff --git a/mpi-proxy-split/mpi-wrappers/mpi_type_wrappers.cpp b/mpi-proxy-split/mpi-wrappers/mpi_type_wrappers.cpp index fcdd8ffe9..26c35e6e7 100644 --- a/mpi-proxy-split/mpi-wrappers/mpi_type_wrappers.cpp +++ b/mpi-proxy-split/mpi-wrappers/mpi_type_wrappers.cpp @@ -27,11 +27,8 @@ #include "jfilesystem.h" #include "protectedfds.h" #include "mpi_nextfunc.h" -#include "record-replay.h" #include "virtual-ids.h" -using namespace dmtcp_mpi; - USER_DEFINED_WRAPPER(int, Type_size, (MPI_Datatype) datatype, (int *) size) { int retval; @@ -52,13 +49,14 @@ USER_DEFINED_WRAPPER(int, Type_free, (MPI_Datatype *) type) JUMP_TO_LOWER_HALF(lh_info.fsaddr); retval = NEXT_FUNC(Type_free)(&realType); RETURN_TO_UPPER_HALF(); - if (retval == MPI_SUCCESS && MPI_LOGGING()) { + if (retval == MPI_SUCCESS && mana_state != RESTART_REPLAY) { // NOTE: We cannot remove the old type, since we'll need // to replay this call to reconstruct any new type that might // have been created using this type. // - // realType = REMOVE_OLD_TYPE(*type); - LOG_CALL(restoreTypes, Type_free, *type); + // FIXME: Now, we remove this type. Otherwise, if we try to decode a type + // that has been freed in the lower half, MPI will be upset. 
+ REMOVE_OLD_TYPE(*type); } DMTCP_PLUGIN_ENABLE_CKPT(); return retval; @@ -74,10 +72,6 @@ USER_DEFINED_WRAPPER(int, Type_commit, (MPI_Datatype *) type) RETURN_TO_UPPER_HALF(); if (retval != MPI_SUCCESS) { realType = REMOVE_OLD_TYPE(*type); - } else { - if (MPI_LOGGING()) { - LOG_CALL(restoreTypes, Type_commit, *type); - } } DMTCP_PLUGIN_ENABLE_CKPT(); return retval; @@ -92,15 +86,15 @@ USER_DEFINED_WRAPPER(int, Type_contiguous, (int) count, (MPI_Datatype) oldtype, JUMP_TO_LOWER_HALF(lh_info.fsaddr); retval = NEXT_FUNC(Type_contiguous)(count, realType, newtype); RETURN_TO_UPPER_HALF(); - if (retval == MPI_SUCCESS && MPI_LOGGING()) { + if (retval == MPI_SUCCESS && mana_state != RESTART_REPLAY) { MPI_Datatype virtType = ADD_NEW_TYPE(*newtype); *newtype = virtType; - LOG_CALL(restoreTypes, Type_contiguous, count, oldtype, virtType); } DMTCP_PLUGIN_ENABLE_CKPT(); return retval; } +#if defined(MPICH) USER_DEFINED_WRAPPER(int, Type_hvector, (int) count, (int) blocklength, (MPI_Aint) stride, (MPI_Datatype) oldtype, (MPI_Datatype*) newtype) @@ -112,34 +106,60 @@ USER_DEFINED_WRAPPER(int, Type_hvector, (int) count, (int) blocklength, retval = NEXT_FUNC(Type_hvector)(count, blocklength, stride, realType, newtype); RETURN_TO_UPPER_HALF(); - if (retval == MPI_SUCCESS && MPI_LOGGING()) { + if (retval == MPI_SUCCESS && mana_state != RESTART_REPLAY) { MPI_Datatype virtType = ADD_NEW_TYPE(*newtype); *newtype = virtType; - LOG_CALL(restoreTypes, Type_hvector, count, blocklength, - stride, oldtype, virtType); } DMTCP_PLUGIN_ENABLE_CKPT(); return retval; } + USER_DEFINED_WRAPPER(int, Type_create_hvector, (int) count, (int) blocklength, (MPI_Aint) stride, (MPI_Datatype) oldtype, (MPI_Datatype*) newtype) { return MPI_Type_hvector(count, blocklength, stride, oldtype, newtype); } +#else // defined(MPICH) + USER_DEFINED_WRAPPER(int, Type_create_hvector, (int) count, (int) blocklength, + (MPI_Aint) stride, (MPI_Datatype) oldtype, + (MPI_Datatype*) newtype) +{ + int retval; + 
DMTCP_PLUGIN_DISABLE_CKPT(); + MPI_Datatype realType = VIRTUAL_TO_REAL_TYPE(oldtype); + JUMP_TO_LOWER_HALF(lh_info.fsaddr); + retval = NEXT_FUNC(Type_create_hvector)(count, blocklength, + stride, realType, newtype); + RETURN_TO_UPPER_HALF(); + if (retval == MPI_SUCCESS && mana_state != RESTART_REPLAY) { + MPI_Datatype virtType = ADD_NEW_TYPE(*newtype); + *newtype = virtType; + } + DMTCP_PLUGIN_ENABLE_CKPT(); + return retval; +} +#endif // defined(MPICH) USER_DEFINED_WRAPPER(int, Type_vector, (int) count, (int) blocklength, (int) stride, (MPI_Datatype) oldtype, (MPI_Datatype*) newtype) { - int size; - int retval = MPI_Type_size(oldtype, &size); - if(retval != MPI_SUCCESS) { - return retval; - } + int retval; + DMTCP_PLUGIN_DISABLE_CKPT(); - return MPI_Type_hvector(count, blocklength, stride*size, oldtype, newtype); + MPI_Datatype realType = VIRTUAL_TO_REAL_TYPE(oldtype); + JUMP_TO_LOWER_HALF(lh_info.fsaddr); + retval = NEXT_FUNC(Type_vector)(count, blocklength, + stride, realType, newtype); + RETURN_TO_UPPER_HALF(); + if (retval == MPI_SUCCESS) { + MPI_Datatype virtType = ADD_NEW_TYPE(*newtype); + *newtype = virtType; + } + DMTCP_PLUGIN_ENABLE_CKPT(); + return retval; } // int MPI_Type_create_struct(int count, @@ -165,18 +185,18 @@ USER_DEFINED_WRAPPER(int, Type_create_struct, (int) count, array_of_displacements, realTypes, newtype); RETURN_TO_UPPER_HALF(); - if (retval == MPI_SUCCESS && MPI_LOGGING()) { + if (retval == MPI_SUCCESS && mana_state != RESTART_REPLAY) { MPI_Datatype virtType = ADD_NEW_TYPE(*newtype); *newtype = virtType; - FncArg bs = CREATE_LOG_BUF(array_of_blocklengths, count * sizeof(int)); - FncArg ds = CREATE_LOG_BUF(array_of_displacements, count * sizeof(MPI_Aint)); - FncArg ts = CREATE_LOG_BUF(array_of_types, count * sizeof(MPI_Datatype)); - LOG_CALL(restoreTypes, Type_create_struct, count, bs, ds, ts, virtType); + // FncArg bs = CREATE_LOG_BUF(array_of_blocklengths, count * sizeof(int)); + // FncArg ds = CREATE_LOG_BUF(array_of_displacements, 
count * sizeof(MPI_Aint)); + // FncArg ts = CREATE_LOG_BUF(array_of_types, count * sizeof(MPI_Datatype)); } DMTCP_PLUGIN_ENABLE_CKPT(); return retval; } +#if defined(MPICH) // Perlmutter cray_mpich both implement MPI 3.1. However, they use different // APIs. We use MPICH_NUMVERSION (3.4a2) to differentiate the cray-mpich on Cori // and Perlmuttter. This ad-hoc workaround should be removed once the cray-mpich @@ -197,7 +217,10 @@ USER_DEFINED_WRAPPER(int, Type_struct, (int) count, array_of_displacements, array_of_types, newtype ); } +#endif // if defined(MPICH) + +#if defined(MPICH) #if MPICH_NUMVERSION < MPICH_CALC_VERSION(3,4,0,0,2) && defined(CRAY_MPICH_VERSION) USER_DEFINED_WRAPPER(int, Type_hindexed, (int) count, (const int*) array_of_blocklengths, @@ -218,18 +241,19 @@ USER_DEFINED_WRAPPER(int, Type_hindexed, (int) count, array_of_displacements, realType, newtype); RETURN_TO_UPPER_HALF(); - if (retval == MPI_SUCCESS && MPI_LOGGING()) { + if (retval == MPI_SUCCESS && mana_state != RESTART_REPLAY) { MPI_Datatype virtType = ADD_NEW_TYPE(*newtype); *newtype = virtType; - FncArg bs = CREATE_LOG_BUF(array_of_blocklengths, count * sizeof(int)); - FncArg ds = CREATE_LOG_BUF(array_of_displacements, - count * sizeof(MPI_Aint)); - LOG_CALL(restoreTypes, Type_hindexed, count, bs, ds, oldtype, virtType); + // FncArg bs = CREATE_LOG_BUF(array_of_blocklengths, count * sizeof(int)); + // FncArg ds = CREATE_LOG_BUF(array_of_displacements, + // count * sizeof(MPI_Aint)); } DMTCP_PLUGIN_ENABLE_CKPT(); return retval; } +#endif // if defined(MPICH) +#if defined(MPICH) USER_DEFINED_WRAPPER(int, Type_create_hindexed, (int) count, (const int*) array_of_blocklengths, (const MPI_Aint*) array_of_displacements, @@ -250,7 +274,9 @@ USER_DEFINED_WRAPPER(int, Type_create_hindexed, (int) count, return ret; #endif } +#endif // if defined(MPICH) +#if defined(MPICH) USER_DEFINED_WRAPPER(int, Type_create_hindexed_block, (int) count, (int) blocklength, (const MPI_Aint*) array_of_displacements, @@ 
-272,6 +298,7 @@ USER_DEFINED_WRAPPER(int, Type_create_hindexed_block, (int) count, return ret; #endif } +#endif // defined(MPICH) USER_DEFINED_WRAPPER(int, Type_hindexed_block, (int) count, (int) blocklength, @@ -296,17 +323,17 @@ USER_DEFINED_WRAPPER(int, Type_indexed, (int) count, array_of_displacements, realType, newtype); RETURN_TO_UPPER_HALF(); - if (retval == MPI_SUCCESS && MPI_LOGGING()) { + if (retval == MPI_SUCCESS && mana_state != RESTART_REPLAY) { MPI_Datatype virtType = ADD_NEW_TYPE(*newtype); *newtype = virtType; - FncArg bs = CREATE_LOG_BUF(array_of_blocklengths, count * sizeof(int)); - FncArg ds = CREATE_LOG_BUF(array_of_displacements, count * sizeof(int)); - LOG_CALL(restoreTypes, Type_indexed, count, bs, ds, oldtype, virtType); + // FncArg bs = CREATE_LOG_BUF(array_of_blocklengths, count * sizeof(int)); + // FncArg ds = CREATE_LOG_BUF(array_of_displacements, count * sizeof(int)); } DMTCP_PLUGIN_ENABLE_CKPT(); return retval; } +#if defined(MPICH) USER_DEFINED_WRAPPER(int, Type_dup, (MPI_Datatype) oldtype, (MPI_Datatype*) newtype) { @@ -316,14 +343,14 @@ USER_DEFINED_WRAPPER(int, Type_dup, (MPI_Datatype) oldtype, JUMP_TO_LOWER_HALF(lh_info.fsaddr); retval = NEXT_FUNC(Type_dup)(realType, newtype); RETURN_TO_UPPER_HALF(); - if (retval == MPI_SUCCESS && MPI_LOGGING()) { + if (retval == MPI_SUCCESS && mana_state != RESTART_REPLAY) { MPI_Datatype virtType = ADD_NEW_TYPE(*newtype); *newtype = virtType; - LOG_CALL(restoreTypes, Type_dup, oldtype, virtType); } DMTCP_PLUGIN_ENABLE_CKPT(); return retval; } +#endif // defined(MPICH) USER_DEFINED_WRAPPER(int, Type_create_resized, (MPI_Datatype) oldtype, (MPI_Aint) lb, (MPI_Aint) extent, (MPI_Datatype*) newtype) @@ -334,10 +361,9 @@ USER_DEFINED_WRAPPER(int, Type_create_resized, (MPI_Datatype) oldtype, JUMP_TO_LOWER_HALF(lh_info.fsaddr); retval = NEXT_FUNC(Type_create_resized)(realType, lb, extent, newtype); RETURN_TO_UPPER_HALF(); - if (retval == MPI_SUCCESS && MPI_LOGGING()) { + if (retval == MPI_SUCCESS && 
mana_state != RESTART_REPLAY) { MPI_Datatype virtType = ADD_NEW_TYPE(*newtype); *newtype = virtType; - LOG_CALL(restoreTypes, Type_create_resized, oldtype, lb, extent, virtType); } DMTCP_PLUGIN_ENABLE_CKPT(); return retval; @@ -394,16 +420,19 @@ PMPI_IMPL(int, MPI_Type_commit, MPI_Datatype *type) PMPI_IMPL(int, MPI_Type_contiguous, int count, MPI_Datatype oldtype, MPI_Datatype *newtype) PMPI_IMPL(int, MPI_Type_free, MPI_Datatype *type) +#if defined(MPICH) PMPI_IMPL(int, MPI_Type_vector, int count, int blocklength, int stride, MPI_Datatype oldtype, MPI_Datatype *newtype) PMPI_IMPL(int, MPI_Type_hvector, int count, int blocklength, MPI_Aint stride, MPI_Datatype oldtype, MPI_Datatype *newtype) +#endif // defined(MPICH) PMPI_IMPL(int, MPI_Type_create_hvector, int count, int blocklength, MPI_Aint stride, MPI_Datatype oldtype, MPI_Datatype *newtype) PMPI_IMPL(int, MPI_Type_create_struct, int count, const int array_of_blocklengths[], const MPI_Aint array_of_displacements[], const MPI_Datatype array_of_types[], MPI_Datatype *newtype) +#if defined(MPICH) #if MPICH_NUMVERSION < MPICH_CALC_VERSION(3,4,0,0,2) && defined(CRAY_MPICH_VERSION) PMPI_IMPL(int, MPI_Type_struct, int count, const int array_of_blocklengths[], const MPI_Aint array_of_displacements[], const MPI_Datatype array_of_types[], @@ -419,6 +448,7 @@ PMPI_IMPL(int, MPI_Type_hindexed, int count, int array_of_blocklengths[], MPI_Aint array_of_displacements[], MPI_Datatype oldtype, MPI_Datatype *newtype); #endif +#endif // endif defined(MPICH) PMPI_IMPL(int, MPI_Type_size_x, MPI_Datatype type, MPI_Count *size) PMPI_IMPL(int, MPI_Type_indexed, int count, const int array_of_blocklengths[], @@ -432,7 +462,9 @@ PMPI_IMPL(int, MPI_Pack, const void *inbuf, int incount, MPI_Datatype datatype, void *outbuf, int outsize, int *position, MPI_Comm comm) PMPI_IMPL(int, MPI_Type_create_resized, MPI_Datatype oldtype, MPI_Aint lb, MPI_Aint extent, MPI_Datatype *newtype); +#if defined(MPICH) PMPI_IMPL(int, MPI_Type_dup, MPI_Datatype 
type, MPI_Datatype *newtype); +#endif PMPI_IMPL(int, MPI_Type_create_hindexed, int count, const int array_of_blocklengths[], diff --git a/mpi-proxy-split/mpi-wrappers/mpi_unimplemented_wrappers_exampi.txt b/mpi-proxy-split/mpi-wrappers/mpi_unimplemented_wrappers_exampi.txt new file mode 100644 index 000000000..5899c44bb --- /dev/null +++ b/mpi-proxy-split/mpi-wrappers/mpi_unimplemented_wrappers_exampi.txt @@ -0,0 +1,176 @@ +#include <mpi.h>; +#include "jassert.h"; + +int MPI_Add_error_class(int *errorclass); +int MPI_Add_error_code(int errorclass, int *errorcode); +int MPI_Add_error_string(int errorcode, const char *string); +int MPI_Iallgather(const void *sendbuf, int sendcount, + MPI_Datatype sendtype, void *recvbuf, int recvcount, + MPI_Datatype recvtype, MPI_Comm comm, + MPI_Request *request); +int MPI_Iallgatherv(const void *sendbuf, int sendcount, + MPI_Datatype sendtype, void *recvbuf, + const int recvcounts[], const int displs[], + MPI_Datatype recvtype, MPI_Comm comm, + MPI_Request *request); +int MPI_Iallreduce(const void *sendbuf, void *recvbuf, int count, + MPI_Datatype datatype, MPI_Op op, MPI_Comm comm, + MPI_Request *request); +int MPI_Ialltoall(const void *sendbuf, int sendcount, + MPI_Datatype sendtype, void *recvbuf, int recvcount, + MPI_Datatype recvtype, MPI_Comm comm, + MPI_Request *request); +int MPI_Ialltoallv(const void *sendbuf, const int sendcounts[], + const int sdispls[], MPI_Datatype sendtype, + void *recvbuf, const int recvcounts[], + const int rdispls[], MPI_Datatype recvtype, + MPI_Comm comm, MPI_Request *request); +int MPI_Alltoallw(const void *sendbuf, const int sendcounts[], + const int sdispls[], const MPI_Datatype sendtypes[], + void *recvbuf, const int recvcounts[], + const int rdispls[], const MPI_Datatype recvtypes[], + MPI_Comm comm); +int MPI_Ialltoallw(const void *sendbuf, const int sendcounts[], + const int sdispls[], const MPI_Datatype sendtypes[], + void *recvbuf, const int recvcounts[], + const int rdispls[], const
MPI_Datatype recvtypes[], + MPI_Comm comm, MPI_Request *request); +int MPI_Bsend(const void *buf, int count, MPI_Datatype datatype, + int dest, int tag, MPI_Comm comm); +int MPI_Bsend_init(const void *buf, int count, MPI_Datatype datatype, + int dest, int tag, MPI_Comm comm, + MPI_Request *request); +int MPI_Buffer_attach(void *buffer, int size); +int MPI_Buffer_detach(void *buffer, int *size); +int MPI_Cancel(MPI_Request *request); +int MPI_Close_port(const char *port_name); +int MPI_Comm_accept(const char *port_name, MPI_Info info, int root, MPI_Comm comm, MPI_Comm *newcomm); + +int MPI_Comm_call_errhandler(MPI_Comm comm, int errorcode); +int MPI_Comm_connect(const char *port_name, MPI_Info info, int root, MPI_Comm comm, MPI_Comm *newcomm); +int MPI_Comm_disconnect(MPI_Comm *comm); +int MPI_Comm_idup(MPI_Comm comm, MPI_Comm *newcomm, MPI_Request *request); +int MPI_Comm_dup_with_info(MPI_Comm comm, MPI_Info info, MPI_Comm *newcomm); +int MPI_Dist_graph_create(MPI_Comm comm_old, int n, const int nodes[], const int degrees[], const int targets[], const int weights[], MPI_Info info, int reorder, MPI_Comm * newcomm); +int MPI_Dist_graph_create_adjacent(MPI_Comm comm_old, int indegree, const int sources[], const int sourceweights[], int outdegree, const int destinations[], const int destweights[], MPI_Info info, int reorder, MPI_Comm *comm_dist_graph); +int MPI_Dist_graph_neighbors(MPI_Comm comm, int maxindegree, int sources[], int sourceweights[], int maxoutdegree, int destinations[], int destweights[]); +int MPI_Dist_graph_neighbors_count(MPI_Comm comm, int *inneighbors, int *outneighbors, int *weighted); +int MPI_Comm_get_errhandler(MPI_Comm comm, MPI_Errhandler *erhandler); +int MPI_Comm_get_info(MPI_Comm comm, MPI_Info *info_used); +int MPI_Comm_get_name(MPI_Comm comm, char *comm_name, int *resultlen); +int MPI_Comm_get_parent(MPI_Comm *parent); +int MPI_Comm_join(int fd, MPI_Comm *intercomm); +int MPI_Comm_remote_group(MPI_Comm comm, MPI_Group *group); +int 
MPI_Comm_remote_size(MPI_Comm comm, int *size); +int MPI_Comm_set_info(MPI_Comm comm, MPI_Info info); +int MPI_Comm_set_name(MPI_Comm comm, const char *comm_name); +int MPI_Comm_spawn(const char *command, char *argv[], int maxprocs, MPI_Info info, int root, MPI_Comm comm, MPI_Comm *intercomm, int array_of_errcodes[]); +int MPI_Comm_spawn_multiple(int count, char *array_of_commands[], char **array_of_argv[], const int array_of_maxprocs[], const MPI_Info array_of_info[], int root, MPI_Comm comm, MPI_Comm *intercomm, int array_of_errcodes[]); +int MPI_Comm_test_inter(MPI_Comm comm, int *flag); +int MPI_Exscan(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op, MPI_Comm comm); +int MPI_Iexscan(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op, MPI_Comm comm, MPI_Request *request); + + +int MPI_Igather(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, int recvcount, MPI_Datatype recvtype, int root, MPI_Comm comm, MPI_Request *request); +int MPI_Igatherv(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, const int recvcounts[], const int displs[], MPI_Datatype recvtype, int root, MPI_Comm comm, MPI_Request *request); +int MPI_Get_version(int *version, int *subversion); +int MPI_Graph_create(MPI_Comm comm_old, int nnodes, const int index[], const int edges[], int reorder, MPI_Comm *comm_graph); +int MPI_Graph_get(MPI_Comm comm, int maxindex, int maxedges, int index[], int edges[]); +int MPI_Graph_map(MPI_Comm comm, int nnodes, const int index[], const int edges[], int *newrank); +int MPI_Graph_neighbors_count(MPI_Comm comm, int rank, int *nneighbors); +int MPI_Graph_neighbors(MPI_Comm comm, int rank, int maxneighbors, int neighbors[]); +int MPI_Graphdims_get(MPI_Comm comm, int *nnodes, int *nedges); + +int MPI_Group_difference(MPI_Group group1, MPI_Group group2, MPI_Group *newgroup); +int MPI_Group_excl(MPI_Group group, int n, const int ranks[], MPI_Group *newgroup); 
+ +int MPI_Group_intersection(MPI_Group group1, MPI_Group group2, MPI_Group *newgroup); +int MPI_Group_range_excl(MPI_Group group, int n, int ranges[][3], MPI_Group *newgroup); +int MPI_Group_range_incl(MPI_Group group, int n, int ranges[][3], MPI_Group *newgroup); +int MPI_Group_union(MPI_Group group1, MPI_Group group2, MPI_Group *newgroup); +int MPI_Ibsend(const void *buf, int count, MPI_Datatype datatype, int dest, int tag, MPI_Comm comm, MPI_Request *request); + +int MPI_Info_create(MPI_Info *info); +int MPI_Info_delete(MPI_Info info, const char *key); +int MPI_Info_dup(MPI_Info info, MPI_Info *newinfo); + +int MPI_Info_free(MPI_Info *info); +int MPI_Info_get(MPI_Info info, const char *key, int valuelen, char *value, int *flag); +int MPI_Info_get_nkeys(MPI_Info info, int *nkeys); +int MPI_Info_get_nthkey(MPI_Info info, int n, char *key); +int MPI_Info_get_valuelen(MPI_Info info, const char *key, int *valuelen, int *flag); +int MPI_Info_set(MPI_Info info, const char *key, const char *value); +int MPI_Intercomm_create(MPI_Comm local_comm, int local_leader, MPI_Comm bridge_comm, int remote_leader, int tag, MPI_Comm *newintercomm); +int MPI_Intercomm_merge(MPI_Comm intercomm, int high, MPI_Comm *newintercomm); +int MPI_Irsend(const void *buf, int count, MPI_Datatype datatype, int dest, int tag, MPI_Comm comm, MPI_Request *request); +int MPI_Issend(const void *buf, int count, MPI_Datatype datatype, int dest, int tag, MPI_Comm comm, MPI_Request *request); +int MPI_Is_thread_main(int *flag); +int MPI_Lookup_name(const char *service_name, MPI_Info info, char *port_name); + + +int MPI_Neighbor_allgather(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, int recvcount, MPI_Datatype recvtype, MPI_Comm comm); +int MPI_Ineighbor_allgather(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, int recvcount, MPI_Datatype recvtype, MPI_Comm comm, MPI_Request *request); +int MPI_Neighbor_allgatherv(const void *sendbuf, int sendcount, 
MPI_Datatype sendtype, void *recvbuf, const int recvcounts[], const int displs[], MPI_Datatype recvtype, MPI_Comm comm); +int MPI_Ineighbor_allgatherv(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, const int recvcounts[], const int displs[], MPI_Datatype recvtype, MPI_Comm comm, MPI_Request *request); +int MPI_Neighbor_alltoall(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, int recvcount, MPI_Datatype recvtype, MPI_Comm comm); +int MPI_Ineighbor_alltoall(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, int recvcount, MPI_Datatype recvtype, MPI_Comm comm, MPI_Request *request); +int MPI_Neighbor_alltoallv(const void *sendbuf, const int sendcounts[], const int sdispls[], MPI_Datatype sendtype, void *recvbuf, const int recvcounts[], const int rdispls[], MPI_Datatype recvtype, MPI_Comm comm); +int MPI_Ineighbor_alltoallv(const void *sendbuf, const int sendcounts[], const int sdispls[], MPI_Datatype sendtype, void *recvbuf, const int recvcounts[], const int rdispls[], MPI_Datatype recvtype, MPI_Comm comm, MPI_Request *request); +int MPI_Neighbor_alltoallw(const void *sendbuf, const int sendcounts[], const MPI_Aint sdispls[], const MPI_Datatype sendtypes[], void *recvbuf, const int recvcounts[], const MPI_Aint rdispls[], const MPI_Datatype recvtypes[], MPI_Comm comm); +int MPI_Ineighbor_alltoallw(const void *sendbuf, const int sendcounts[], const MPI_Aint sdispls[], const MPI_Datatype sendtypes[], void *recvbuf, const int recvcounts[], const MPI_Aint rdispls[], const MPI_Datatype recvtypes[], MPI_Comm comm, MPI_Request *request); + +int MPI_Op_commutative(MPI_Op op, int *commute); +int MPI_Open_port(MPI_Info info, char *port_name); + +int MPI_Pack_external(const char datarep[], const void *inbuf, int incount, MPI_Datatype datatype, void *outbuf, MPI_Aint outsize, MPI_Aint *position); +int MPI_Pack_external_size(const char datarep[], int incount, MPI_Datatype datatype, MPI_Aint *size); +int 
MPI_Publish_name(const char *service_name, MPI_Info info, const char *port_name); +int MPI_Query_thread(int *provided); +int MPI_Recv_init(void *buf, int count, MPI_Datatype datatype, int source, int tag, MPI_Comm comm, MPI_Request *request); +int MPI_Ireduce_scatter(const void *sendbuf, void *recvbuf, const int recvcounts[], MPI_Datatype datatype, MPI_Op op, MPI_Comm comm, MPI_Request *request); +int MPI_Reduce_scatter_block(const void *sendbuf, void *recvbuf, int recvcount, MPI_Datatype datatype, MPI_Op op, MPI_Comm comm); +int MPI_Ireduce_scatter_block(const void *sendbuf, void *recvbuf, int recvcount, MPI_Datatype datatype, MPI_Op op, MPI_Comm comm, MPI_Request *request); + + +int MPI_Request_free(MPI_Request *request); +int MPI_Rsend_init(const void *buf, int count, MPI_Datatype datatype, int dest, int tag, MPI_Comm comm, MPI_Request *request); +int MPI_Iscan(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op, MPI_Comm comm, MPI_Request *request); +int MPI_Iscatter(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, int recvcount, MPI_Datatype recvtype, int root, MPI_Comm comm, MPI_Request *request); +int MPI_Iscatterv(const void *sendbuf, const int sendcounts[], const int displs[], MPI_Datatype sendtype, void *recvbuf, int recvcount, MPI_Datatype recvtype, int root, MPI_Comm comm, MPI_Request *request); +int MPI_Send_init(const void *buf, int count, MPI_Datatype datatype, int dest, int tag, MPI_Comm comm, MPI_Request *request); +int MPI_Ssend_init(const void *buf, int count, MPI_Datatype datatype, int dest, int tag, MPI_Comm comm, MPI_Request *request); +int MPI_Ssend(const void *buf, int count, MPI_Datatype datatype, int dest, int tag, MPI_Comm comm); +int MPI_Start(MPI_Request *request); +int MPI_Startall(int count, MPI_Request array_of_requests[]); + + +int MPI_Status_set_cancelled(MPI_Status *status, int flag); +int MPI_Status_set_elements(MPI_Status *status, MPI_Datatype datatype, int count); +int 
MPI_Status_set_elements_x(MPI_Status *status, MPI_Datatype datatype, MPI_Count count); +int MPI_Test_cancelled(const MPI_Status *status, int *flag); +int MPI_Testsome(int incount, MPI_Request array_of_requests[], int *outcount, int array_of_indices[], MPI_Status array_of_statuses[]); + +int MPI_Type_create_darray(int size, int rank, int ndims, const int gsize_array[], const int distrib_array[], const int darg_array[], const int psize_array[], int order, MPI_Datatype oldtype, MPI_Datatype *newtype); +int MPI_Type_create_f90_complex(int p, int r, MPI_Datatype *newtype); +int MPI_Type_create_f90_integer(int r, MPI_Datatype *newtype); +int MPI_Type_create_f90_real(int p, int r, MPI_Datatype *newtype); +int MPI_Type_create_indexed_block(int count, int blocklength, const int array_of_displacements[], MPI_Datatype oldtype, MPI_Datatype *newtype); +int MPI_Type_create_subarray(int ndims, const int size_array[], const int subsize_array[], const int start_array[], int order, MPI_Datatype oldtype, MPI_Datatype *newtype); +int MPI_Type_delete_attr(MPI_Datatype type, int type_keyval); +int MPI_Type_free_keyval(int *type_keyval); + +int MPI_Type_get_attr(MPI_Datatype type, int type_keyval, void *attribute_val, int *flag); +int MPI_Type_get_contents(MPI_Datatype mtype, int max_integers, int max_addresses, int max_datatypes, int array_of_integers[], MPI_Aint array_of_addresses[], MPI_Datatype array_of_datatypes[]); +int MPI_Type_get_envelope(MPI_Datatype type, int *num_integers, int *num_addresses, int *num_datatypes, int *combiner); +int MPI_Type_get_extent_x(MPI_Datatype type, MPI_Count *lb, MPI_Count *extent); +int MPI_Type_get_name(MPI_Datatype type, char *type_name, int *resultlen); +int MPI_Type_get_true_extent(MPI_Datatype datatype, MPI_Aint *true_lb, MPI_Aint *true_extent); +int MPI_Type_get_true_extent_x(MPI_Datatype datatype, MPI_Count *true_lb, MPI_Count *true_extent); +int MPI_Type_match_size(int typeclass, int size, MPI_Datatype *type); +int 
MPI_Type_set_attr(MPI_Datatype type, int type_keyval, void *attr_val); +int MPI_Type_set_name(MPI_Datatype type, const char *type_name); +int MPI_Unpack(const void *inbuf, int insize, int *position, void *outbuf, int outcount, MPI_Datatype datatype, MPI_Comm comm); +int MPI_Unpublish_name(const char *service_name, MPI_Info info, const char *port_name); +int MPI_Unpack_external (const char datarep[], const void *inbuf, MPI_Aint insize, MPI_Aint *position, void *outbuf, int outcount, MPI_Datatype datatype); +int MPI_Waitsome(int incount, MPI_Request array_of_requests[], int *outcount, int array_of_indices[], MPI_Status array_of_statuses[]); +double MPI_Wtick(void); diff --git a/mpi-proxy-split/mpi-wrappers/mpi_unimplemented_wrappers.txt b/mpi-proxy-split/mpi-wrappers/mpi_unimplemented_wrappers_mpich.txt similarity index 100% rename from mpi-proxy-split/mpi-wrappers/mpi_unimplemented_wrappers.txt rename to mpi-proxy-split/mpi-wrappers/mpi_unimplemented_wrappers_mpich.txt diff --git a/mpi-proxy-split/mpi-wrappers/mpi_unimplemented_wrappers_openmpi.txt b/mpi-proxy-split/mpi-wrappers/mpi_unimplemented_wrappers_openmpi.txt new file mode 100644 index 000000000..5899c44bb --- /dev/null +++ b/mpi-proxy-split/mpi-wrappers/mpi_unimplemented_wrappers_openmpi.txt @@ -0,0 +1,176 @@ +#include <mpi.h>; +#include "jassert.h"; + +int MPI_Add_error_class(int *errorclass); +int MPI_Add_error_code(int errorclass, int *errorcode); +int MPI_Add_error_string(int errorcode, const char *string); +int MPI_Iallgather(const void *sendbuf, int sendcount, + MPI_Datatype sendtype, void *recvbuf, int recvcount, + MPI_Datatype recvtype, MPI_Comm comm, + MPI_Request *request); +int MPI_Iallgatherv(const void *sendbuf, int sendcount, + MPI_Datatype sendtype, void *recvbuf, + const int recvcounts[], const int displs[], + MPI_Datatype recvtype, MPI_Comm comm, + MPI_Request *request); +int MPI_Iallreduce(const void *sendbuf, void *recvbuf, int count, + MPI_Datatype datatype, MPI_Op op, MPI_Comm comm, +
MPI_Request *request); +int MPI_Ialltoall(const void *sendbuf, int sendcount, + MPI_Datatype sendtype, void *recvbuf, int recvcount, + MPI_Datatype recvtype, MPI_Comm comm, + MPI_Request *request); +int MPI_Ialltoallv(const void *sendbuf, const int sendcounts[], + const int sdispls[], MPI_Datatype sendtype, + void *recvbuf, const int recvcounts[], + const int rdispls[], MPI_Datatype recvtype, + MPI_Comm comm, MPI_Request *request); +int MPI_Alltoallw(const void *sendbuf, const int sendcounts[], + const int sdispls[], const MPI_Datatype sendtypes[], + void *recvbuf, const int recvcounts[], + const int rdispls[], const MPI_Datatype recvtypes[], + MPI_Comm comm); +int MPI_Ialltoallw(const void *sendbuf, const int sendcounts[], + const int sdispls[], const MPI_Datatype sendtypes[], + void *recvbuf, const int recvcounts[], + const int rdispls[], const MPI_Datatype recvtypes[], + MPI_Comm comm, MPI_Request *request); +int MPI_Bsend(const void *buf, int count, MPI_Datatype datatype, + int dest, int tag, MPI_Comm comm); +int MPI_Bsend_init(const void *buf, int count, MPI_Datatype datatype, + int dest, int tag, MPI_Comm comm, + MPI_Request *request); +int MPI_Buffer_attach(void *buffer, int size); +int MPI_Buffer_detach(void *buffer, int *size); +int MPI_Cancel(MPI_Request *request); +int MPI_Close_port(const char *port_name); +int MPI_Comm_accept(const char *port_name, MPI_Info info, int root, MPI_Comm comm, MPI_Comm *newcomm); + +int MPI_Comm_call_errhandler(MPI_Comm comm, int errorcode); +int MPI_Comm_connect(const char *port_name, MPI_Info info, int root, MPI_Comm comm, MPI_Comm *newcomm); +int MPI_Comm_disconnect(MPI_Comm *comm); +int MPI_Comm_idup(MPI_Comm comm, MPI_Comm *newcomm, MPI_Request *request); +int MPI_Comm_dup_with_info(MPI_Comm comm, MPI_Info info, MPI_Comm *newcomm); +int MPI_Dist_graph_create(MPI_Comm comm_old, int n, const int nodes[], const int degrees[], const int targets[], const int weights[], MPI_Info info, int reorder, MPI_Comm * newcomm); +int 
MPI_Dist_graph_create_adjacent(MPI_Comm comm_old, int indegree, const int sources[], const int sourceweights[], int outdegree, const int destinations[], const int destweights[], MPI_Info info, int reorder, MPI_Comm *comm_dist_graph); +int MPI_Dist_graph_neighbors(MPI_Comm comm, int maxindegree, int sources[], int sourceweights[], int maxoutdegree, int destinations[], int destweights[]); +int MPI_Dist_graph_neighbors_count(MPI_Comm comm, int *inneighbors, int *outneighbors, int *weighted); +int MPI_Comm_get_errhandler(MPI_Comm comm, MPI_Errhandler *erhandler); +int MPI_Comm_get_info(MPI_Comm comm, MPI_Info *info_used); +int MPI_Comm_get_name(MPI_Comm comm, char *comm_name, int *resultlen); +int MPI_Comm_get_parent(MPI_Comm *parent); +int MPI_Comm_join(int fd, MPI_Comm *intercomm); +int MPI_Comm_remote_group(MPI_Comm comm, MPI_Group *group); +int MPI_Comm_remote_size(MPI_Comm comm, int *size); +int MPI_Comm_set_info(MPI_Comm comm, MPI_Info info); +int MPI_Comm_set_name(MPI_Comm comm, const char *comm_name); +int MPI_Comm_spawn(const char *command, char *argv[], int maxprocs, MPI_Info info, int root, MPI_Comm comm, MPI_Comm *intercomm, int array_of_errcodes[]); +int MPI_Comm_spawn_multiple(int count, char *array_of_commands[], char **array_of_argv[], const int array_of_maxprocs[], const MPI_Info array_of_info[], int root, MPI_Comm comm, MPI_Comm *intercomm, int array_of_errcodes[]); +int MPI_Comm_test_inter(MPI_Comm comm, int *flag); +int MPI_Exscan(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op, MPI_Comm comm); +int MPI_Iexscan(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op, MPI_Comm comm, MPI_Request *request); + + +int MPI_Igather(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, int recvcount, MPI_Datatype recvtype, int root, MPI_Comm comm, MPI_Request *request); +int MPI_Igatherv(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, const int 
recvcounts[], const int displs[], MPI_Datatype recvtype, int root, MPI_Comm comm, MPI_Request *request); +int MPI_Get_version(int *version, int *subversion); +int MPI_Graph_create(MPI_Comm comm_old, int nnodes, const int index[], const int edges[], int reorder, MPI_Comm *comm_graph); +int MPI_Graph_get(MPI_Comm comm, int maxindex, int maxedges, int index[], int edges[]); +int MPI_Graph_map(MPI_Comm comm, int nnodes, const int index[], const int edges[], int *newrank); +int MPI_Graph_neighbors_count(MPI_Comm comm, int rank, int *nneighbors); +int MPI_Graph_neighbors(MPI_Comm comm, int rank, int maxneighbors, int neighbors[]); +int MPI_Graphdims_get(MPI_Comm comm, int *nnodes, int *nedges); + +int MPI_Group_difference(MPI_Group group1, MPI_Group group2, MPI_Group *newgroup); +int MPI_Group_excl(MPI_Group group, int n, const int ranks[], MPI_Group *newgroup); + +int MPI_Group_intersection(MPI_Group group1, MPI_Group group2, MPI_Group *newgroup); +int MPI_Group_range_excl(MPI_Group group, int n, int ranges[][3], MPI_Group *newgroup); +int MPI_Group_range_incl(MPI_Group group, int n, int ranges[][3], MPI_Group *newgroup); +int MPI_Group_union(MPI_Group group1, MPI_Group group2, MPI_Group *newgroup); +int MPI_Ibsend(const void *buf, int count, MPI_Datatype datatype, int dest, int tag, MPI_Comm comm, MPI_Request *request); + +int MPI_Info_create(MPI_Info *info); +int MPI_Info_delete(MPI_Info info, const char *key); +int MPI_Info_dup(MPI_Info info, MPI_Info *newinfo); + +int MPI_Info_free(MPI_Info *info); +int MPI_Info_get(MPI_Info info, const char *key, int valuelen, char *value, int *flag); +int MPI_Info_get_nkeys(MPI_Info info, int *nkeys); +int MPI_Info_get_nthkey(MPI_Info info, int n, char *key); +int MPI_Info_get_valuelen(MPI_Info info, const char *key, int *valuelen, int *flag); +int MPI_Info_set(MPI_Info info, const char *key, const char *value); +int MPI_Intercomm_create(MPI_Comm local_comm, int local_leader, MPI_Comm bridge_comm, int remote_leader, int tag, 
MPI_Comm *newintercomm); +int MPI_Intercomm_merge(MPI_Comm intercomm, int high, MPI_Comm *newintercomm); +int MPI_Irsend(const void *buf, int count, MPI_Datatype datatype, int dest, int tag, MPI_Comm comm, MPI_Request *request); +int MPI_Issend(const void *buf, int count, MPI_Datatype datatype, int dest, int tag, MPI_Comm comm, MPI_Request *request); +int MPI_Is_thread_main(int *flag); +int MPI_Lookup_name(const char *service_name, MPI_Info info, char *port_name); + + +int MPI_Neighbor_allgather(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, int recvcount, MPI_Datatype recvtype, MPI_Comm comm); +int MPI_Ineighbor_allgather(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, int recvcount, MPI_Datatype recvtype, MPI_Comm comm, MPI_Request *request); +int MPI_Neighbor_allgatherv(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, const int recvcounts[], const int displs[], MPI_Datatype recvtype, MPI_Comm comm); +int MPI_Ineighbor_allgatherv(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, const int recvcounts[], const int displs[], MPI_Datatype recvtype, MPI_Comm comm, MPI_Request *request); +int MPI_Neighbor_alltoall(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, int recvcount, MPI_Datatype recvtype, MPI_Comm comm); +int MPI_Ineighbor_alltoall(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, int recvcount, MPI_Datatype recvtype, MPI_Comm comm, MPI_Request *request); +int MPI_Neighbor_alltoallv(const void *sendbuf, const int sendcounts[], const int sdispls[], MPI_Datatype sendtype, void *recvbuf, const int recvcounts[], const int rdispls[], MPI_Datatype recvtype, MPI_Comm comm); +int MPI_Ineighbor_alltoallv(const void *sendbuf, const int sendcounts[], const int sdispls[], MPI_Datatype sendtype, void *recvbuf, const int recvcounts[], const int rdispls[], MPI_Datatype recvtype, MPI_Comm comm, MPI_Request *request); +int 
MPI_Neighbor_alltoallw(const void *sendbuf, const int sendcounts[], const MPI_Aint sdispls[], const MPI_Datatype sendtypes[], void *recvbuf, const int recvcounts[], const MPI_Aint rdispls[], const MPI_Datatype recvtypes[], MPI_Comm comm); +int MPI_Ineighbor_alltoallw(const void *sendbuf, const int sendcounts[], const MPI_Aint sdispls[], const MPI_Datatype sendtypes[], void *recvbuf, const int recvcounts[], const MPI_Aint rdispls[], const MPI_Datatype recvtypes[], MPI_Comm comm, MPI_Request *request); + +int MPI_Op_commutative(MPI_Op op, int *commute); +int MPI_Open_port(MPI_Info info, char *port_name); + +int MPI_Pack_external(const char datarep[], const void *inbuf, int incount, MPI_Datatype datatype, void *outbuf, MPI_Aint outsize, MPI_Aint *position); +int MPI_Pack_external_size(const char datarep[], int incount, MPI_Datatype datatype, MPI_Aint *size); +int MPI_Publish_name(const char *service_name, MPI_Info info, const char *port_name); +int MPI_Query_thread(int *provided); +int MPI_Recv_init(void *buf, int count, MPI_Datatype datatype, int source, int tag, MPI_Comm comm, MPI_Request *request); +int MPI_Ireduce_scatter(const void *sendbuf, void *recvbuf, const int recvcounts[], MPI_Datatype datatype, MPI_Op op, MPI_Comm comm, MPI_Request *request); +int MPI_Reduce_scatter_block(const void *sendbuf, void *recvbuf, int recvcount, MPI_Datatype datatype, MPI_Op op, MPI_Comm comm); +int MPI_Ireduce_scatter_block(const void *sendbuf, void *recvbuf, int recvcount, MPI_Datatype datatype, MPI_Op op, MPI_Comm comm, MPI_Request *request); + + +int MPI_Request_free(MPI_Request *request); +int MPI_Rsend_init(const void *buf, int count, MPI_Datatype datatype, int dest, int tag, MPI_Comm comm, MPI_Request *request); +int MPI_Iscan(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op, MPI_Comm comm, MPI_Request *request); +int MPI_Iscatter(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, int recvcount, MPI_Datatype recvtype, 
int root, MPI_Comm comm, MPI_Request *request); +int MPI_Iscatterv(const void *sendbuf, const int sendcounts[], const int displs[], MPI_Datatype sendtype, void *recvbuf, int recvcount, MPI_Datatype recvtype, int root, MPI_Comm comm, MPI_Request *request); +int MPI_Send_init(const void *buf, int count, MPI_Datatype datatype, int dest, int tag, MPI_Comm comm, MPI_Request *request); +int MPI_Ssend_init(const void *buf, int count, MPI_Datatype datatype, int dest, int tag, MPI_Comm comm, MPI_Request *request); +int MPI_Ssend(const void *buf, int count, MPI_Datatype datatype, int dest, int tag, MPI_Comm comm); +int MPI_Start(MPI_Request *request); +int MPI_Startall(int count, MPI_Request array_of_requests[]); + + +int MPI_Status_set_cancelled(MPI_Status *status, int flag); +int MPI_Status_set_elements(MPI_Status *status, MPI_Datatype datatype, int count); +int MPI_Status_set_elements_x(MPI_Status *status, MPI_Datatype datatype, MPI_Count count); +int MPI_Test_cancelled(const MPI_Status *status, int *flag); +int MPI_Testsome(int incount, MPI_Request array_of_requests[], int *outcount, int array_of_indices[], MPI_Status array_of_statuses[]); + +int MPI_Type_create_darray(int size, int rank, int ndims, const int gsize_array[], const int distrib_array[], const int darg_array[], const int psize_array[], int order, MPI_Datatype oldtype, MPI_Datatype *newtype); +int MPI_Type_create_f90_complex(int p, int r, MPI_Datatype *newtype); +int MPI_Type_create_f90_integer(int r, MPI_Datatype *newtype); +int MPI_Type_create_f90_real(int p, int r, MPI_Datatype *newtype); +int MPI_Type_create_indexed_block(int count, int blocklength, const int array_of_displacements[], MPI_Datatype oldtype, MPI_Datatype *newtype); +int MPI_Type_create_subarray(int ndims, const int size_array[], const int subsize_array[], const int start_array[], int order, MPI_Datatype oldtype, MPI_Datatype *newtype); +int MPI_Type_delete_attr(MPI_Datatype type, int type_keyval); +int MPI_Type_free_keyval(int 
*type_keyval); + +int MPI_Type_get_attr(MPI_Datatype type, int type_keyval, void *attribute_val, int *flag); +int MPI_Type_get_contents(MPI_Datatype mtype, int max_integers, int max_addresses, int max_datatypes, int array_of_integers[], MPI_Aint array_of_addresses[], MPI_Datatype array_of_datatypes[]); +int MPI_Type_get_envelope(MPI_Datatype type, int *num_integers, int *num_addresses, int *num_datatypes, int *combiner); +int MPI_Type_get_extent_x(MPI_Datatype type, MPI_Count *lb, MPI_Count *extent); +int MPI_Type_get_name(MPI_Datatype type, char *type_name, int *resultlen); +int MPI_Type_get_true_extent(MPI_Datatype datatype, MPI_Aint *true_lb, MPI_Aint *true_extent); +int MPI_Type_get_true_extent_x(MPI_Datatype datatype, MPI_Count *true_lb, MPI_Count *true_extent); +int MPI_Type_match_size(int typeclass, int size, MPI_Datatype *type); +int MPI_Type_set_attr(MPI_Datatype type, int type_keyval, void *attr_val); +int MPI_Type_set_name(MPI_Datatype type, const char *type_name); +int MPI_Unpack(const void *inbuf, int insize, int *position, void *outbuf, int outcount, MPI_Datatype datatype, MPI_Comm comm); +int MPI_Unpublish_name(const char *service_name, MPI_Info info, const char *port_name); +int MPI_Unpack_external (const char datarep[], const void *inbuf, MPI_Aint insize, MPI_Aint *position, void *outbuf, int outcount, MPI_Datatype datatype); +int MPI_Waitsome(int incount, MPI_Request array_of_requests[], int *outcount, int array_of_indices[], MPI_Status array_of_statuses[]); +double MPI_Wtick(void); diff --git a/mpi-proxy-split/mpi-wrappers/mpi_wrappers.cpp b/mpi-proxy-split/mpi-wrappers/mpi_wrappers.cpp index 342e8e8d3..588644e05 100644 --- a/mpi-proxy-split/mpi-wrappers/mpi_wrappers.cpp +++ b/mpi-proxy-split/mpi-wrappers/mpi_wrappers.cpp @@ -28,14 +28,12 @@ #include "jassert.h" #include "jfilesystem.h" #include "protectedfds.h" -#include "record-replay.h" #include "mpi_nextfunc.h" #include "virtual-ids.h" #include "p2p_drain_send_recv.h" #include "mana_header.h" 
#include "seq_num.h" -using namespace dmtcp_mpi; #if 0 DEFINE_FNC(int, Init, (int *) argc, (char ***) argv) @@ -68,14 +66,17 @@ USER_DEFINED_WRAPPER(int, Init, (int *) argc, (char ***) argv) { JUMP_TO_LOWER_HALF(lh_info.fsaddr); retval = NEXT_FUNC(Init)(argc, argv); - // Create a duplicate of MPI_COMM_WORLD for internal use. - NEXT_FUNC(Comm_dup)(MPI_COMM_WORLD, &g_world_comm); + RETURN_TO_UPPER_HALF(); + init_lh_constants_map(); + JUMP_TO_LOWER_HALF(lh_info.fsaddr); + // Create a duplicate of REAL_CONSTANT(COMM_WORLD) for internal use. + NEXT_FUNC(Comm_dup)(REAL_CONSTANT(COMM_WORLD), &g_world_comm); RETURN_TO_UPPER_HALF(); recordPostMpiInitMaps(); + init_comm_world(); g_world_comm = ADD_NEW_COMM(g_world_comm); - LOG_CALL(restoreComms, Comm_dup, MPI_COMM_WORLD, g_world_comm); initialize_drain_send_recv(); DMTCP_PLUGIN_ENABLE_CKPT(); return retval; @@ -93,14 +94,17 @@ USER_DEFINED_WRAPPER(int, Init_thread, (int *) argc, (char ***) argv, JUMP_TO_LOWER_HALF(lh_info.fsaddr); retval = NEXT_FUNC(Init_thread)(argc, argv, required, provided); - // Create a duplicate of MPI_COMM_WORLD for internal use. - NEXT_FUNC(Comm_dup)(MPI_COMM_WORLD, &g_world_comm); + RETURN_TO_UPPER_HALF(); + init_lh_constants_map(); + JUMP_TO_LOWER_HALF(lh_info.fsaddr); + // Create a duplicate of REAL_CONSTANT(COMM_WORLD) for internal use. 
+ NEXT_FUNC(Comm_dup)(REAL_CONSTANT(COMM_WORLD), &g_world_comm); RETURN_TO_UPPER_HALF(); recordPostMpiInitMaps(); + init_comm_world(); g_world_comm = ADD_NEW_COMM(g_world_comm); - LOG_CALL(restoreComms, Comm_dup, MPI_COMM_WORLD, g_world_comm); initialize_drain_send_recv(); DMTCP_PLUGIN_ENABLE_CKPT(); return retval; @@ -136,9 +140,15 @@ USER_DEFINED_WRAPPER(int, Finalize, (void)) return MPI_SUCCESS; } +#if defined(MPICH) USER_DEFINED_WRAPPER(int, Get_count, (const MPI_Status *) status, (MPI_Datatype) datatype, (int *) count) +#else +USER_DEFINED_WRAPPER(int, Get_count, + (MPI_Status *) status, (MPI_Datatype) datatype, + (int *) count) +#endif // defined(MPICH) { int retval; DMTCP_PLUGIN_DISABLE_CKPT(); @@ -177,7 +187,6 @@ USER_DEFINED_WRAPPER(int, Get_address, (const void *) location, // FOR DEBUGGING ONLY: // This defines a call to MPI_MANA_Internal in the lower half, which // is especially useful in debugging restart. It is called -// from mpi-proxy-split/mpi_plugin.cpp, just before doing record-replay. // In mpi-proxy-split/lower-half, redefine MPI_MANA_Internal() // to do whatever is desired. 
Then do: // rm bin/lh_proxy @@ -210,7 +219,11 @@ PMPI_IMPL(double, MPI_Wtime, void) PMPI_IMPL(int, MPI_Initialized, int *flag) PMPI_IMPL(int, MPI_Init_thread, int *argc, char ***argv, int required, int *provided) +#if defined(MPICH) PMPI_IMPL(int, MPI_Get_count, const MPI_Status *status, MPI_Datatype datatype, int *count) +#else +PMPI_IMPL(int, MPI_Get_count, MPI_Status *status, MPI_Datatype datatype, +#endif // defined(MPICH) PMPI_IMPL(int, MPI_Get_library_version, char *version, int *resultlen) PMPI_IMPL(int, MPI_Get_address, const void *location, MPI_Aint *address) diff --git a/mpi-proxy-split/mpi_plugin.cpp b/mpi-proxy-split/mpi_plugin.cpp index 0141322e8..4b2d5d0ec 100644 --- a/mpi-proxy-split/mpi_plugin.cpp +++ b/mpi-proxy-split/mpi_plugin.cpp @@ -46,7 +46,6 @@ #include "split_process.h" #include "p2p_log_replay.h" #include "p2p_drain_send_recv.h" -#include "record-replay.h" #include "seq_num.h" #include "mpi_nextfunc.h" #include "virtual-ids.h" @@ -72,7 +71,9 @@ extern CartesianProperties g_cartesian_properties; #endif extern ManaHeader g_mana_header; +#if defined(MPICH) extern std::unordered_map g_params_map; +#endif // defined(MPICH) constexpr const char *MANA_FILE_REGEX_ENV = "MANA_FILE_REGEX"; constexpr const char *MANA_SEGV_DEBUG_LOOP = "MANA_SEGV_DEBUG_LOOP"; @@ -881,6 +882,7 @@ save_mana_header(const char *filename) close(fd); } +#if defined(MPICH) const char * get_mpi_file_filename() { @@ -986,6 +988,7 @@ restore_mpi_files(const char *filename) } } +#endif // defined(MPICH) #ifdef SINGLE_CART_REORDER const char * @@ -1110,6 +1113,11 @@ mpi_plugin_event_hook(DmtcpEvent_t event, DmtcpEventData_t *data) } case DMTCP_EVENT_PRECHECKPOINT: { + // FIXME: We want update_descriptors to capture all userland descriptors, + // but not any internal descriptors. A function here was creating an + // errant descriptor, but that should be fixed by PR #348. 
+ update_descriptors(); + dmtcp_global_barrier("MPI:recordMpiInitMaps"); recordMpiInitMaps(); recordOpenFds(); dmtcp_local_barrier("MPI:GetLocalLhMmapList"); @@ -1168,18 +1176,21 @@ mpi_plugin_event_hook(DmtcpEvent_t event, DmtcpEventData_t *data) mana_state = RESTART_REPLAY; #ifdef SINGLE_CART_REORDER dmtcp_global_barrier("MPI:setCartesianCommunicator"); - // record-replay.cpp setCartesianCommunicator(lh_info.getCartesianCommunicatorFptr); #endif - dmtcp_global_barrier("MPI:restoreMpiLogState"); - restoreMpiLogState(); // record-replay.cpp - dmtcp_global_barrier("MPI:record-replay.cpp-void"); + dmtcp_global_barrier("MPI:reconstruct_with_descriptors"); + reconstruct_with_descriptors(); + // FIXME: I place reconstruct_with_descriptors in the analogous place to + // restoreMpiLogState. + dmtcp_global_barrier("MPI:virtual-ids.cpp-void"); replayMpiP2pOnRestart(); // p2p_log_replay.cpp dmtcp_local_barrier("MPI:p2p_log_replay.cpp-void"); seq_num_reset(RESTART); dmtcp_local_barrier("MPI:seq_num_reset"); +#if defined(MPICH) const char *file = get_mpi_file_filename(); restore_mpi_files(file); +#endif // defined(MPICH) dmtcp_local_barrier("MPI:Restore-MPI-Files"); mana_state = RUNNING; break; diff --git a/mpi-proxy-split/p2p_drain_send_recv.cpp b/mpi-proxy-split/p2p_drain_send_recv.cpp index 84a29b3bd..b19d720f0 100644 --- a/mpi-proxy-split/p2p_drain_send_recv.cpp +++ b/mpi-proxy-split/p2p_drain_send_recv.cpp @@ -345,7 +345,7 @@ localRankToGlobalRank(int localRank, MPI_Comm localComm) MPI_Group worldGroup, localGroup; MPI_Comm realComm = VIRTUAL_TO_REAL_COMM(localComm); JUMP_TO_LOWER_HALF(lh_info.fsaddr); - NEXT_FUNC(Comm_group)(MPI_COMM_WORLD, &worldGroup); + NEXT_FUNC(Comm_group)(REAL_CONSTANT(COMM_WORLD), &worldGroup); NEXT_FUNC(Comm_group)(realComm, &localGroup); NEXT_FUNC(Group_translate_ranks)(localGroup, 1, &localRank, worldGroup, &worldRank); diff --git a/mpi-proxy-split/record-replay.cpp b/mpi-proxy-split/record-replay.cpp deleted file mode 100644 index 
5ef79e303..000000000 --- a/mpi-proxy-split/record-replay.cpp +++ /dev/null @@ -1,1134 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2019-2021 by Gene Cooperman, Rohan Garg, Yao Xu * - * gene@ccs.neu.edu, rohgarg@ccs.neu.edu, xu.yao1@northeastern.edu * - * * - * This file is part of DMTCP. * - * * - * DMTCP is free software: you can redistribute it and/or * - * modify it under the terms of the GNU Lesser General Public License as * - * published by the Free Software Foundation, either version 3 of the * - * License, or (at your option) any later version. * - * * - * DMTCP is distributed in the hope that it will be useful, * - * but WITHOUT ANY WARRANTY; without even the implied warranty of * - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * - * GNU Lesser General Public License for more details. * - * * - * You should have received a copy of the GNU Lesser General Public * - * License in the files COPYING and COPYING.LESSER. If not, see * - * . 
* - ****************************************************************************/ - -#ifdef SINGLE_CART_REORDER -#include -#include "cartesian.h" -#endif - -#include -#include "jassert.h" -#include "jconvert.h" - -#include "record-replay.h" -#include "virtual-ids.h" -#include "p2p_log_replay.h" - -using namespace dmtcp_mpi; - -static int restoreCommSplit(MpiRecord& rec); -static int restoreCommSplitType(MpiRecord& rec); -static int restoreCommDup(MpiRecord& rec); -static int restoreCommCreate(MpiRecord& rec); -static int restoreCommCreateGroup(MpiRecord& rec); -static int restoreCommErrHandler(MpiRecord& rec); -static int restoreCommFree(MpiRecord& rec); -static int restoreAttrPut(MpiRecord& rec); -static int restoreAttrDelete(MpiRecord& rec); -static int restoreCommCreateKeyval(MpiRecord& rec); -static int restoreCommFreeKeyval(MpiRecord& rec); - -static int restoreCommGroup(MpiRecord& rec); -static int restoreGroupFree(MpiRecord& rec); -static int restoreGroupIncl(MpiRecord& rec); - -static int restoreTypeContiguous(MpiRecord& rec); -static int restoreTypeCommit(MpiRecord& rec); -static int restoreTypeHVector(MpiRecord& rec); -static int restoreTypeIndexed(MpiRecord& rec); -static int restoreTypeHIndexed(MpiRecord& rec); -static int restoreTypeFree(MpiRecord& rec); -static int restoreTypeCreateStruct(MpiRecord& rec); -static int restoreTypeDup(MpiRecord& rec); -static int restoreTypeCreateResized(MpiRecord& rec); - -static int restoreCartCreate(MpiRecord& rec); -static int restoreCartMap(MpiRecord& rec); -static int restoreCartShift(MpiRecord& rec); -static int restoreCartSub(MpiRecord& rec); - -static int restoreOpCreate(MpiRecord& rec); -static int restoreOpFree(MpiRecord& rec); - -static int restoreIbcast(MpiRecord& rec); -static int restoreIreduce(MpiRecord& rec); -static int restoreIbarrier(MpiRecord& rec); - -#ifdef SINGLE_CART_REORDER -void create_cartesian_info_mpi_datatype(MPI_Datatype *cidt); -void load_restart_cartesian_mapping(CartesianProperties *cp, 
- CartesianInfo *ci, - CartesianInfo restart_mapping[]); -void compare_comm_old_and_cart_cartesian_mapping( - CartesianProperties *cp, - CartesianInfo checkpoint_mapping[], - CartesianInfo restart_mapping[], - int *comm_old_ranks_order, - int *comm_cart_ranks_order); -void create_comm_old_communicator(CartesianProperties *cp, - int *comm_old_ranks_order); -void create_comm_cart_communicator(CartesianProperties *cp, - int *comm_cart_ranks_order); -#endif - -void -restoreMpiLogState() -{ - JASSERT(RESTORE_MPI_STATE() == MPI_SUCCESS) - .Text("Failed to restore MPI state"); -} - -int -dmtcp_mpi::restoreComms(MpiRecord &rec) -{ - int rc = -1; - JTRACE("Restoring MPI communicators"); - switch (rec.getType()) { - case GENERATE_ENUM(Comm_split): - JTRACE("restoreCommSplit"); - rc = restoreCommSplit(rec); - break; - case GENERATE_ENUM(Comm_split_type): - JTRACE("restoreCommSplitType"); - rc = restoreCommSplitType(rec); - break; - case GENERATE_ENUM(Comm_dup): - JTRACE("restoreCommDup"); - rc = restoreCommDup(rec); - break; - case GENERATE_ENUM(Comm_create): - JTRACE("restoreCommCreate"); - rc = restoreCommCreate(rec); - break; - case GENERATE_ENUM(Comm_create_group): - JTRACE("restoreCommCreateGroup"); - rc = restoreCommCreateGroup(rec); - break; - case GENERATE_ENUM(Comm_set_errhandler): - JTRACE("restoreCommErrHandler"); - rc = restoreCommErrHandler(rec); - break; - case GENERATE_ENUM(Comm_free): - JTRACE("restoreCommFree"); - rc = restoreCommFree(rec); - break; - case GENERATE_ENUM(Attr_put): - JTRACE("restoreAtrrPut"); - rc = restoreAttrPut(rec); - break; - case GENERATE_ENUM(Attr_delete): - JTRACE("restoreAtrrDelete"); - rc = restoreAttrDelete(rec); - break; - case GENERATE_ENUM(Comm_create_keyval): - JTRACE("restoreCommCreateKeyval"); - rc = restoreCommCreateKeyval(rec); - break; - case GENERATE_ENUM(Comm_free_keyval): - JTRACE("restoreCommFreeKeyval"); - rc = restoreCommFreeKeyval(rec); - break; - default: - JWARNING(false)(rec.getType()).Text("Unknown call"); - 
break; - } - return rc; -} - -int -dmtcp_mpi::restoreGroups(MpiRecord &rec) -{ - int rc = -1; - JTRACE("Restoring MPI groups"); - switch (rec.getType()) { - case GENERATE_ENUM(Comm_group): - JTRACE("restoreCommGroup"); - rc = restoreCommGroup(rec); - break; - case GENERATE_ENUM(Group_free): - JTRACE("restoreGroupFree"); - rc = restoreGroupFree(rec); - break; - case GENERATE_ENUM(Group_incl): - JTRACE("restoreGroupIncl"); - rc = restoreGroupIncl(rec); - break; - default: - JWARNING(false)(rec.getType()).Text("Unknown call"); - break; - } - return rc; -} - -int -dmtcp_mpi::restoreTypes(MpiRecord &rec) -{ - int rc = -1; - JTRACE("Restoring MPI derived types"); - switch (rec.getType()) { - case GENERATE_ENUM(Type_contiguous): - JTRACE("restoreTypeContiguous"); - rc = restoreTypeContiguous(rec); - break; - case GENERATE_ENUM(Type_commit): - JTRACE("restoreTypeCommit"); - rc = restoreTypeCommit(rec); - break; - case GENERATE_ENUM(Type_hvector): - JTRACE("restoreTypeHVector"); - rc = restoreTypeHVector(rec); - break; - case GENERATE_ENUM(Type_indexed): - JTRACE("restoreTypeIndexed"); - rc = restoreTypeIndexed(rec); - break; - case GENERATE_ENUM(Type_free): - JTRACE("restoreTypeFree"); - rc = restoreTypeFree(rec); - break; - case GENERATE_ENUM(Type_create_struct): - JTRACE("restoreTypeCreateStruct"); - rc = restoreTypeCreateStruct(rec); - break; - case GENERATE_ENUM(Type_hindexed): - JTRACE("restoreTypeHIndexed"); - rc = restoreTypeHIndexed(rec); - break; - case GENERATE_ENUM(Type_dup): - JTRACE("restoreTypeDup"); - rc = restoreTypeDup(rec); - break; - case GENERATE_ENUM(Type_create_resized): - JTRACE("restoreTypeCreateResized"); - rc = restoreTypeCreateResized(rec); - break; - default: - JWARNING(false)(rec.getType()).Text("Unknown call"); - break; - } - return rc; -} - -int -dmtcp_mpi::restoreCarts(MpiRecord &rec) -{ - int rc = -1; - JTRACE("Restoring MPI cartesian"); - switch (rec.getType()) { - case GENERATE_ENUM(Cart_create): - JTRACE("restoreCartCreate"); - rc = 
restoreCartCreate(rec); - break; - case GENERATE_ENUM(Cart_map): - JTRACE("restoreCartMap"); - rc = restoreCartMap(rec); - break; - case GENERATE_ENUM(Cart_shift): - JTRACE("restoreCartShift"); - rc = restoreCartShift(rec); - break; - case GENERATE_ENUM(Cart_sub): - JTRACE("restoreCartSub"); - rc = restoreCartSub(rec); - break; - default: - JWARNING(false)(rec.getType()).Text("Unknown call"); - break; - } - return rc; -} - -int -dmtcp_mpi::restoreOps(MpiRecord &rec) -{ - int rc = -1; - JTRACE("Restoring MPI Ops"); - switch (rec.getType()) { - case GENERATE_ENUM(Op_create): - JTRACE("restoreOpCreate"); - rc = restoreOpCreate(rec); - break; - case GENERATE_ENUM(Op_free): - JTRACE("restoreOpFree"); - rc = restoreOpFree(rec); - break; - default: - JWARNING(false)(rec.getType()).Text("Unknown call"); - break; - } - return rc; -} - -int -dmtcp_mpi::restoreRequests(MpiRecord &rec) -{ - int rc = -1; - JTRACE("Restoring MPI Requests"); - switch (rec.getType()) { - case GENERATE_ENUM(Ibarrier): - JTRACE("restoreIbarrier"); - rc = restoreIbarrier(rec); - break; - case GENERATE_ENUM(Ireduce): - JTRACE("restoreIreduce"); - rc = restoreIreduce(rec); - break; - case GENERATE_ENUM(Ibcast): - JTRACE("restoreIbcast"); - rc = restoreIbcast(rec); - break; - default: - JWARNING(false)(rec.getType()).Text("Unknown call"); - break; - } - return rc; -} - -static int -restoreCommSplit(MpiRecord& rec) -{ - int retval; - MPI_Comm comm = rec.args(0); - int color = rec.args(1); - int key = rec.args(2); - MPI_Comm newcomm = MPI_COMM_NULL; - retval = FNC_CALL(Comm_split, rec)(comm, color, key, &newcomm); - if (retval == MPI_SUCCESS) { - MPI_Comm virtComm = rec.args(3); - UPDATE_COMM_MAP(virtComm, newcomm); - } - return retval; -} - -static int -restoreCommSplitType(MpiRecord& rec) -{ - int retval; - MPI_Comm comm = rec.args(0); - int split_type = rec.args(1); - int key = rec.args(2); - MPI_Info inf = rec.args(3); - MPI_Comm newcomm = MPI_COMM_NULL; - retval = FNC_CALL(Comm_split_type, rec)(comm, 
split_type, key, inf, &newcomm); - if (retval == MPI_SUCCESS) { - MPI_Comm virtComm = rec.args(4); - UPDATE_COMM_MAP(virtComm, newcomm); - } - return retval; -} - -static int -restoreCommDup(MpiRecord& rec) -{ - int retval; - MPI_Comm comm = rec.args(0); - MPI_Comm newcomm = MPI_COMM_NULL; - retval = FNC_CALL(Comm_dup, rec)(comm, &newcomm); - if (retval == MPI_SUCCESS) { - MPI_Comm virtComm = rec.args(1); - UPDATE_COMM_MAP(virtComm, newcomm); - } - return retval; -} - -static int -restoreCommCreate(MpiRecord& rec) -{ - int retval; - MPI_Comm comm = rec.args(0); - MPI_Group group = rec.args(1); - MPI_Comm newcomm = MPI_COMM_NULL; - retval = FNC_CALL(Comm_create, rec)(comm, group, &newcomm); - if (retval == MPI_SUCCESS) { - MPI_Comm oldcomm = rec.args(2); - UPDATE_COMM_MAP(oldcomm, newcomm); - } - return retval; -} - -static int -restoreCommCreateGroup(MpiRecord& rec) -{ - int retval; - MPI_Comm comm = rec.args(0); - MPI_Group group = rec.args(1); - int tag = rec.args(2); - MPI_Comm newcomm = MPI_COMM_NULL; - retval = FNC_CALL(Comm_create_group, rec)(comm, group, tag, &newcomm); - if (retval == MPI_SUCCESS) { - MPI_Comm oldcomm = rec.args(3); - UPDATE_COMM_MAP(oldcomm, newcomm); - } - return retval; -} - -static int -restoreCommErrHandler(MpiRecord& rec) -{ - int retval; - MPI_Comm comm = rec.args(0); - MPI_Errhandler errhandler = rec.args(1); - retval = FNC_CALL(Comm_set_errhandler, rec)(comm, errhandler); - JWARNING(retval == MPI_SUCCESS)(comm).Text("Error restoring MPI errhandler"); - return retval; -} - -static int -restoreCommFree(MpiRecord& rec) -{ - int retval; - MPI_Comm comm = rec.args(0); - retval = FNC_CALL(Comm_free, rec)(&comm); - JWARNING(retval == MPI_SUCCESS)(comm).Text("Error freeing MPI comm"); - if (retval == MPI_SUCCESS) { - // See mpi_comm_wrappers.cpp:Comm_free - // NOTE: We cannot remove the old comm from the map, since - // we'll need to replay this call to reconstruct any other comms that - // might have been created using this comm. 
- // - // MPI_Comm oldcomm = REMOVE_OLD_COMM(comm); - } - return retval; -} - -static int -restoreAttrPut(MpiRecord& rec) -{ - int retval; - MPI_Comm comm = rec.args(0); - int key = rec.args(1); - void *val = rec.args(2); - retval = FNC_CALL(Attr_put, rec)(comm, key, val); - JWARNING(retval == MPI_SUCCESS)(comm) - .Text("Error restoring MPI attribute-put"); - return retval; -} - -static int -restoreAttrDelete(MpiRecord& rec) -{ - int retval; - MPI_Comm comm = rec.args(0); - int key = rec.args(1); - retval = FNC_CALL(Attr_delete, rec)(comm, key); - JWARNING(retval == MPI_SUCCESS)(comm).Text("Error deleting MPI attribute"); - return retval; -} - -static int -restoreCommCreateKeyval(MpiRecord& rec) -{ - int retval; - void *cfn_tmp = rec.args(0); - void *dfn_tmp = rec.args(1); - MPI_Comm_copy_attr_function *cfn = (MPI_Comm_copy_attr_function*) cfn_tmp; - MPI_Comm_delete_attr_function *dfn = (MPI_Comm_delete_attr_function*) dfn_tmp; - int newkey = 0; - void *extra_state = rec.args(3); - retval = FNC_CALL(Comm_create_keyval, rec)(cfn, dfn, &newkey, extra_state); - if (retval == MPI_SUCCESS) { - int oldkey = rec.args(2); - UPDATE_COMM_KEYVAL_MAP(oldkey, newkey); - } - return retval; -} - -static int -restoreCommFreeKeyval(MpiRecord& rec) -{ - int retval; - int key = rec.args(0); - retval = FNC_CALL(Comm_free_keyval, rec)(&key); - JWARNING(retval == MPI_SUCCESS)(key).Text("Error deleting MPI Comm Keyval"); - // See mpi_comm_wrappers.cpp:Comm_free_keyval - // We don't remove item from virtual-id tables - return retval; -} - -static int -restoreCommGroup(MpiRecord& rec) -{ - int retval; - MPI_Comm comm = rec.args(0); - MPI_Group newgroup = MPI_GROUP_NULL; - retval = FNC_CALL(Comm_group, rec)(comm, &newgroup); - JWARNING(retval == MPI_SUCCESS)(comm).Text("Error restoring MPI comm group"); - if (retval == MPI_SUCCESS) { - MPI_Group oldgroup = rec.args(1); - UPDATE_GROUP_MAP(oldgroup, newgroup); - } - return retval; -} - -static int -restoreGroupFree(MpiRecord& rec) -{ - int 
retval; - MPI_Group group = rec.args(0); - retval = FNC_CALL(Group_free, rec)(&group); - JWARNING(retval == MPI_SUCCESS)(group).Text("Error restoring MPI group free"); - if (retval == MPI_SUCCESS) { - // See mpi_group_wrappers.cpp:Group_free - // NOTE: We cannot remove the old group, since we'll need - // to replay this call to reconstruct any comms that might - // have been created using this group. - // - // REMOVE_OLD_GROUP(group); - } - return retval; -} - - -static int -restoreGroupIncl(MpiRecord& rec) -{ - int retval; - MPI_Group group = rec.args(0); - int n = rec.args(1); - int *ranks = rec.args(2); - MPI_Group newgroup = MPI_GROUP_NULL; - retval = FNC_CALL(Group_incl, rec)(group, n, ranks, &newgroup); - JWARNING(retval == MPI_SUCCESS)(group).Text("Error restoring MPI group incl"); - if (retval == MPI_SUCCESS) { - MPI_Group oldgroup = rec.args(3); - UPDATE_GROUP_MAP(oldgroup, newgroup); - } - return retval; -} - -static int -restoreTypeContiguous(MpiRecord& rec) -{ - int retval; - int count = rec.args(0); - MPI_Datatype oldtype = rec.args(1); - MPI_Datatype newtype; - retval = FNC_CALL(Type_contiguous, rec)(count, oldtype, &newtype); - if (retval == MPI_SUCCESS) { - MPI_Datatype virtType = rec.args(2); - UPDATE_TYPE_MAP(virtType, newtype); - } - return retval; -} - -static int -restoreTypeCommit(MpiRecord& rec) -{ - int retval; - MPI_Datatype type = rec.args(0); - retval = FNC_CALL(Type_commit, rec)(&type); - JWARNING(retval == MPI_SUCCESS)(type).Text("Could not commit MPI datatype"); - return retval; -} - -static int -restoreTypeHVector(MpiRecord& rec) -{ - int retval; - int count = rec.args(0); - int blocklength = rec.args(1); - MPI_Aint stride = rec.args(2); - MPI_Datatype oldtype = rec.args(3); - MPI_Datatype newtype = MPI_DATATYPE_NULL; - retval = FNC_CALL(Type_hvector, rec)(count, blocklength, - stride, oldtype, &newtype); - JWARNING(retval == MPI_SUCCESS)(oldtype) - .Text("Could not restore MPI hvector datatype"); - if (retval == MPI_SUCCESS) { - 
MPI_Datatype virtType = rec.args(4); - UPDATE_TYPE_MAP(virtType, newtype); - } - return retval; -} - -static int -restoreTypeHIndexed(MpiRecord& rec) -{ - int retval; - int count = rec.args(0); - int *bs = rec.args(1); - MPI_Aint *ds = rec.args(2); - MPI_Datatype oldtype = rec.args(3); - MPI_Datatype newtype = MPI_DATATYPE_NULL; - retval = FNC_CALL(Type_hindexed, rec)(count, bs, ds, oldtype, &newtype); - JWARNING(retval == MPI_SUCCESS)(oldtype) - .Text("Could not restore MPI hvector datatype"); - if (retval == MPI_SUCCESS) { - MPI_Datatype virtType = rec.args(4); - UPDATE_TYPE_MAP(virtType, newtype); - } - return retval; -} - -static int -restoreTypeDup(MpiRecord& rec) -{ - int retval; - MPI_Datatype oldtype = rec.args(0); - MPI_Datatype newtype = MPI_DATATYPE_NULL; - retval = FNC_CALL(Type_dup, rec)(oldtype, &newtype); - JWARNING(retval == MPI_SUCCESS)(oldtype) - .Text("Could not restore MPI hvector datatype"); - if (retval == MPI_SUCCESS) { - MPI_Datatype virtType = rec.args(1); - UPDATE_TYPE_MAP(virtType, newtype); - } - return retval; -} - -static int -restoreTypeCreateResized(MpiRecord& rec) -{ - int retval; - MPI_Datatype oldtype = rec.args(0); - MPI_Aint lb = rec.args(1); - MPI_Aint ext = rec.args(2); - MPI_Datatype newtype = MPI_DATATYPE_NULL; - retval = FNC_CALL(Type_create_resized, rec)(oldtype, lb, ext, &newtype); - JWARNING(retval == MPI_SUCCESS)(oldtype) - .Text("Could not restore MPI hvector datatype"); - if (retval == MPI_SUCCESS) { - MPI_Datatype virtType = rec.args(3); - UPDATE_TYPE_MAP(virtType, newtype); - } - return retval; -} - -void MpiRecordReplay::printRecords(bool print) -{ - JNOTE("Printing _records"); - for(MpiRecord* record : _records) { - int fnc_idx = record->getType(); - if (print) { - printf("%s\n", MPI_Fnc_strings[fnc_idx]); - } else { - JNOTE("") (MPI_Fnc_strings[fnc_idx]); - } - } -} - -static int -restoreTypeIndexed(MpiRecord& rec) -{ - int retval; - int count = rec.args(0); - int *blocklengths = rec.args(1); - int *displs = 
rec.args(2); - MPI_Datatype oldtype = rec.args(3); - MPI_Datatype newtype = MPI_DATATYPE_NULL; - retval = FNC_CALL(Type_indexed, rec)(count, blocklengths, - displs, oldtype, &newtype); - JWARNING(retval == MPI_SUCCESS)(oldtype) - .Text("Could not restore MPI indexed datatype"); - if (retval == MPI_SUCCESS) { - MPI_Datatype virtType = rec.args(4); - UPDATE_TYPE_MAP(virtType, newtype); - } - return retval; -} - -static int -restoreTypeFree(MpiRecord& rec) -{ - int retval; - MPI_Datatype type = rec.args(0); - retval = FNC_CALL(Type_free, rec)(&type); - JWARNING(retval == MPI_SUCCESS)(type).Text("Could not free MPI datatype"); - if (retval == MPI_SUCCESS) { - // See mpi_type_wrappers.cpp:Type_free - // NOTE: We cannot remove the old type from the map, since - // we'll need to replay this call to reconstruct any other type that - // might have been created using this type. - // - // MPI_Datatype realType = REMOVE_OLD_TYPE(type); - } - return retval; -} - -static int -restoreTypeCreateStruct(MpiRecord& rec) -{ - int retval; - int count = rec.args(0); - int *blocklengths = rec.args(1); - MPI_Aint *displs = rec.args(2); - MPI_Datatype *types = rec.args(3); - MPI_Datatype newtype = MPI_DATATYPE_NULL; - retval = FNC_CALL(Type_create_struct, rec)(count, blocklengths, - displs, types, &newtype); - JWARNING(retval == MPI_SUCCESS)(types) - .Text("Could not restore MPI struct datatype"); - if (retval == MPI_SUCCESS) { - MPI_Datatype virtType = rec.args(4); - UPDATE_TYPE_MAP(virtType, newtype); - } - return retval; -} - -#ifdef SINGLE_CART_REORDER -int -load_cartesian_properties(const char *filename, CartesianProperties *cp) -{ - int fd = open(filename, O_RDONLY); - if (fd == -1) { - return -1; - } - read(fd, &cp->comm_old_size, sizeof(int)); - read(fd, &cp->comm_cart_size, sizeof(int)); - read(fd, &cp->comm_old_rank, sizeof(int)); - read(fd, &cp->comm_cart_rank, sizeof(int)); - read(fd, &cp->reorder, sizeof(int)); - read(fd, &cp->ndims, sizeof(int)); - int array_size = 
sizeof(int) * cp->ndims; - read(fd, cp->coordinates, array_size); - read(fd, cp->dimensions, array_size); - read(fd, cp->periods, array_size); - close(fd); - return 0; -} - -void -load_checkpoint_cartesian_mapping(CartesianProperties *cp, - CartesianInfo checkpoint_mapping[]) -{ - int ndims = cp->ndims; - int comm_old_size = cp->comm_old_size; - for (int i = 0; i < comm_old_size; i++) { - CartesianProperties cp; - dmtcp::ostringstream o; - o << "./ckpt_rank_" << i << "/cartesian.info"; - if (load_cartesian_properties(o.str().c_str(), &cp) == 0) { - checkpoint_mapping[i].comm_old_rank = cp.comm_old_rank; - checkpoint_mapping[i].comm_cart_rank = cp.comm_cart_rank; - for (int j = 0; j < ndims; j++) { - checkpoint_mapping[i].coordinates[j] = cp.coordinates[j]; - } - } - } -} - -// Prior to checkpoint we will use the normal variable names, and -// after restart we will use the '_prime' suffix with variable names. -MPI_Comm comm_cart; -MPI_Comm *comm_cart_prime; -MPI_Comm comm_old; -MPI_Comm comm_old_prime; - -void -create_cartesian_info_mpi_datatype(MPI_Datatype *cidt) -{ - int retval = -1; - int lengths[3] = { 1, 1, MAX_CART_PROP_SIZE }; - - // Calculate displacements - // In C, by default padding can be inserted between fields. MPI_Get_address - // will allow to get the address of each struct field and calculate the - // corresponding displacement relative to that struct base address. The - // displacements thus calculated will therefore include padding if any. 
- MPI_Aint base_address; - MPI_Aint displacements[3]; - CartesianInfo dummy_ci; - - JUMP_TO_LOWER_HALF(lh_info.fsaddr); - - retval = NEXT_FUNC(Get_address)(&dummy_ci, &base_address); - retval += NEXT_FUNC(Get_address)(&dummy_ci.comm_old_rank, &displacements[0]); - retval += NEXT_FUNC(Get_address)(&dummy_ci.comm_cart_rank, &displacements[1]); - retval += NEXT_FUNC(Get_address)(&dummy_ci.coordinates[0], &displacements[2]); - - displacements[0] = NEXT_FUNC(Aint_diff)(displacements[0], base_address); - displacements[1] = NEXT_FUNC(Aint_diff)(displacements[1], base_address); - displacements[2] = NEXT_FUNC(Aint_diff)(displacements[2], base_address); - - MPI_Datatype types[3] = { MPI_INT, MPI_INT, MPI_INT }; - - retval = - NEXT_FUNC(Type_create_struct)(3, lengths, displacements, types, cidt); - JASSERT(retval == MPI_SUCCESS) - .Text("Failed to create MPI datatype for struct."); - - retval = NEXT_FUNC(Type_commit)(cidt); - JASSERT(retval == MPI_SUCCESS) - .Text("Failed to commit MPI datatype for struct."); - - RETURN_TO_UPPER_HALF(); -} - -void -load_restart_cartesian_mapping(CartesianProperties *cp, CartesianInfo *ci, - CartesianInfo restart_mapping[]) -{ - int retval = -1; - MPI_Datatype ci_type; - create_cartesian_info_mpi_datatype(&ci_type); - // Root process will collect the cartesian info and all other process will - // send their cartesian info - if (ci->comm_old_rank == 0) { - retval = NEXT_FUNC(Gather)(ci, 1, ci_type, restart_mapping, 1, ci_type, 0, - comm_old_prime); - } else { - retval = - NEXT_FUNC(Gather)(ci, 1, ci_type, NULL, 0, ci_type, 0, comm_old_prime); - } - JASSERT(retval == MPI_SUCCESS) - .Text("Failed to load restart cartesian mapping."); -} - -void -compare_comm_old_and_cart_cartesian_mapping(CartesianProperties *cp, - CartesianInfo checkpoint_mapping[], - CartesianInfo restart_mapping[], - int *comm_old_ranks_order, - int *comm_cart_ranks_order) -{ - for (int i = 0; i < cp->comm_old_size; i++) { - CartesianInfo *checkpoint = &checkpoint_mapping[i]; 
- // Iterate through each entry in the array and find out - // the rank of the process whose coordinates are equal to - // checkpoint.coordinates - for (int j = 0; j < cp->comm_old_size; j++) { - CartesianInfo *restart = &restart_mapping[j]; - int sum = 0; - for (int k = 0; k < cp->ndims; k++) { - if (checkpoint->coordinates[k] == restart->coordinates[k]) { - sum += 1; - } - } - if (sum == cp->ndims) { - comm_old_ranks_order[i] = checkpoint->comm_old_rank; - comm_cart_ranks_order[i] = checkpoint->comm_cart_rank; - break; - } - } - } -} - -void -create_comm_old_communicator(CartesianProperties *cp, int *comm_old_ranks_order) -{ - int retval = -1; - MPI_Group comm_old_group_prime, comm_old_group; - MPI_Comm_group(comm_old_prime, &comm_old_group_prime); - retval = MPI_Group_incl(comm_old_group_prime, cp->comm_old_size, - comm_old_ranks_order, &comm_old_group); - JASSERT(retval == MPI_SUCCESS) - .Text("Failed to create group."); - retval = - MPI_Comm_create_group(comm_old_prime, comm_old_group, 121, &comm_old); - JASSERT(retval == MPI_SUCCESS) - .Text("Failed to create communicator."); -} - -void -create_comm_cart_communicator(CartesianProperties *cp, int *comm_cart_ranks_order) -{ - int retval = -1; - MPI_Group comm_cart_group_prime, comm_cart_group; - MPI_Comm_group(*comm_cart_prime, &comm_cart_group_prime); - - retval = MPI_Group_incl(comm_cart_group_prime, cp->comm_cart_size, - comm_cart_ranks_order, &comm_cart_group); - JASSERT(retval == MPI_SUCCESS) - .Text("Failed to create group."); - MPI_Comm comm_cart_tmp; - retval = MPI_Comm_create_group(*comm_cart_prime, comm_cart_group, 111, - &comm_cart_tmp); - JASSERT(retval == MPI_SUCCESS) - .Text("Failed to create communicator."); - retval = MPI_Cart_create(comm_cart_tmp, cp->ndims, cp->dimensions, - cp->periods, cp->reorder, &comm_cart); - JASSERT(retval == MPI_SUCCESS) - .Text("Failed to create communicator."); -} - -void -setCartesianCommunicator(void *getCartesianCommunicatorFptr) -{ - typedef void 
(*getCartesianCommunicatorFptr_t)(MPI_Comm **); - ((getCartesianCommunicatorFptr_t)getCartesianCommunicatorFptr)( - &comm_cart_prime); -} - -static int -restoreCartCreate(MpiRecord &rec) -{ - int retval = -1; - int comm_old_ranks_order[MAX_PROCESSES]; - int comm_cart_ranks_order[MAX_PROCESSES]; - - CartesianInfo ci; - CartesianProperties cp; - CartesianInfo checkpoint_mapping[MAX_PROCESSES]; - CartesianInfo restart_mapping[MAX_PROCESSES]; - - // In current implementation, is MPI_COMM_WORLD - comm_old_prime = MPI_COMM_WORLD; - // Get cartesian info of this process - retval = MPI_Comm_rank(comm_old_prime, &ci.comm_old_rank); - retval = MPI_Comm_rank(*comm_cart_prime, &ci.comm_cart_rank); - // Get cartesian properties of this process - dmtcp::ostringstream o; - o << "./ckpt_rank_" << ci.comm_old_rank << "/cartesian.info"; - retval = load_cartesian_properties(o.str().c_str(), &cp); - JASSERT(retval == 0) - (o.str().c_str()).Text("Failed to load cartesian properties."); - // Get coordinates of this process - retval = MPI_Cart_coords(*comm_cart_prime, ci.comm_cart_rank, cp.ndims, - ci.coordinates); - // Load checkpoint cartesian mapping - load_checkpoint_cartesian_mapping(&cp, checkpoint_mapping); - // Load restart cartesian mapping - load_restart_cartesian_mapping(&cp, &ci, restart_mapping); - retval = MPI_Barrier(MPI_COMM_WORLD); - JASSERT(retval == MPI_SUCCESS).Text("MPI_Barrier(1) failed."); - // The root process will populate and - // arrays - if (ci.comm_old_rank == 0) { - compare_comm_old_and_cart_cartesian_mapping(&cp, checkpoint_mapping, - restart_mapping, - comm_old_ranks_order, - comm_cart_ranks_order); - } - retval = MPI_Barrier(MPI_COMM_WORLD); - JASSERT(retval == MPI_SUCCESS).Text("MPI_Barrier(2) failed."); - retval = NEXT_FUNC(Bcast)(comm_old_ranks_order, cp.comm_old_size, MPI_INT, 0, - comm_old_prime); - JASSERT(retval == MPI_SUCCESS) - .Text("Failed to broadcast integer array."); - retval = MPI_Barrier(MPI_COMM_WORLD); - JASSERT(retval == 
MPI_SUCCESS).Text("MPI_Barrier(3) failed."); - retval = NEXT_FUNC(Bcast)(comm_cart_ranks_order, cp.comm_cart_size, MPI_INT, - 0, *comm_cart_prime); - JASSERT(retval == MPI_SUCCESS) - .Text("Failed to broadcast integer array."); - retval = MPI_Barrier(MPI_COMM_WORLD); - JASSERT(retval == MPI_SUCCESS).Text("MPI_Barrier(4) failed."); - // Create and communicators - create_comm_old_communicator(&cp, comm_old_ranks_order); - create_comm_cart_communicator(&cp, comm_cart_ranks_order); - retval = MPI_Barrier(MPI_COMM_WORLD); - JASSERT(retval == MPI_SUCCESS).Text("MPI_Barrier(5) failed."); - // Update mapping - MPI_Comm virtComm = rec.args(5); - // FIXME: This only works for MPICH but maybe not for other MPI libraries. - UPDATE_COMM_MAP(MPI_COMM_WORLD, comm_old); - UPDATE_COMM_MAP(virtComm, comm_cart); - return MPI_SUCCESS; -} - -#else - -static int -restoreCartCreate(MpiRecord& rec) -{ - int retval; - MPI_Comm comm = rec.args(0); - int ndims = rec.args(1); - int *dims = rec.args(2); - int *periods = rec.args(3); - int reorder = rec.args(4); - MPI_Comm newcomm = MPI_COMM_NULL; - retval = FNC_CALL(Cart_create, rec)(comm, ndims, dims, - periods, reorder, &newcomm); - if (retval == MPI_SUCCESS) { - MPI_Comm virtComm = rec.args(5); - UPDATE_COMM_MAP(virtComm, newcomm); - } - return retval; -} - -#endif - -static int -restoreCartMap(MpiRecord& rec) -{ - int retval; - MPI_Comm comm = rec.args(0); - int ndims = rec.args(1); - int *dims = rec.args(2); - int *periods = rec.args(3); - int newrank = -1; - retval = FNC_CALL(Cart_map, rec)(comm, ndims, dims, periods, &newrank); - if (retval == MPI_SUCCESS) { - // FIXME: Virtualize rank? 
- int oldrank = rec.args(4); - JASSERT(newrank == oldrank)(oldrank)(newrank).Text("Different ranks"); - } - return retval; -} - -static int -restoreCartShift(MpiRecord& rec) -{ - int retval; - MPI_Comm comm = rec.args(0); - int direction = rec.args(1); - int disp = rec.args(2); - int rank_source = -1; - int rank_dest = -1; - retval = FNC_CALL(Cart_shift, rec)(comm, direction, - disp, &rank_source, &rank_dest); - if (retval == MPI_SUCCESS) { - // FIXME: Virtualize rank? - int oldsrc = rec.args(3); - int olddest = rec.args(4); - JASSERT(oldsrc == rank_source && olddest == rank_dest) - (oldsrc)(olddest)(rank_source)(rank_dest).Text("Different ranks"); - } - return retval; -} - -static int -restoreCartSub(MpiRecord& rec) -{ - int retval; - MPI_Comm comm = rec.args(0); - // int ndims = rec.args(1); - int *remain_dims = rec.args(2); - MPI_Comm newcomm = MPI_COMM_NULL; - // LOG_CALL(restoreCarts, Cart_sub, comm, ndims, rs, virtComm); - retval = FNC_CALL(Cart_sub, rec)(comm, remain_dims, &newcomm); - if (retval == MPI_SUCCESS) { - MPI_Comm virtComm = rec.args(3); - UPDATE_COMM_MAP(virtComm, newcomm); - } - return retval; -} - -static int -restoreOpCreate(MpiRecord& rec) -{ - int retval = -1; - MPI_User_function *user_fn = rec.args(0); - int commute = rec.args(1); - MPI_Op newop = MPI_OP_NULL; - retval = FNC_CALL(Op_create, rec)(user_fn, commute, &newop); - if (retval == MPI_SUCCESS) { - MPI_Op oldop = rec.args(2); - UPDATE_OP_MAP(oldop, newop); - } - return retval; -} - -static int -restoreOpFree(MpiRecord& rec) -{ - int retval = -1; - MPI_Op op = rec.args(0); - MPI_Op realOp = VIRTUAL_TO_REAL_OP(op); - retval = FNC_CALL(Op_free, rec)(&realOp); - if (retval == MPI_SUCCESS) { - // See mpi_op_wrappers.cpp:Op_free - // NOTE: We cannot remove the old op from the map, since - // we'll need to replay this call to reconstruct any other op that - // might have been created using this op. 
- // - // realOp = REMOVE_OLD_OP(op); - } - return retval; -} - -static int restoreIbcast(MpiRecord& rec) { - int retval = -1; - void *buf = rec.args(0); - int count = rec.args(1); - MPI_Datatype datatype = rec.args(2); - int root = rec.args(3); - MPI_Comm comm = rec.args(4); - if (rec.getComplete()) { - if ((buf = rec.getBuf()) == NULL) { - int size; - MPI_Type_size(datatype, &size); - buf = malloc(count * size); - rec.setBuf(buf); - } - } - MPI_Request newRealRequest = MPI_REQUEST_NULL; - retval = FNC_CALL(Ibcast, rec)(buf, count, datatype, root, comm, - &newRealRequest); - if (retval == MPI_SUCCESS) { - MPI_Request virtRequest = rec.args(5); - UPDATE_REQUEST_MAP(virtRequest, newRealRequest); -#ifdef USE_REQUEST_LOG - logRequestInfo(virtRequest, IBCAST_REQUEST); -#endif - } - return retval; -} - -static int restoreIreduce(MpiRecord& rec) { - int retval = -1; - void *sendbuf = rec.args(0); - void *recvbuf = rec.args(1); - int count = rec.args(2); - MPI_Datatype datatype = rec.args(3); - MPI_Op op = rec.args(4); - int root = rec.args(5); - MPI_Comm comm = rec.args(6); - if (rec.getComplete() == true) { - int rank; - MPI_Comm_rank(comm, &rank); - if (rank == root) { // receiver - sendbuf = MPI_IN_PLACE; - // Use a temporary buffer to consume the received message - if ((recvbuf = rec.getBuf()) == NULL) { - int size; - MPI_Type_size(datatype, &size); - recvbuf = malloc(count * size); - rec.setBuf(recvbuf); - } - } else { // sender - // Sender's buffer contains the data of the recorded message's sendbuf - sendbuf = rec.getBuf(); - } - } - MPI_Request newRealRequest = MPI_REQUEST_NULL; - retval = FNC_CALL(Ireduce, rec)(sendbuf, recvbuf, count, - datatype, op, root, comm, &newRealRequest); - if (retval == MPI_SUCCESS) { - MPI_Request virtRequest = rec.args(7); - UPDATE_REQUEST_MAP(virtRequest, newRealRequest); -#ifdef USE_REQUEST_LOG - logRequestInfo(virtRequest, IREDUCE_REQUEST); -#endif - } - return retval; -} - -static int restoreIbarrier(MpiRecord& rec) { - int 
retval = -1; - MPI_Comm comm = rec.args(0); - MPI_Request newRealRequest = MPI_REQUEST_NULL; - retval = FNC_CALL(Ibarrier, rec)(comm, &newRealRequest); - MPI_Request virtRequest; - if (retval == MPI_SUCCESS) { - virtRequest = rec.args(1); - UPDATE_REQUEST_MAP(virtRequest, newRealRequest); -#ifdef USE_REQUEST_LOG - logRequestInfo(virtRequest, IBARRIER_REQUEST); -#endif - } - // Verify the request is valid - int flag; - retval = MPI_Request_get_status(virtRequest, &flag, MPI_STATUS_IGNORE); - JASSERT(retval == MPI_SUCCESS); - return retval; -} diff --git a/mpi-proxy-split/record-replay.h b/mpi-proxy-split/record-replay.h deleted file mode 100644 index 56e14ed9a..000000000 --- a/mpi-proxy-split/record-replay.h +++ /dev/null @@ -1,949 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2019-2021 by Gene Cooperman, Rohan Garg, Yao Xu * - * gene@ccs.neu.edu, rohgarg@ccs.neu.edu, xu.yao1@northeastern.edu * - * * - * This file is part of DMTCP. * - * * - * DMTCP is free software: you can redistribute it and/or * - * modify it under the terms of the GNU Lesser General Public License as * - * published by the Free Software Foundation, either version 3 of the * - * License, or (at your option) any later version. * - * * - * DMTCP is distributed in the hope that it will be useful, * - * but WITHOUT ANY WARRANTY; without even the implied warranty of * - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * - * GNU Lesser General Public License for more details. * - * * - * You should have received a copy of the GNU Lesser General Public * - * License in the files COPYING and COPYING.LESSER. If not, see * - * . 
* - ****************************************************************************/ - -#ifndef MPI_RECORD_REPLAY_H -#define MPI_RECORD_REPLAY_H - -#include - -#include -#include -#include -#include -#include - -#include "jassert.h" -#include "jconvert.h" - -#include "lower_half_api.h" - -// Logs the MPI call to the global MPI calls log object (defined by the -// 'MpiRecordReplay' class). 'cb' specifies the callback that will be used -// to replay the MPI call while restoring the MPI state at restart time. 'fnc' -// represents the current MPI call. 'args ...' can be used to provide a -// variable-length list of arguments to be saved. The saved arguments are useful -// while replaying the call later. -#define LOG_CALL(cb, fnc, args...) \ - dmtcp_mpi::MpiRecordReplay::instance().record(cb, GENERATE_ENUM(fnc), \ - GENERATE_FNC_PTR(fnc), args) - -#define RESTORE_MPI_STATE() \ - dmtcp_mpi::MpiRecordReplay::instance().replay() - -#define CLEAR_LOG() \ - dmtcp_mpi::MpiRecordReplay::instance().reset() - -#define CLEAR_GROUP_LOGS(group) \ - dmtcp_mpi::MpiRecordReplay::instance().clearGroupLogs(group) - -#define CLEAR_COMM_LOGS(comm) \ - dmtcp_mpi::MpiRecordReplay::instance().clearCommLogs(comm) - -#define LOG_REMOVE_REQUEST(request) \ - dmtcp_mpi::MpiRecordReplay::instance().removeRequestLog(request) - -// Returns true if we are currently replaying the MPI calls from the saved MPI -// calls log; false, otherwise. Normally, this would be true while restoring -// the MPI state at restart time. All other times, this would return false. -// We cannot use LOGGING since it's used for enabling JTRACE -#define MPI_LOGGING() \ - dmtcp_mpi::MpiRecordReplay::instance().isReplayOn() - -// Calls the wrapper function corresponding to the given type 'type'. (The -// 'rec' object contains a pointer to the wrapper function.) 
-#define FNC_CALL(type, rec) \ - ({ \ - __typeof__(GENERATE_FNC_PTR(type))_real_MPI_## type = \ - rec.call(GENERATE_FNC_PTR(type)); \ - _real_MPI_ ## type; \ - }) - -#define CREATE_LOG_BUF(buf, len) dmtcp_mpi::FncArg(buf, len) - - -namespace dmtcp_mpi -{ - struct FncArg; - class MpiRecord; - class MpiRecordReplay; - - using mutex_t = std::mutex; - using lock_t = std::unique_lock; - using fcb_t = std::function; - using mpi_record_vector_iterator_t = dmtcp::vector::iterator; - - enum TYPE { - TYPE_INT, - TYPE_INT_PTR, - TYPE_INT_ARRAY, - TYPE_VOID_PTR, - TYPE_VOID_CONST_PTR, - TYPE_LONG, - TYPE_MPI_USER_FNC, - }; - - // Restores the MPI requests and returns MPI_SUCCESS on success - extern int restoreRequests(MpiRecord&); - - // Struct for saving arbitrary function arguments - struct FncArg - { - void *_data; - enum TYPE _type; - - FncArg(const void *data, size_t len, dmtcp_mpi::TYPE type) - : _data(JALLOC_HELPER_MALLOC(len)) - { - _type = type; - if (_data && data) { - memcpy(_data, data, len); - } - } - - // This constructor is only used by CREATE_LOG_BUF - FncArg(const void *data, size_t len) - : _data(JALLOC_HELPER_MALLOC(len)) - { - // Default _type set to TYPE_INT_ARRAY because this constructor is used - // by CREATE_LOG_BUF in MPI_Cart functions. - _type = dmtcp_mpi::TYPE_INT_ARRAY; - if (_data && data) { - memcpy(_data, data, len); - } - } - - ~FncArg() - { - if (!_data) { - JALLOC_HELPER_FREE(_data); - } - } - - // On restart, we will use the restore family of functions in - // record-replay.cpp. A typical line of code there is: - // int arg = rec.args(n); - // rec.args() returns a 'const FncArg&'. - // The copy constructor for arg will invokes one of these casts - // from FncArg to the type for arg. 
So in this example, the code - // is invoking the 'int cast operator', resulting in: - // int arg = *(int*)(((FncArg)rec.args(n))._data); - - // Returns an int corresponding to the saved argument - operator int() const - { - return *(int*)_data; - } - - // Returns an int pointer to saved argument - // This also handles types like MPI_Datatype* in MPICH, - // because MPI_Datatype is an integer in this implementation. - // This doesn't handle MPI_Aint* because MPI_Aint - // is 8 bytes (long int). - operator int*() const - { - if (_type == dmtcp_mpi::TYPE_INT_ARRAY) { - return (int*)_data; - } else if (_type == dmtcp_mpi::TYPE_INT_PTR) { - return *(int**)_data; - } else { - JASSERT(false).Text("Unsupported arg type"); - return NULL; - } - } - - // This is used to handle MPI_Aint*, because in MPICH - // MPI_Aint is 8 bytes (long int). - operator long*() const - { - return (long*)_data; - } - - // Returns a void pointer to saved argument - operator void*() const - { - return *(void**)_data; - } - - // Returns a void pointer to saved argument - operator void const*() const - { - return *(void const**)_data; - } - - operator MPI_User_function*() const - { - return *(MPI_User_function**)_data; - } - }; - - template - FncArg FncArgTyped(T data) - { - int a; - int *b; - void *c; - void const *d; - MPI_User_function *e; - long f; - - if (typeid(data) == typeid(a)) { - return FncArg(&data, sizeof(data), TYPE_INT); - } - if (typeid(data) == typeid(b)) { - return FncArg(&data, sizeof(data), TYPE_INT_PTR); - } - if (typeid(data) == typeid(c)) { - return FncArg(&data, sizeof(data), TYPE_VOID_PTR); - } - if (typeid(data) == typeid(d)) { - return FncArg(&data, sizeof(data), TYPE_VOID_CONST_PTR); - } - if (typeid(data) == typeid(e)) { - return FncArg(&data, sizeof(data), TYPE_MPI_USER_FNC); - } - if (typeid(data) == typeid(f)) { - return FncArg(&data, sizeof(data), TYPE_MPI_USER_FNC); - } - JASSERT(false).Text("Unkown type for FncArg"); - return FncArgTyped(-1); - } - - // Represent 
a single call record - class MpiRecord - { - public: -#ifdef JALIB_ALLOCATOR - static void* operator new(size_t nbytes, void* p) { return p; } - static void* operator new(size_t nbytes) { JALLOC_HELPER_NEW(nbytes); } - static void operator delete(void* p) { JALLOC_HELPER_DELETE(p); } -#endif - MpiRecord(fcb_t cb, MPI_Fncs type, void *fnc) - : _cb(cb), _type(type), _fnc(fnc), _buffer(NULL), _complete(false) - { - } - - ~MpiRecord() - { - _args.clear(); - free(_buffer); - } - - // Base case to stop the recursion - void addArgs() - { - } - - // Handle one complex argument - void addArgs(const FncArg arg) - { - _args.push_back(arg); - } - - // Handle one MPI_User_function argument - void addArgs(MPI_User_function *arg) - { - _args.push_back(FncArgTyped((void*)arg)); - } - - // Handle one simple argument - template - void addArgs(const T arg) - { - _args.push_back(FncArgTyped(arg)); - } - - // Handle list of arguments - template - void addArgs(const T car, const Targs... cdr) - { - addArgs(car); - addArgs(cdr...); - } - - // Execute the restore callback for this record - int play() const - { - return _cb(const_cast(*this)); - } - - // Returns a reference to the 'n'-th function argument object - const FncArg& args(int n) const - { - return _args[n]; - } - - // Returns the enum MPI_Fncs type of the current MPI record object - MPI_Fncs getType() const - { - return _type; - } - - void setBuf(void *buf) - { - _buffer = buf; - } - - void *getBuf() - { - return _buffer; - } - - void setComplete(bool complete) - { - _complete = complete; - } - - bool getComplete() - { - return _complete; - } - - // Returns a pointer to the wrapper function corresponding to this MPI - // record object - template - T call(T fptr) const - { - return T(_fnc); - } - - private: - fcb_t _cb; // Callback to invoke to replay this MPI call - MPI_Fncs _type; // enum MPI_Fncs type of this MPI call - void *_fnc; // Pointer to the wrapper function of this MPI call - dmtcp::vector _args; // List of argument 
objects for this MPI call - void *_buffer; // opaque data saved in the buffer - bool _complete; - }; - - // Singleton class representing the entire log of MPI calls, useful for - // saving and restoring the MPI state - class MpiRecordReplay - { - public: -#ifdef JALIB_ALLOCATOR - static void* operator new(size_t nbytes, void* p) { return p; } - static void* operator new(size_t nbytes) { JALLOC_HELPER_NEW(nbytes); } - static void operator delete(void* p) { JALLOC_HELPER_DELETE(p); } -#endif - - // Returns the singleton instance of the MpiRecordReplay class - static MpiRecordReplay& instance() - { - static MpiRecordReplay _records; - return _records; - } - - // Record/replay addresses two subtle issues in replaying Ireduce/Ibcast. - // [1] Checkpoint replays the Ibcast saved in the object log. The - // receiver is receiving the current MPI Ibcast call. The sender advances - // to the next MPI Ibcast call. Since only the buffer address is saved in - // the log, the sender's buffer value is different in the MPI next call. - // That causes the receiver gets wrong buffer. - // [2] The completed requests is not replayed in the replay. The Ibcast - // sender request is completed while the receiver is still not finished. - // In replay, the receiver's request is replay and waits for the broadcast - // message. However, sender's request is completed so it won't be replayed - // and won't send message. That causes the receiver waiting forever. - // The record-replay workflows is below. - // [1] All requests saved in the record-replay are replayed. - // [2] Saves Ibcast buffer value in addition to its address. - // [3] For the completed sender's request, the saved buffer value is copied - // to temporary buffer before it is replay. - // [4] For the completed receiver's request, a temporary buffer is created - // to consume the broadcast message. Since the request is completed, we - // don't want to impact the original buffer. 
- // [5] For the requests that is not completed yet, the original buffer address - // is used in replay for both the sender and the receiver. - // - // Records an MPI call with its arguments in the MPI calls log. Returns - // a pointer to the inserted MPI record object (containing the details - // of the call). - template - MpiRecord* record(fcb_t cb, MPI_Fncs type, - const T fPtr, const Targs... args) - { - MpiRecord *rec = new MpiRecord(cb, type, (void*)fPtr); - if (rec) { - rec->addArgs(args...); - // All collective calls are translated in MANA to the async calls Ibarrier/Ireduce/Ibcast. - // If MANA uses other async calls, we need other cases. - switch (type) { - case GENERATE_ENUM(Ibarrier): - { - MPI_Request req = rec->args(1); - if (req != MPI_REQUEST_NULL) - _recordsMap[req] = 0; - break; - } - case GENERATE_ENUM(Ireduce): - { - MPI_Request req = rec->args(7); - MPI_Comm comm = rec->args(6); - int rank; - MPI_Comm_rank(comm, &rank); - int root = rec->args(5); - // Save the sender's buffer value - // We don't need to save the receiver's buffer value - if (rank != root) { - void *sendbuf = rec->args(0); - int count = rec->args(2); - MPI_Datatype datatype = rec->args(3); - int size; - MPI_Type_size(datatype, &size); - void *newbuf = malloc(count * size); - memcpy(newbuf, sendbuf, count * size); - rec->setBuf(newbuf); - } - if (req != MPI_REQUEST_NULL) - _recordsMap[req] = 0; - break; - } - case GENERATE_ENUM(Ibcast): - { - MPI_Request req = rec->args(5); - MPI_Comm comm = rec->args(4); - int rank; - MPI_Comm_rank(comm, &rank); - int root = rec->args(3); - if (rank == root) { - void *buf = rec->args(0); - int count = rec->args(1); - MPI_Datatype type = rec->args(2); - int size; - MPI_Type_size(type, &size); - void *newbuf = malloc(count * size); - memcpy(newbuf, buf, count * size); - rec->setBuf(newbuf); - } - if (req != MPI_REQUEST_NULL) - _recordsMap[req] = 0; - break; - } - case GENERATE_ENUM(Type_hvector): - { - MPI_Datatype newtype = rec->args(4); - 
MPI_Datatype oldtype = rec->args(3); - datatype_create(newtype); - datatype_incRef(1, &oldtype); - break; - } - case GENERATE_ENUM(Type_create_struct): - { - MPI_Datatype newtype = rec->args(4); - int count = rec->args(0); - MPI_Datatype *oldtypes = rec->args(3); - datatype_create(newtype); - datatype_incRef(count, oldtypes); - break; - } - case GENERATE_ENUM(Type_indexed): - { - MPI_Datatype newtype = rec->args(4); - MPI_Datatype oldtype = rec->args(3); - datatype_create(newtype); - datatype_incRef(1, &oldtype); - break; - } - case GENERATE_ENUM(Type_commit): - // No need to increase ref count so Type_free can - // free the MPI_Type_ records that creates the new type - break; - case GENERATE_ENUM(Type_free): - { - MPI_Datatype type = rec->args(0); - datatype_free(type); - delete rec; - return NULL; - } - default: - // The 'default' cases include record types like - // comm_create, comm_group, group_incl, etc. - // Those known types only need to be recorded. So they don't - // have any case label to pre-process their record info - // before they are recorded. - break; - } - { - lock_t lock(_mutex); - _records.push_back(rec); - } - } - return rec; - } - - // Replays the MPI calls from the log. Returns MPI_SUCCESS on success. 
- int replay() - { - int rc = MPI_SUCCESS; - lock_t lock(_mutex); - _replayOn = true; - for (MpiRecord* rec : _records) { - MPI_Request req = MPI_REQUEST_NULL; - switch (rec->getType()) { - case GENERATE_ENUM(Ibarrier): - req = rec->args(1); - break; - case GENERATE_ENUM(Ireduce): - req = rec->args(7); - break; - case GENERATE_ENUM(Ibcast): - req = rec->args(5); - break; - default: - break; - } - if (req != MPI_REQUEST_NULL) { - auto iter = _recordsMap.find(req); - if (iter->second == 1) - rec->setComplete(true); - } - rc = rec->play(); - if (rc != MPI_SUCCESS) { - break; - } - } - _replayOn = false; - return rc; - } - - void reset() - { - lock_t lock(_mutex); - for (MpiRecord* i : _records) { - delete i; - } - _replayOn = false; - _records.clear(); - } - - void cleanComms(dmtcp::set &staleComms) - { - bool setChanged = false; - std::function isStaleComm = - [&](const MpiRecord *rec) { - switch (rec->getType()) { - case GENERATE_ENUM(Comm_split): - { - MPI_Comm c = rec->args(0); - MPI_Comm newcomm = rec->args(3); - if (staleComms.count(c) > 0) { - staleComms.erase(c); - staleComms.insert(newcomm); - setChanged = true; - return true; - } - return false; - } - case GENERATE_ENUM(Comm_split_type): - { - MPI_Comm c = rec->args(0); - MPI_Comm newcomm = rec->args(4); - if (staleComms.count(c) > 0) { - staleComms.erase(c); - staleComms.insert(newcomm); - setChanged = true; - return true; - } - return false; - } - case GENERATE_ENUM(Comm_create): - { - MPI_Comm c = rec->args(0); - MPI_Comm newcomm = rec->args(2); - if (staleComms.count(c) > 0) { - staleComms.erase(c); - staleComms.insert(newcomm); - setChanged = true; - return true; - } - return false; - } - case GENERATE_ENUM(Comm_dup): - { - MPI_Comm c = rec->args(0); - MPI_Comm newcomm = rec->args(1); - if (staleComms.count(c) > 0) { - staleComms.erase(c); - staleComms.insert(newcomm); - setChanged = true; - return true; - } - return false; - } - case GENERATE_ENUM(Comm_set_errhandler): - case GENERATE_ENUM(Attr_put): - 
case GENERATE_ENUM(Attr_delete): - { - MPI_Comm c = rec->args(0); - if (staleComms.count(c) > 0) { - staleComms.erase(c); - return true; - } - return false; - } - default: - return false; - } }; - do { - mpi_record_vector_iterator_t it = - remove_if(_records.begin(), _records.end(), isStaleComm); - _records.erase(it, _records.end()); - } while (setChanged); - } - - void clearGroupLogs(MPI_Group group) - { - lock_t lock(_mutex); - dmtcp::set staleComms; - std::function isValidGroup = - [group, &staleComms](const MpiRecord *rec) { - switch (rec->getType()) { - case GENERATE_ENUM(Group_incl): - { - MPI_Group g = rec->args(0); - return group == g; - } - case GENERATE_ENUM(Comm_group): - { - // MPI_Comm comm = rec->args(0); - MPI_Group g = rec->args(1); - return group == g; - } - case GENERATE_ENUM(Comm_create): - { - // MPI_Comm comm = rec->args(0); - MPI_Group g = rec->args(1); - MPI_Comm oldcomm = rec->args(2); - if (group == g) { - staleComms.insert(oldcomm); // save this - return true; - } - return false; - } - default: - return false; - } }; - mpi_record_vector_iterator_t it = - remove_if(_records.begin(), _records.end(), isValidGroup); - _records.erase(it, _records.end()); - cleanComms(staleComms); - } - - void clearCommLogs(MPI_Comm comm) - { - lock_t lock(_mutex); - dmtcp::set staleComms; - std::function isValidComm = - [comm, &staleComms](const MpiRecord *rec) { - switch (rec->getType()) { - case GENERATE_ENUM(Comm_split): - { - MPI_Comm c = rec->args(0); - MPI_Comm newcomm = rec->args(3); - if (c == comm) { - staleComms.insert(newcomm); - return true; - } - return false; - } - case GENERATE_ENUM(Comm_split_type): - { - MPI_Comm c = rec->args(0); - MPI_Comm newcomm = rec->args(4); - if (c == comm) { - staleComms.insert(newcomm); - return true; - } - return false; - } - case GENERATE_ENUM(Comm_create): - { - MPI_Comm c = rec->args(0); - MPI_Comm newcomm = rec->args(2); - if (c == comm) { - staleComms.insert(newcomm); - return true; - } - return false; - } - 
case GENERATE_ENUM(Comm_dup): - { - MPI_Comm c = rec->args(0); - MPI_Comm newcomm = rec->args(1); - if (c == comm) { - staleComms.insert(newcomm); - return true; - } - return false; - } - case GENERATE_ENUM(Comm_set_errhandler): - case GENERATE_ENUM(Attr_put): - case GENERATE_ENUM(Attr_delete): - { - MPI_Comm c = rec->args(0); - return staleComms.count(c) > 0; - } - default: - return false; - } }; - mpi_record_vector_iterator_t it = - remove_if(_records.begin(), _records.end(), isValidComm); - _records.erase(it, _records.end()); - cleanComms(staleComms); - } - - void removeRequestLog(MPI_Request request) - { - lock_t lock(_mutex); - auto iter = _recordsMap.find(request); - if (iter == _recordsMap.end()) { - return; - } else { - iter->second = 1; // finished - } - } - - // Returns true if we are currently replaying the MPI calls - bool isReplayOn() - { - // FIXME: This needs locking. But we can't do this here, otherwise it'll - // deadlock - return !_replayOn; - } - - void printRecords(bool print); - private: - // Pvt. 
constructor - MpiRecordReplay() - : _records(), - _replayOn(false), - _mutex() - { - } - - void datatype_create(MPI_Datatype datatype) - { - _datatypeMap[datatype] = 1; - } - - void datatype_incRef(int count, MPI_Datatype *datatypes) - { - MPI_Datatype type; - lock_t lock(_mutex); - if (count == 1) { - type = *datatypes; - if (_datatypeMap.find(type) != _datatypeMap.end()) { - _datatypeMap[type]++; - } - } else { - for (int i = 0; i < count; i++) { - type = datatypes[i]; - if (_datatypeMap.find(type) != _datatypeMap.end()) { - _datatypeMap[type]++; - } - } - } - } - - int datatype_decRef(MPI_Datatype datatype) { - if (_datatypeMap.find(datatype) == _datatypeMap.end()) { - return -1; - } else { - return --_datatypeMap[datatype]; - } - } - - void datatype_find_stale_types(MPI_Datatype type, - dmtcp::set &staleTypes) - { - std::function isStaleType = - [this, type, &staleTypes](const MpiRecord *rec) { - switch (rec->getType()) { - case GENERATE_ENUM(Type_hvector): - { - MPI_Datatype newtype = rec->args(4); - if (newtype == type) { - // The oldtype could be stale if its ref count drops to zero - if (this->datatype_decRef(type) == 0) { - MPI_Datatype oldtype = rec->args(3); - // Skip pre-defined MPI constants like MPI_INT, etc. 
- if (_datatypeMap.find(oldtype) != _datatypeMap.end()) { - datatype_find_stale_types(oldtype, staleTypes); - } - } - return true; - } - return false; - } - case GENERATE_ENUM(Type_create_struct): - { - MPI_Datatype newtype = rec->args(4); - if (newtype == type) { - if (this->datatype_decRef(type) == 0) { - int count = rec->args(0); - MPI_Datatype *oldtypes = rec->args(3); - for (int i = 0; i < count; i++) { - if (_datatypeMap.find(oldtypes[i]) != _datatypeMap.end()) { - datatype_find_stale_types(oldtypes[i], staleTypes); - } - } - return true; - } - } - return false; - } - case GENERATE_ENUM(Type_indexed): - { - MPI_Datatype newtype = rec->args(4); - if (newtype == type) { - // The oldtype could be stale if its ref count drops to zero - if (this->datatype_decRef(type) == 0) { - MPI_Datatype oldtype = rec->args(3); - // Skip pre-defined MPI constants like MPI_INT, etc. - if (_datatypeMap.find(oldtype) != _datatypeMap.end()) { - datatype_find_stale_types(oldtype, staleTypes); - } - } - return true; - } - return false; - } - case GENERATE_ENUM(Type_commit): - { - // Skip commit because the stale type is collected by - // the creator of the type - return false; - } - default: - return false; - } - }; - - for (auto it = _records.rbegin(); it != _records.rend(); it++) { - if (isStaleType(*it)) { - staleTypes.insert(type); - break; - } - } - } - - void datatype_free(MPI_Datatype datatype) - { - dmtcp::set staleTypes; - bool creator = false; - lock_t lock(_mutex); - datatype_find_stale_types(datatype, staleTypes); - std::function isEqualType = - [&staleTypes, &creator](const MpiRecord *rec) { - switch (rec->getType()) { - case GENERATE_ENUM(Type_hvector): - { - MPI_Datatype newtype = rec->args(4); - if (staleTypes.count(newtype) > 0) { - creator = true; - return true; - } - return false; - } - case GENERATE_ENUM(Type_create_struct): - { - MPI_Datatype newtype = rec->args(4); - if (staleTypes.count(newtype) > 0) { - creator = true; - return true; - } - return false; - } - 
case GENERATE_ENUM(Type_indexed): - { - MPI_Datatype newtype = rec->args(4); - if (staleTypes.count(newtype) > 0) { - creator = true; - return true; - } - return false; - } - case GENERATE_ENUM(Type_commit): - { - MPI_Datatype newtype = rec->args(0); - return staleTypes.count(newtype) > 0; - } - case GENERATE_ENUM(Type_free): - { - MPI_Datatype type = rec->args(0); - return staleTypes.count(type) > 0; - } - default: - return false; - } - }; - // Traverse the log in the reverse order as the latest MPI record - // is added at the end. - auto it = _records.end(); - while (it != _records.begin()) { - it--; - MpiRecord *rec = *it; - if (isEqualType(rec)) { - it = _records.erase(it); - delete rec; - if (creator) break; - } - } - for (MPI_Datatype type : staleTypes) { - _datatypeMap.erase(type); - } - } - - // Virtual Ids Table - dmtcp::vector _records; - std::unordered_map _recordsMap; //map - std::unordered_map _datatypeMap; //map - // True on restart, false otherwise - bool _replayOn; - // Lock on list - mutex_t _mutex; - }; // class MpiRecordReplay - - - // Restores the MPI communicators and returns MPI_SUCCESS on success - extern int restoreComms(MpiRecord& ); - - // Restores the MPI groups and returns MPI_SUCCESS on success - extern int restoreGroups(MpiRecord& ); - - // Restores the MPI types and returns MPI_SUCCESS on success - extern int restoreTypes(MpiRecord& ); - - // Restores the MPI cartesian communicators and returns MPI_SUCCESS on success - extern int restoreCarts(MpiRecord& ); - - // Restores the MPI ops and returns MPI_SUCCESS on success - extern int restoreOps(MpiRecord& ); - -}; // namespace dmtcp_mpi - -// Restores the MPI state by recreating the communicator, groups, types, etc. 
-// post restart -extern void restoreMpiLogState(); -#ifdef SINGLE_CART_REORDER -extern void setCartesianCommunicator(void *getCartesianCommunicatorFptr); -#endif -#endif // ifndef MPI_RECORD_REPLAY_H diff --git a/mpi-proxy-split/seq_num.cpp b/mpi-proxy-split/seq_num.cpp index e3cb13acd..8b424458b 100644 --- a/mpi-proxy-split/seq_num.cpp +++ b/mpi-proxy-split/seq_num.cpp @@ -10,9 +10,7 @@ #include "seq_num.h" #include "mpi_nextfunc.h" #include "virtual-ids.h" -#include "record-replay.h" -using namespace dmtcp_mpi; using dmtcp::kvdb::KVDBRequest; using dmtcp::kvdb::KVDBResponse; @@ -53,10 +51,6 @@ sem_t ckpt_thread_sem; sem_t freepass_sem; sem_t freepass_sync_sem; -std::map seq_num; -std::map target; -typedef std::pair comm_seq_pair_t; - constexpr const char *comm_seq_max_db = "/plugin/MANA/comm-seq-max"; void seq_num_init() { @@ -82,30 +76,32 @@ void seq_num_destroy() { } int print_seq_nums() { - unsigned int comm_id; - unsigned long seq; + unsigned int comm_ggid; + unsigned long seq_num; int target_reached = 1; - for (comm_seq_pair_t pair : seq_num) { - comm_id = pair.first; - seq = pair.second; - printf("%d, %u, %lu\n", g_world_rank, comm_id, seq); + for (ggid_desc_pair pair : ggidDescriptorTable) { + comm_ggid = pair.first; + seq_num = pair.second->seq_num; + printf("%d, %u, %lu\n", g_world_rank, comm_ggid, seq_num); } fflush(stdout); return target_reached; } int check_seq_nums(bool exclusive) { - unsigned int comm_id; + unsigned int comm_ggid; + ggid_desc_t* ggid_desc; int target_reached = 1; - for (comm_seq_pair_t pair : seq_num) { - comm_id = pair.first; + for (ggid_desc_pair pair : ggidDescriptorTable) { + comm_ggid = pair.first; + ggid_desc = pair.second; if (exclusive) { - if (target[comm_id] + 1 > seq_num[comm_id]) { + if (ggid_desc->target_num + 1 > ggid_desc->seq_num) { target_reached = 0; break; } } else { - if (target[comm_id] > seq_num[comm_id]) { + if (ggid_desc->target_num > ggid_desc->seq_num) { target_reached = 0; break; } @@ -114,9 +110,10 @@ 
int check_seq_nums(bool exclusive) { return target_reached; } +// FIXME: This is barely used. Remove? int twoPhaseCommit(MPI_Comm comm, std::functiondoRealCollectiveComm) { - if (!MPI_LOGGING() || comm == MPI_COMM_NULL) { + if (mana_state == RESTART_REPLAY || comm == MPI_COMM_NULL) { return doRealCollectiveComm(); // lambda function: already captured args } @@ -127,15 +124,17 @@ int twoPhaseCommit(MPI_Comm comm, } void seq_num_broadcast(MPI_Comm comm, unsigned long new_target) { - unsigned int comm_gid = VirtualGlobalCommId::instance().getGlobalId(comm); - unsigned long msg[2] = {comm_gid, new_target}; + comm_desc_t* comm_desc = VIRTUAL_TO_DESC_COMM(comm); + ggid_desc_t* comm_ggid_desc = comm_desc->ggid_desc; + unsigned int comm_ggid = comm_ggid_desc->ggid; + unsigned long msg[2] = {comm_ggid, new_target}; int comm_size; int comm_rank; int world_rank; MPI_Comm_size(comm, &comm_size); MPI_Comm_rank(comm, &comm_rank); MPI_Group world_group, local_group; - MPI_Comm real_local_comm = VIRTUAL_TO_REAL_COMM(comm); + MPI_Comm real_local_comm = comm_desc->real_id; MPI_Comm real_world_comm = VIRTUAL_TO_REAL_COMM(g_world_comm); JUMP_TO_LOWER_HALF(lh_info.fsaddr); NEXT_FUNC(Comm_group)(real_world_comm, &world_group); @@ -146,12 +145,12 @@ void seq_num_broadcast(MPI_Comm comm, unsigned long new_target) { JUMP_TO_LOWER_HALF(lh_info.fsaddr); NEXT_FUNC(Group_translate_ranks)(local_group, 1, &i, world_group, &world_rank); - NEXT_FUNC(Send)(&msg, 2, MPI_UNSIGNED_LONG, world_rank, + NEXT_FUNC(Send)(&msg, 2, REAL_CONSTANT(UNSIGNED_LONG), world_rank, 0, real_world_comm); RETURN_TO_UPPER_HALF(); #ifdef DEBUG_SEQ_NUM printf("rank %d sending to rank %d new target comm %u seq %lu target %lu\n", - g_world_rank, world_rank, comm_gid, seq_num[comm_gid], new_target); + g_world_rank, world_rank, comm_ggid, comm_ggid_desc->seq_num, new_target); fflush(stdout); #endif } @@ -174,19 +173,18 @@ void commit_begin(MPI_Comm comm, bool passthrough) { unsigned long new_target[2]; MPI_Comm real_world_comm 
= VIRTUAL_TO_REAL_COMM(g_world_comm); JUMP_TO_LOWER_HALF(lh_info.fsaddr); - NEXT_FUNC(Recv)(&new_target, 2, MPI_UNSIGNED_LONG, + NEXT_FUNC(Recv)(&new_target, 2, REAL_CONSTANT(UNSIGNED_LONG), status.MPI_SOURCE, status.MPI_TAG, real_world_comm, MPI_STATUS_IGNORE); RETURN_TO_UPPER_HALF(); - unsigned int updated_comm = (unsigned int) new_target[0]; + unsigned int updated_comm_ggid = (unsigned int) new_target[0]; + ggid_desc_iterator it = ggidDescriptorTable.find(updated_comm_ggid); unsigned long updated_target = new_target[1]; - std::map::iterator it = - target.find(updated_comm); - if (it != target.end() && it->second < updated_target) { - target[updated_comm] = updated_target; + if (it != ggidDescriptorTable.end() && it->second->target_num < updated_target) { + it->second->target_num = updated_target; #ifdef DEBUG_SEQ_NUM printf("rank %d received new target comm %u seq %lu target %lu\n", - g_world_rank, updated_comm, seq_num[updated_comm], updated_target); + g_world_rank, updated_comm_ggid, it->second->seq_num, updated_target); fflush(stdout); #endif } @@ -194,15 +192,15 @@ void commit_begin(MPI_Comm comm, bool passthrough) { } pthread_mutex_lock(&seq_num_lock); current_phase = IN_CS; - unsigned int comm_gid = VirtualGlobalCommId::instance().getGlobalId(comm); - seq_num[comm_gid]++; + ggid_desc_t* comm_ggid_desc = VIRTUAL_TO_DESC_COMM(comm)->ggid_desc; + comm_ggid_desc->seq_num++; pthread_mutex_unlock(&seq_num_lock); #ifdef DEBUG_SEQ_NUM // print_seq_nums(); #endif - if (ckpt_pending && seq_num[comm_gid] > target[comm_gid]) { - target[comm_gid] = seq_num[comm_gid]; - seq_num_broadcast(comm, seq_num[comm_gid]); + if (ckpt_pending && comm_ggid_desc->seq_num > comm_ggid_desc->target_num) { + comm_ggid_desc->target_num = comm_ggid_desc->seq_num; + seq_num_broadcast(comm, comm_ggid_desc->seq_num); } } @@ -222,19 +220,18 @@ void commit_finish(MPI_Comm comm, bool passthrough) { unsigned long new_target[2]; MPI_Comm real_world_comm = VIRTUAL_TO_REAL_COMM(g_world_comm); 
JUMP_TO_LOWER_HALF(lh_info.fsaddr); - NEXT_FUNC(Recv)(&new_target, 2, MPI_UNSIGNED_LONG, + NEXT_FUNC(Recv)(&new_target, 2, REAL_CONSTANT(UNSIGNED_LONG), status.MPI_SOURCE, status.MPI_TAG, real_world_comm, MPI_STATUS_IGNORE); RETURN_TO_UPPER_HALF(); - unsigned int updated_comm = (unsigned int) new_target[0]; + unsigned int updated_comm_ggid = (unsigned int) new_target[0]; unsigned long updated_target = new_target[1]; - std::map::iterator it = - target.find(updated_comm); - if (it != target.end() && it->second < updated_target) { - target[updated_comm] = updated_target; + ggid_desc_iterator it = ggidDescriptorTable.find(updated_comm_ggid); + if (it != ggidDescriptorTable.end() && it->second->target_num < updated_target) { + it->second->target_num = updated_target; #ifdef DEBUG_SEQ_NUM printf("rank %d received new target comm %u seq %lu target %lu\n", - g_world_rank, updated_comm, seq_num[updated_comm], updated_target); + g_world_rank, updated_comm_ggid, it->second->seq_num, updated_target); fflush(stdout); #endif } @@ -243,30 +240,32 @@ void commit_finish(MPI_Comm comm, bool passthrough) { } void upload_seq_num() { - for (comm_seq_pair_t pair : seq_num) { + for (ggid_desc_pair pair : ggidDescriptorTable) { dmtcp::string comm_id_str(jalib::XToString(pair.first)); - unsigned int seq = pair.second; + unsigned int seq = pair.second->seq_num; JASSERT(dmtcp::kvdb::request64(KVDBRequest::MAX, comm_seq_max_db, comm_id_str, seq) == KVDBResponse::SUCCESS); } } -void download_targets(std::map &target) { +void download_targets(std::map &ggidDescriptorTable) { int64_t max_seq = 0; unsigned int comm_id; - for (comm_seq_pair_t pair : seq_num) { + ggid_desc_t* ggid_desc; + for (ggid_desc_pair pair : ggidDescriptorTable) { comm_id = pair.first; + ggid_desc = pair.second; dmtcp::string comm_id_str(jalib::XToString(pair.first)); JASSERT(dmtcp::kvdb::get64(comm_seq_max_db, comm_id_str, &max_seq) == KVDBResponse::SUCCESS); - target[comm_id] = max_seq; + ggid_desc->target_num = max_seq; } 
} -void share_seq_nums(std::map &target) { +void share_seq_nums(std::map &ggidDescriptorTable) { upload_seq_num(); dmtcp_global_barrier("mana/share-seq-num"); - download_targets(target); + download_targets(ggidDescriptorTable); } void drain_mpi_collective() { @@ -275,7 +274,7 @@ void drain_mpi_collective() { int64_t in_cs = 0; pthread_mutex_lock(&seq_num_lock); ckpt_pending = true; - share_seq_nums(target); + share_seq_nums(ggidDescriptorTable); pthread_mutex_unlock(&seq_num_lock); while (1) { char key[32] = {0}; diff --git a/mpi-proxy-split/seq_num.h b/mpi-proxy-split/seq_num.h index 745b227d5..0cb4c9003 100644 --- a/mpi-proxy-split/seq_num.h +++ b/mpi-proxy-split/seq_num.h @@ -4,6 +4,7 @@ #include #include #include +#include "virtual-ids.h" typedef enum _reset_type_t { RESUME, @@ -44,16 +45,13 @@ typedef struct __rank_state_t // Global communicator for MANA internal use extern MPI_Comm g_world_comm; -extern std::map seq_num; -extern std::map target; - // The main functions of the sequence number algorithm for MPI collectives void commit_begin(MPI_Comm comm, bool passthrough); void commit_finish(MPI_Comm comm, bool passthrough); int twoPhaseCommit(MPI_Comm comm, std::functiondoRealCollectiveComm); void drain_mpi_collective(); -void share_seq_nums(std::map &target); +void share_seq_nums(std::map &ggidDescriptorTable); int check_seq_nums(bool exclusive); int print_seq_nums(); void seq_num_init(); diff --git a/mpi-proxy-split/split_process.cpp b/mpi-proxy-split/split_process.cpp index d78d517b7..f0b8b8986 100644 --- a/mpi-proxy-split/split_process.cpp +++ b/mpi-proxy-split/split_process.cpp @@ -58,6 +58,7 @@ static unsigned long origPhnum; static unsigned long origPhdr; LowerHalfInfo_t lh_info; proxyDlsym_t pdlsym; // initialized to (proxyDlsym_t)lh_info.lh_dlsym +lh_constant_t lh_mpi_constants; LhCoreRegions_t lh_regions_list[MAX_LH_REGIONS] = {0}; static unsigned long getStackPtr(); @@ -509,6 +510,7 @@ initializeLowerHalf() // Save the pointer to mydlsym() function 
in the lower half. This will be // used in all the mpi-wrappers. pdlsym = (proxyDlsym_t)lh_info.lh_dlsym; + lh_mpi_constants = (lh_constant_t)lh_info.lh_mpi_constants; // Copied from glibc source ElfW(auxv_t) *auxvec; diff --git a/mpi-proxy-split/unit-test/Makefile b/mpi-proxy-split/unit-test/Makefile index 078b569e8..73113fc7b 100644 --- a/mpi-proxy-split/unit-test/Makefile +++ b/mpi-proxy-split/unit-test/Makefile @@ -16,9 +16,6 @@ MPICC = mpicc MPICXX = mpic++ endif -TESTS = record-replay-comm-test \ - record-replay-group-test \ - record-replay-types-test TEST_OBJS=$(addsuffix .o, ${TESTS}) TEST_BINS=$(addsuffix .exe, ${TESTS}) @@ -48,16 +45,15 @@ TEST_LD_FLAGS=${DMTCP_ROOT}/src/libdmtcpinternal.a \ DRAIN_TEST_OBJS = drain-send-recv-test.o ../p2p_drain_send_recv.cpp \ ../p2p_log_replay.cpp ${DMTCP_ROOT}/src/lookup_service.o -default: ${TEST_BINS} +# default: ${TEST_BINS} +default: ; -../record-replay.o ../drain_send_recv_packets.o: @make -C .. # NOTE: The objects files split_process.o and procmapsutils.o # below are required only when MANA is configured with SINGLE_CART_REORDER # C/C++ flag. One should remove these two obj files when we decide to remove # the SINGLE_CART_REORDER macro implementation entirely from MANA. 
-%.exe: %.o ../record-replay.o ../split_process.o ../lower-half/procmapsutils.o ${MPICXX} -fPIC -g3 -O0 -o $@ $^ ${TEST_LD_FLAGS} drain-send-recv-test.exe: ${DRAIN_TEST_OBJS} @@ -70,10 +66,11 @@ drain-send-recv-test.exe: ${DRAIN_TEST_OBJS} ${MPICXX} ${CXXFLAGS} -g3 -O0 -c -o $@ $< gdb-%: ${TEST_BINS} - gdb --args ./record-replay-$*-test.exe -check: ${TEST_BINS} - @for x in $^; do ${MPIRUN} -n 1 ./$$x; done +# check: ${TEST_BINS} + # @for x in $^; do ${MPIRUN} -n 1 ./$$x; done + +check: ; clean: tidy rm -f ${TEST_BINS} ${TEST_OBJS} diff --git a/mpi-proxy-split/unit-test/record-replay-cart-test.cpp b/mpi-proxy-split/unit-test/record-replay-cart-test.cpp index 8502233a2..ff189e13b 100644 --- a/mpi-proxy-split/unit-test/record-replay-cart-test.cpp +++ b/mpi-proxy-split/unit-test/record-replay-cart-test.cpp @@ -3,7 +3,6 @@ #include #include -#include "record-replay.h" #include "virtual-ids.h" #undef DMTCP_PLUGIN_ENABLE_CKPT @@ -71,7 +70,6 @@ TEST_F(CartTests, testCartCreate) // Log the call FncArg ds = CREATE_LOG_BUF(_dims, _ndims); FncArg ps = CREATE_LOG_BUF(_periods, _ndims); - EXPECT_TRUE(LOG_CALL(restoreCarts, Cart_create, _comm, _ndims, ds, ps, _reorder, virtComm) != NULL); // Replay the call EXPECT_EQ(RESTORE_MPI_STATE(), MPI_SUCCESS); @@ -97,7 +95,6 @@ TEST_F(CartTests, testCartMap) EXPECT_NE(newrank1, -1); FncArg ds = CREATE_LOG_BUF(_dims, _ndims * sizeof(int)); FncArg ps = CREATE_LOG_BUF(_periods, _ndims * sizeof(int)); - EXPECT_TRUE(LOG_CALL(restoreCarts, Cart_map, _comm, _ndims, ds, ps, newrank1) != NULL); EXPECT_EQ(RESTORE_MPI_STATE(), MPI_SUCCESS); // TODO: Not sure how to test that the mapping is still there @@ -112,12 +109,8 @@ int main(int argc, char **argv) { // FIXME: This unit test has been disabled because MPI cartesian communicator - // is created at the restart step instead of the record-replay step, which // happens after the restart step. 
The new cartesian communicator created at - // the restart step is then passed to record-replay separately via - // setCartesianCommunicator(). Therefore, the "record-replay-cart" unit test // will result in seg fault error because variable (in - // record-replay.cpp) has not been set in this unit test. // initializeJalib(); diff --git a/mpi-proxy-split/unit-test/record-replay-comm-test.cpp b/mpi-proxy-split/unit-test/record-replay-comm-test.cpp index 768df942a..5c941f6c1 100644 --- a/mpi-proxy-split/unit-test/record-replay-comm-test.cpp +++ b/mpi-proxy-split/unit-test/record-replay-comm-test.cpp @@ -2,7 +2,6 @@ #include -#include "record-replay.h" #include "virtual-ids.h" #undef DMTCP_PLUGIN_ENABLE_CKPT @@ -48,7 +47,6 @@ TEST_F(CommTests, testCommDup) MPI_Comm oldvirt = _virtComm; EXPECT_EQ(VIRTUAL_TO_REAL_COMM(_comm), MPI_COMM_WORLD); // Log the call - LOG_CALL(restoreComms, Comm_dup, _comm, _virtComm); // Replay the call EXPECT_EQ(RESTORE_MPI_STATE(), MPI_SUCCESS); // Verify state after replay @@ -69,7 +67,6 @@ TEST_F(CommTests, testCommSplit) _virtComm = ADD_NEW_COMM(real1); EXPECT_NE(_virtComm, -1); MPI_Comm oldvirt = _virtComm; - LOG_CALL(restoreComms, Comm_split, _comm, color, key, _virtComm); // Replay the call EXPECT_EQ(RESTORE_MPI_STATE(), MPI_SUCCESS); // Verify state after replay @@ -91,7 +88,6 @@ TEST_F(CommTests, testCommCreate) _virtComm = ADD_NEW_COMM(real1); EXPECT_NE(_virtComm, -1); MPI_Comm oldvirt = _virtComm; - LOG_CALL(restoreComms, Comm_create, _comm, group, _virtComm); // Replay the call EXPECT_EQ(RESTORE_MPI_STATE(), MPI_SUCCESS); // Verify state after replay diff --git a/mpi-proxy-split/unit-test/record-replay-group-test.cpp b/mpi-proxy-split/unit-test/record-replay-group-test.cpp index a0dd59367..b0af16d6c 100644 --- a/mpi-proxy-split/unit-test/record-replay-group-test.cpp +++ b/mpi-proxy-split/unit-test/record-replay-group-test.cpp @@ -2,7 +2,6 @@ #include -#include "record-replay.h" #include "virtual-ids.h" #undef 
DMTCP_PLUGIN_ENABLE_CKPT diff --git a/mpi-proxy-split/unit-test/record-replay-types-test.cpp b/mpi-proxy-split/unit-test/record-replay-types-test.cpp index 6e491f5cd..3251d238e 100644 --- a/mpi-proxy-split/unit-test/record-replay-types-test.cpp +++ b/mpi-proxy-split/unit-test/record-replay-types-test.cpp @@ -2,7 +2,6 @@ #include -#include "record-replay.h" #include "virtual-ids.h" #undef DMTCP_PLUGIN_ENABLE_CKPT @@ -53,7 +52,6 @@ TEST_F(TypesTests, testTypeContiguous) MPI_Datatype virtType = ADD_NEW_TYPE(real1); EXPECT_EQ(VIRTUAL_TO_REAL_TYPE(virtType), real1); - EXPECT_TRUE(LOG_CALL(restoreTypes, Type_contiguous, _count, type, virtType) != NULL); EXPECT_EQ(RESTORE_MPI_STATE(), MPI_SUCCESS); @@ -73,12 +71,10 @@ TEST_F(TypesTests, testTypeCommit) EXPECT_NE(real1, MPI_DATATYPE_NULL); MPI_Datatype virtType = ADD_NEW_TYPE(real1); EXPECT_EQ(VIRTUAL_TO_REAL_TYPE(virtType), real1); - EXPECT_TRUE(LOG_CALL(restoreTypes, Type_contiguous, _count, type, virtType) != NULL); // Commit the new datatype EXPECT_EQ(MPI_Type_commit(&real1), MPI_SUCCESS); - EXPECT_TRUE(LOG_CALL(restoreTypes, Type_commit, real1) != NULL); EXPECT_EQ(VIRTUAL_TO_REAL_TYPE(virtType), real1); int size = -1; EXPECT_EQ(MPI_Type_size(real1, &size), MPI_SUCCESS); diff --git a/mpi-proxy-split/virtual-ids.cpp b/mpi-proxy-split/virtual-ids.cpp new file mode 100644 index 000000000..3f6a464c0 --- /dev/null +++ b/mpi-proxy-split/virtual-ids.cpp @@ -0,0 +1,620 @@ +/**************************************************************************** + * Copyright (C) 2019-2021 by Gene Cooperman, Rohan Garg, Yao Xu * + * gene@ccs.neu.edu, rohgarg@ccs.neu.edu, xu.yao1@northeastern.edu * + * * + * Edited 2023 by Leonid Belyaev * + * belyaev.l@northeastern.edu * + * * + * This file is part of DMTCP. 
* + * * + * DMTCP is free software: you can redistribute it and/or * + * modify it under the terms of the GNU Lesser General Public License as * + * published by the Free Software Foundation, either version 3 of the * + * License, or (at your option) any later version. * + * * + * DMTCP is distributed in the hope that it will be useful, * + * but WITHOUT ANY WARRANTY; without even the implied warranty of * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * + * GNU Lesser General Public License for more details. * + * * + * You should have received a copy of the GNU Lesser General Public * + * License in the files COPYING and COPYING.LESSER. If not, see * + * . * + ****************************************************************************/ +#include +#include + +#include +#include + +#include "dmtcp.h" + +#include "jassert.h" +#include "jconvert.h" +#include "split_process.h" +#include "lower_half_api.h" +#include "virtual-ids.h" +#include "mpi_nextfunc.h" + +#define MAX_VIRTUAL_ID 999 + +MPI_Comm WORLD_COMM = MPI_COMM_WORLD; +MPI_Comm NULL_COMM = MPI_COMM_NULL; + +// #define DEBUG_VIDS + +// TODO I use an explicitly integer virtual id, which under macro +// reinterpretation will fit into an int64 pointer. This should be +// fine if no real MPI Type is smaller than an int32. +typedef typename std::map::iterator id_desc_iterator; +typedef typename std::map::iterator ggid_desc_iterator; + +// Per Yao Xu, MANA does not require the thread safety offered by +// DMTCP's VirtualIdTable. We use std::map. + +// int vId -> id_desc_t*, which contains rId. +std::map idDescriptorTable; + +// int ggid -> ggid_desc_t*, which contains CVC information. +std::map ggidDescriptorTable; + +// For best compatibility, we determine MPI constants at runtime. 
+std::map lh_constants_map; + +#define INIT_LH_CONST_MAP(const) lh_constants_map[MPI_##const] = REAL_CONSTANT(const); + +void init_lh_constants_map() { + FOREACH_CONSTANT(INIT_LH_CONST_MAP) + lh_constants_map[MPI_ERRORS_RETURN] = 0; +} + +// dead-simple vid generation mechanism, add one to get new ids. +int base = 1; +int nextvId = base; + +// Internal /real/ group, used to reconstruct and update. A way to access the +// whole bag of global ranks. +MPI_Group g_world_group; + +// Hash function on integers. Consult https://stackoverflow.com/questions/664014/. +// Returns a hash. +int hash(int i) { + return i * 2654435761 % ((unsigned long)1 << 32); +} + +// Compute the ggid [Global Group Identifier] of a real MPI communicator. +// This consists of a hash of its integer ranks. +// OUT: rbuf +// Returns ggid. +int getggid(MPI_Comm comm, int worldRank, int commSize, int* rbuf) { + if (comm == NULL_COMM || comm == WORLD_COMM) { + return (intptr_t)comm; + } + unsigned int ggid = 0; + + DMTCP_PLUGIN_DISABLE_CKPT(); + JUMP_TO_LOWER_HALF(lh_info.fsaddr); + NEXT_FUNC(Allgather)(&worldRank, 1, REAL_CONSTANT(INT), + rbuf, 1, REAL_CONSTANT(INT), comm); + RETURN_TO_UPPER_HALF(); + DMTCP_PLUGIN_ENABLE_CKPT(); + + for (int i = 0; i < commSize; i++) { + ggid ^= hash(rbuf[i] + 1); + } + + return ggid; +} + +// This is a descriptor initializer. Its job is to write an initial +// descriptor for a real MPI Communicator. 
+ +comm_desc_t* init_comm_desc_t(MPI_Comm realComm) { + comm_desc_t* desc = ((comm_desc_t*)malloc(sizeof(comm_desc_t))); + + desc->real_id = realComm; + desc->global_ranks = NULL; + + return desc; +} + +void grant_ggid(MPI_Comm virtualComm) { + int worldRank, commSize, localRank; + + comm_desc_t* desc = ((comm_desc_t*)virtualToDescriptor(*((int*)&virtualComm))); + + MPI_Comm realComm = desc->real_id; + + DMTCP_PLUGIN_DISABLE_CKPT(); + JUMP_TO_LOWER_HALF(lh_info.fsaddr); + NEXT_FUNC(Comm_rank)(REAL_CONSTANT(COMM_WORLD), &worldRank); + NEXT_FUNC(Comm_size)(realComm, &commSize); + NEXT_FUNC(Comm_rank)(realComm, &localRank); + RETURN_TO_UPPER_HALF(); + DMTCP_PLUGIN_ENABLE_CKPT(); + + int* ranks = ((int* )malloc(sizeof(int) * commSize)); + + int ggid = getggid(realComm, worldRank, commSize, ranks); + ggid_desc_iterator it = ggidDescriptorTable.find(ggid); + + if (it != ggidDescriptorTable.end()) { + // There exists another communicator with the same ggid, i.e., + // this communicator is an alias. We use the same ggid_desc. + desc->ggid_desc = it->second; + } else { + ggid_desc_t* gd = ((ggid_desc_t *) malloc(sizeof(ggid_desc_t))); + gd->ggid = ggid; + + // FIXME: In the old system, using VirtualGlobalCommId, These were only + // initiatialized in the wrapper function for Comm_create, (but not + // Comm_split, etc.) + // So, what is correct? + gd->target_num = 0; + gd->seq_num = 0; + ggidDescriptorTable[ggid] = gd; + desc->ggid_desc = gd; + } + + desc->global_ranks = ranks; + desc->local_rank = localRank; + desc->size = commSize; + + return; +} + +// FIXME: In some cases, this could cause a group to be needlessly constructed. +// A more efficient design would link groups and comms, but would introduce +// complexity. +void update_comm_desc_t(comm_desc_t* desc) { + // We need to virtualize MPI_COMM_WORLD, but should not reconstruct it. + if (desc->handle == MPI_COMM_WORLD) { + return; + } + + // Cleanup from a previous CKPT, if one occured. 
FIXME: Is this the correct + // way? + free(desc->global_ranks); + desc->global_ranks = NULL; + + MPI_Group group; + + // FIXME: Should we free this group? + DMTCP_PLUGIN_DISABLE_CKPT(); + JUMP_TO_LOWER_HALF(lh_info.fsaddr); + NEXT_FUNC(Comm_group)(desc->real_id, &group); + RETURN_TO_UPPER_HALF(); + + int groupSize = 0; + + JUMP_TO_LOWER_HALF(lh_info.fsaddr); + NEXT_FUNC(Group_size)(group, &groupSize); + RETURN_TO_UPPER_HALF(); + + desc->size = groupSize; + + int* local_ranks = ((int*)malloc(sizeof(int) * groupSize)); + int* global_ranks = ((int*)malloc(sizeof(int) * groupSize)); + for (int i = 0; i < groupSize; i++) { + local_ranks[i] = i; + } + + JUMP_TO_LOWER_HALF(lh_info.fsaddr); + NEXT_FUNC(Group_translate_ranks)(group, groupSize, local_ranks, g_world_group, global_ranks); + RETURN_TO_UPPER_HALF(); + DMTCP_PLUGIN_ENABLE_CKPT(); + + free(local_ranks); + desc->global_ranks = global_ranks; +} + +void reconstruct_with_comm_desc_t(comm_desc_t* desc) { + if (desc->handle == MPI_COMM_WORLD) { + return; + } +#ifdef DEBUG_VIDS + printf("reconstruct_comm_desc_t comm: %x -> %x\n", desc->handle, desc->real_id); + + fflush(stdout); + printf("reconstruct_with_comm_desc_t comm size: 0x%x\n", desc->size); + + fflush(stdout); + printf("reconstruct_with_comm_desc_t ranks:"); + + fflush(stdout); + for (int i = 0; i < desc->size; i++) { + printf(" %i", desc->ranks[i]); + } + printf("\n"); + + fflush(stdout); + fflush(stdout); +#endif + + MPI_Group group; + + // We recreate the communicator with the reconstructed group and MPI_COMM_WORLD. 
+ + DMTCP_PLUGIN_DISABLE_CKPT(); + JUMP_TO_LOWER_HALF(lh_info.fsaddr); + NEXT_FUNC(Group_incl)(g_world_group, desc->size, desc->global_ranks, &group); + NEXT_FUNC(Comm_create_group)(REAL_CONSTANT(COMM_WORLD), group, 0, &desc->real_id); + RETURN_TO_UPPER_HALF(); + DMTCP_PLUGIN_ENABLE_CKPT(); +} + +void destroy_comm_desc_t(comm_desc_t* desc) { + free(desc->global_ranks); + // FIXME: We DO NOT free the ggid_desc, because it is aliased, and as such + // shouldn't always be removed. We could do a form of reference counting, but + // we would need to make sure that it is thread-safe. Need to think about how + // to do it correctly in the MANA architecture. + free(desc); +} + +group_desc_t* init_group_desc_t(MPI_Group realGroup) { + group_desc_t* desc = ((group_desc_t*)malloc(sizeof(group_desc_t))); + desc->real_id = realGroup; + desc->global_ranks = NULL; + desc->size = 0; + return desc; +} + +void update_group_desc_t(group_desc_t* group) { + // Cleanup from a previous checkpoint. + free(group->global_ranks); + group->global_ranks = NULL; + + int groupSize; + + DMTCP_PLUGIN_DISABLE_CKPT(); + JUMP_TO_LOWER_HALF(lh_info.fsaddr); + NEXT_FUNC(Group_size)(group->real_id, &groupSize); + RETURN_TO_UPPER_HALF(); + + int* local_ranks = ((int*)malloc(sizeof(int) * groupSize)); + int* global_ranks = ((int*)malloc(sizeof(int) * groupSize)); + for (int i = 0; i < groupSize; i++) { + local_ranks[i] = i; + } + + JUMP_TO_LOWER_HALF(lh_info.fsaddr); + NEXT_FUNC(Group_translate_ranks)(group->real_id, groupSize, local_ranks, g_world_group, global_ranks); + RETURN_TO_UPPER_HALF(); + + free(local_ranks); + group->global_ranks = global_ranks; + group->size = groupSize; + DMTCP_PLUGIN_ENABLE_CKPT(); +} + +void reconstruct_with_group_desc_t(group_desc_t* group) { + DMTCP_PLUGIN_DISABLE_CKPT(); + JUMP_TO_LOWER_HALF(lh_info.fsaddr); + NEXT_FUNC(Group_incl)(g_world_group, group->size, group->global_ranks, &group->real_id); + RETURN_TO_UPPER_HALF(); + DMTCP_PLUGIN_ENABLE_CKPT(); +} + +void 
destroy_group_desc_t(group_desc_t* group) { + free(group->global_ranks); + free(group); +} + +request_desc_t* init_request_desc_t(MPI_Request realReq) { + request_desc_t* desc = ((request_desc_t*)malloc(sizeof(request_desc_t))); + desc->real_id = realReq; + return desc; +} + +void destroy_request_desc_t(request_desc_t* request) { + free(request); +} + +op_desc_t* init_op_desc_t(MPI_Op realOp) { + op_desc_t* desc = ((op_desc_t*)malloc(sizeof(op_desc_t))); + desc->real_id = realOp; + desc->user_fn = NULL; + return desc; +} + +// This update function is special, for the information we need is +// only available at creation time, and not with MPI calls. We +// manually invoke this function at creation time. Mercifully, there +// is only one way to create a new operator, AFAIK. +void update_op_desc_t(op_desc_t* op, MPI_User_function* user_fn, int commute) { + op->user_fn = user_fn; + op->commute = commute; +} + +void destroy_op_desc_t(op_desc_t* op) { + free(op); +} + +datatype_desc_t* init_datatype_desc_t(MPI_Datatype realType) { + datatype_desc_t* desc = ((datatype_desc_t*)malloc(sizeof(datatype_desc_t))); + desc->real_id = realType; + + desc->num_integers = 0; + desc->integers = NULL; + + desc->num_addresses = 0; + desc->addresses = NULL; + + // FIXME: this was in Yao's spec sheet, but I haven't seen it used in any + // of the relevant functions. Why is this here? + desc->num_large_counts = 0; + desc->large_counts = NULL; + + desc->num_datatypes = 0; + desc->datatypes = NULL; + + desc->combiner = 0; + + // FIXME: I had a design that would keep freed descriptors, to implement + // doubly-derived reconstruction. But we can probably ignore this for now. + + // desc->is_freed = false; + return desc; +} + +void update_datatype_desc_t(datatype_desc_t* datatype) { + // 2023-09-19: ExaMPI does not support this yet, though it ought to soon. +#ifndef EXAMPI + // Free the existing memory, if it is not NULL. 
(i.e., from an older checkpoint) + free(datatype->integers); + free(datatype->addresses); + free(datatype->datatypes); + datatype->integers = NULL; + datatype->addresses = NULL; + datatype->datatypes = NULL; + + DMTCP_PLUGIN_DISABLE_CKPT(); + + JUMP_TO_LOWER_HALF(lh_info.fsaddr); + NEXT_FUNC(Type_get_envelope)(datatype->real_id, &datatype->num_integers, &datatype->num_addresses, &datatype->num_datatypes, &datatype->combiner); + RETURN_TO_UPPER_HALF(); + + // FIXME: "If combiner is MPI_COMBINER_NAMED then it is erroneous + // to call MPI_TYPE_GET_CONTENTS. " So, we might want to exit + // early (although this shouldn't happen anyway, as nobody should + // make a virtualization of these types) + + // Use the malloc in the upper-half. + datatype->integers = ((int*)malloc(sizeof(int) * datatype->num_integers)); + datatype->addresses = ((MPI_Aint*)malloc(sizeof(MPI_Aint) * datatype->num_addresses)); + datatype->datatypes = ((MPI_Datatype*)malloc(sizeof(MPI_Datatype) * datatype->num_datatypes)); + + JUMP_TO_LOWER_HALF(lh_info.fsaddr); + NEXT_FUNC(Type_get_contents)(datatype->real_id, datatype->num_integers, datatype->num_addresses, datatype->num_datatypes, datatype->integers, datatype->addresses, datatype->datatypes); + RETURN_TO_UPPER_HALF(); + DMTCP_PLUGIN_ENABLE_CKPT(); +#endif +} + +// Reconstruction arguments taken from here: +// https://www.mpi-forum.org/docs/mpi-3.1/mpi31-report/node90.htm +void reconstruct_with_datatype_desc_t(datatype_desc_t* datatype) { +#ifndef EXAMPI + DMTCP_PLUGIN_DISABLE_CKPT(); + JUMP_TO_LOWER_HALF(lh_info.fsaddr); + switch (datatype->combiner) { + case MPI_COMBINER_DUP: + NEXT_FUNC(Type_dup)(datatype->datatypes[0], &datatype->real_id); + break; + case MPI_COMBINER_NAMED: + // if the type is named and predefined, we shouldn't need to do anything. + // "If combiner is MPI_COMBINER_NAMED then it is erroneous to call MPI_TYPE_GET_CONTENTS. 
" + break; + case MPI_COMBINER_VECTOR: + NEXT_FUNC(Type_vector)(datatype->integers[0], datatype->integers[1], datatype->integers[2], datatype->datatypes[0], &datatype->real_id); + break; + case MPI_COMBINER_HVECTOR: + NEXT_FUNC(Type_hvector)(datatype->integers[0], datatype->integers[1], datatype->addresses[0], datatype->datatypes[0], &datatype->real_id); + break; + case MPI_COMBINER_INDEXED: + NEXT_FUNC(Type_indexed)(datatype->integers[0], datatype->integers + 1, datatype->integers + 1 + datatype->integers[0], datatype->datatypes[0], &datatype->real_id); + break; + case MPI_COMBINER_HINDEXED: + NEXT_FUNC(Type_hindexed)(datatype->integers[0], datatype->integers + 1, datatype->addresses, datatype->datatypes[0], &datatype->real_id); + break; + case MPI_COMBINER_INDEXED_BLOCK: + NEXT_FUNC(Type_create_indexed_block)(datatype->integers[0], datatype->integers[1], datatype->integers + 2, datatype->datatypes[0], &datatype->real_id); + break; + case MPI_COMBINER_HINDEXED_BLOCK: + NEXT_FUNC(Type_create_hindexed_block)(datatype->integers[0], datatype->integers[1], datatype->addresses, datatype->datatypes[0], &datatype->real_id); + break; + case MPI_COMBINER_STRUCT: + NEXT_FUNC(Type_create_struct)(datatype->integers[0], datatype->integers + 1, datatype->addresses, datatype->datatypes, &datatype->real_id); + break; + case MPI_COMBINER_SUBARRAY: + NEXT_FUNC(Type_create_subarray)(datatype->integers[0], datatype->integers + 1, datatype->integers + 1 + datatype->integers[0], datatype->integers + 1 + 2 * datatype->integers[0], datatype->integers[1 + 3 * datatype->integers[0]], datatype->datatypes[0], &datatype->real_id); + break; + case MPI_COMBINER_DARRAY: + NEXT_FUNC(Type_create_darray)(datatype->integers[0], datatype->integers[1], datatype->integers[2], datatype->integers + 3, datatype->integers + 3 + datatype->integers[2], datatype->integers + 3 + 2 * datatype->integers[2], datatype->integers + 3 + 3 * datatype->integers[2], datatype->integers[3 + 4 * datatype->integers[2]], 
datatype->datatypes[0], &datatype->real_id); + case MPI_COMBINER_CONTIGUOUS: + NEXT_FUNC(Type_contiguous)(datatype->integers[0], datatype->datatypes[0], &datatype->real_id); + break; + case MPI_COMBINER_F90_REAL: + NEXT_FUNC(Type_create_f90_real)(datatype->integers[0], datatype->integers[1], &datatype->real_id); + break; + case MPI_COMBINER_F90_COMPLEX: + NEXT_FUNC(Type_create_f90_complex)(datatype->integers[0], datatype->integers[1], &datatype->real_id); + break; + case MPI_COMBINER_F90_INTEGER: + NEXT_FUNC(Type_create_f90_integer)(datatype->integers[0], &datatype->real_id); + break; + case MPI_COMBINER_RESIZED: + NEXT_FUNC(Type_create_resized)(datatype->datatypes[0], datatype->addresses[0], datatype->addresses[1], &datatype->real_id); + break; + default: + break; + } + // FIXME: This is needed to make the reconstructed type usable in the + // network. But there is a small potential for a double-commit bug here. Is + // there an mpi function that checks if a type has been committed? + // + // Need to research if double-commit is an issue. + NEXT_FUNC(Type_commit)(&datatype->real_id); + RETURN_TO_UPPER_HALF(); + DMTCP_PLUGIN_ENABLE_CKPT(); +#endif // ifndef EXAMPI +} + +void destroy_datatype_desc_t(datatype_desc_t* datatype) { + free(datatype->integers); + free(datatype->addresses); + free(datatype->large_counts); + free(datatype->datatypes); + free(datatype); +} + +file_desc_t* init_file_desc_t(MPI_File realFile) { + file_desc_t* desc = ((file_desc_t*)malloc(sizeof(file_desc_t))); + desc->real_id = realFile; + return desc; +} + +void destroy_file_desc_t(file_desc_t* file) { + free(file); +} + +void print_id_descriptors() { + printf("Printing %u id_descriptors:\n", idDescriptorTable.size()); + fflush(stdout); + for (id_desc_pair pair : idDescriptorTable) { + printf("%x\n", pair.first); + fflush(stdout); + } +} + +// Given int virtualid, return the contained id_desc_t if it exists. 
+// Otherwise return NULL +id_desc_t* virtualToDescriptor(int virtId) { +#ifdef DEBUG_VIDS + // print_id_descriptors(); +#endif + id_desc_iterator it = idDescriptorTable.find(virtId); + if (it != idDescriptorTable.end()) { + return it->second; + } + return NULL; +} + +// FIXME: We need to preemptively virtualize MPI_COMM_WORLD, with GGID. In the +// original code, there is also a pre-initialization for MPI_COMM_NULL, but +// that doesn't appear to be necessary for CVC algorithm functionality. +void init_comm_world() { + comm_desc_t* comm_world = ((comm_desc_t*)malloc(sizeof(comm_desc_t))); + ggid_desc_t* comm_world_ggid = ((ggid_desc_t*)malloc(sizeof(ggid_desc_t))); + comm_world->ggid_desc = comm_world_ggid; + // The upper-half one. + comm_world_ggid->ggid = MPI_COMM_WORLD; + + comm_world_ggid->seq_num = 0; + comm_world_ggid->target_num = 0; + + // FIXME: This WILL NOT WORK when moving to OpenMPI, ExaMPI, as written. + // Yao, Twinkle, should apply their lh_constants_map strategy here. + comm_world->real_id = REAL_CONSTANT(COMM_WORLD); + comm_world->handle = MPI_COMM_WORLD; + // FIXME: the other fields are not initialized. This is an INTERNAL + // communicator descriptor, strictly for bookkeeping, not for reconstructing, + // etc. + + idDescriptorTable[MPI_COMM_WORLD] = ((union id_desc_t*)comm_world); + ggidDescriptorTable[MPI_COMM_WORLD] = comm_world_ggid; +} + +// FIXME: This gets us a copy of the world group that we can use to get all the +// ranks. It's a real id, not a virtual one. Is this okay? 
+void write_g_world_group() { + DMTCP_PLUGIN_DISABLE_CKPT(); + JUMP_TO_LOWER_HALF(lh_info.fsaddr); + NEXT_FUNC(Comm_group)(REAL_CONSTANT(COMM_WORLD), &g_world_group); + RETURN_TO_UPPER_HALF(); + DMTCP_PLUGIN_ENABLE_CKPT(); +} + +void destroy_g_world_group() { + DMTCP_PLUGIN_DISABLE_CKPT(); + JUMP_TO_LOWER_HALF(lh_info.fsaddr); + NEXT_FUNC(Group_free)(&g_world_group); + RETURN_TO_UPPER_HALF(); + DMTCP_PLUGIN_ENABLE_CKPT(); +} + + +// For all descriptors, update the respective information. +void update_descriptors() { + write_g_world_group(); + for (id_desc_pair pair : idDescriptorTable) { + switch (pair.first & 0xFF000000) { + case UNDEFINED_MASK: + break; + case COMM_MASK: + update_comm_desc_t((comm_desc_t*)pair.second); + break; + case GROUP_MASK: + update_group_desc_t((group_desc_t*)pair.second); + break; + case REQUEST_MASK: + // update_request_desc_t((request_desc_t*)pair.second); + break; + case OP_MASK: + // update_op_desc_t((op_desc_t*)pair.second); + // HACK: This must be called on Op_create. see update_op_desc + break; + case DATATYPE_MASK: + update_datatype_desc_t((datatype_desc_t*)pair.second); + break; + case FILE_MASK: + // update_file_desc_t((file_desc_t*)pair.second); + break; + case COMM_KEYVAL_MASK: + // update_comm_keyval_desc_t((comm_keyval_desc_t*)pair.second); + break; + default: + break; + } + } + destroy_g_world_group(); +} + +// For all descriptors, set its real ID to the one uniquely described by its fields. 
+void reconstruct_with_descriptors() { + write_g_world_group(); + for (id_desc_pair pair : idDescriptorTable) { + switch (pair.first & 0xFF000000) { + case UNDEFINED_MASK: + break; + case COMM_MASK: + reconstruct_with_comm_desc_t((comm_desc_t*)pair.second); + break; + case GROUP_MASK: + reconstruct_with_group_desc_t((group_desc_t*)pair.second); + break; + case REQUEST_MASK: + // update_request_desc_t((request_desc_t*)pair.second); + break; + case OP_MASK: + reconstruct_with_op_desc_t((op_desc_t*)pair.second); + break; + case DATATYPE_MASK: + reconstruct_with_datatype_desc_t((datatype_desc_t*)pair.second); + break; + case FILE_MASK: + // update_file_desc_t((file_desc_t*)pair.second); + break; + case COMM_KEYVAL_MASK: + // update_comm_keyval_desc_t((comm_keyval_desc_t*)pair.second); + break; + default: + break; + } + } + destroy_g_world_group(); +} + + + diff --git a/mpi-proxy-split/virtual-ids.h b/mpi-proxy-split/virtual-ids.h index 3f5befa93..cd49fd986 100644 --- a/mpi-proxy-split/virtual-ids.h +++ b/mpi-proxy-split/virtual-ids.h @@ -1,44 +1,36 @@ -/**************************************************************************** - * Copyright (C) 2019-2021 by Gene Cooperman, Rohan Garg, Yao Xu * - * gene@ccs.neu.edu, rohgarg@ccs.neu.edu, xu.yao1@northeastern.edu * - * * - * This file is part of DMTCP. * - * * - * DMTCP is free software: you can redistribute it and/or * - * modify it under the terms of the GNU Lesser General Public License as * - * published by the Free Software Foundation, either version 3 of the * - * License, or (at your option) any later version. * - * * - * DMTCP is distributed in the hope that it will be useful, * - * but WITHOUT ANY WARRANTY; without even the implied warranty of * - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * - * GNU Lesser General Public License for more details. * - * * - * You should have received a copy of the GNU Lesser General Public * - * License in the files COPYING and COPYING.LESSER. 
If not, see * - * . * - ****************************************************************************/ - #pragma once #ifndef MPI_VIRTUAL_IDS_H #define MPI_VIRTUAL_IDS_H #include - #include "virtualidtable.h" #include "jassert.h" #include "jconvert.h" #include "split_process.h" #include "dmtcp.h" +#include "lower_half_api.h" + +#define CONCAT(a,b) a ## b + +// num - type - VID MASK +// 0 - undefined - 0x00000000 +// 1 - communicator - 0x01000000 +// 2 - group - 0x02000000 +// 3 - request - 0x03000000 +// 4 - op - 0x04000000 +// 5 - datatype - 0x05000000 +// 6 - file - 0x06000000 +// 7 - comm_keyval - 0x07000000 + +#define UNDEFINED_MASK 0x00000000 +#define COMM_MASK 0x01000000 +#define GROUP_MASK 0x02000000 +#define REQUEST_MASK 0x03000000 +#define OP_MASK 0x04000000 +#define DATATYPE_MASK 0x05000000 +#define FILE_MASK 0x06000000 +#define COMM_KEYVAL_MASK 0x07000000 -// Convenience macros -#define MpiCommList dmtcp_mpi::MpiVirtualization -#define MpiGroupList dmtcp_mpi::MpiVirtualization -#define MpiTypeList dmtcp_mpi::MpiVirtualization -#define MpiOpList dmtcp_mpi::MpiVirtualization -#define MpiFileList dmtcp_mpi::MpiVirtualization -#define MpiCommKeyvalList dmtcp_mpi::MpiVirtualization -#define MpiRequestList dmtcp_mpi::MpiVirtualization #ifndef NEXT_FUNC # define NEXT_FUNC(func) \ ({ \ @@ -50,348 +42,346 @@ _real_MPI_ ## func; \ }) #endif // ifndef NEXT_FUNC -#define REAL_TO_VIRTUAL_FILE(id) \ - MpiFileList::instance("MpiFile", MPI_FILE_NULL).realToVirtual(id) + +#ifndef REAL_CONSTANT +# define REAL_CONSTANT(name) *(__typeof__(MPI_##name)*)lh_mpi_constants(LH_MPI_##name) +#endif // ifndef REAL_CONSTANT + + +#define DESC_TO_VIRTUAL(desc, null, real_type) \ + ({ \ + real_type _DTV_vId = (desc == NULL) ? null : desc->handle; \ + _DTV_vId; \ + }) + +#define VIRTUAL_TO_DESC(virtual_id, null, desc_type) \ + (virtual_id == null) ? 
NULL : ((desc_type*) virtualToDescriptor( *((int*)(&virtual_id)) )) + +#define VIRTUAL_TO_REAL(id, null, real_id_type, desc_type) \ + ({ \ + desc_type* _VTR_tmp = VIRTUAL_TO_DESC(id, null, desc_type); \ + real_id_type _VTR_id = (_VTR_tmp == NULL) ? (real_id_type)lh_constants_map[(intptr_t)id] : _VTR_tmp->real_id; \ + _VTR_id; \ + }) + +// FIXME: Looping through every real id is /very/ inefficient for a large +// program, but we don't have any way to loop through only the real ids of a +// particular type yet. That would change with a two-level table approach. +#define ADD_NEW(given_real_id, null, real_id_type, descriptor_type, vid_mask) \ + ({ \ + real_id_type _AD_retval; \ + descriptor_type* _AD_desc; \ + if (given_real_id != null) { \ + bool real_id_exists = false; \ + for (id_desc_pair pair : idDescriptorTable) { \ + if ((vid_mask & pair.first) == vid_mask && ((descriptor_type*)pair.second)->real_id == given_real_id) { \ + real_id_exists = true; \ + _AD_retval = *((real_id_type*)&pair.first); \ + break; \ + } \ + } \ + if (!real_id_exists) { \ + _AD_desc = CONCAT(init_,descriptor_type)(given_real_id); \ + int _AD_vId = nextvId++; \ + _AD_vId = _AD_vId | vid_mask; \ + _AD_desc->handle = _AD_vId; \ + idDescriptorTable[_AD_vId] = ((union id_desc_t*) _AD_desc); \ + _AD_retval = *((real_id_type*)&_AD_vId); \ + } \ + } else { \ + _AD_retval = null; \ + } \ + _AD_retval; \ + }) + +#define REMOVE_OLD(virtual_id, null, descriptor_type, real_type) \ + ({ \ + real_type _RO_retval; \ + if (virtual_id == null) { \ + _RO_retval = null; \ + } else { \ + descriptor_type* _RO_torem; \ + id_desc_iterator it = idDescriptorTable.find( *((int*)&virtual_id) ); \ + if (it != idDescriptorTable.end()) { \ + _RO_torem = ((descriptor_type*)it->second); \ + idDescriptorTable.erase(*((int*)&virtual_id)); \ + _RO_retval = _RO_torem->real_id; \ + CONCAT(destroy_,descriptor_type)(_RO_torem); \ + } else { \ + _RO_retval = null; \ + } \ + } \ + _RO_retval; \ + }) + +// FIXME: this cannot easily 
have precisely the same semantics as ADD_NEW: the [] +// operator inserts when the argument is not yet present. If the usage in the +// code is indicative of the "update" name despite this, then it's not a +// problem. +// I just checked every usage of UPDATE_MAP to see if those semantics of [] are employed -- +// They are not, with the exception of MPI_COMM_WORLD, so we previrtualize it. +#define UPDATE_MAP(virtual_id, to_update, null, descriptor_type, to_update_type) \ + ({ \ + to_update_type _UM_retval; \ + if (virtual_id == null) { \ + _UM_retval = null; \ + } else { \ + id_desc_iterator _UM_it = idDescriptorTable.find(*((int*)(&virtual_id))); \ + if (_UM_it != idDescriptorTable.end()) { \ + descriptor_type* desc = ((descriptor_type*)_UM_it->second); \ + desc->real_id = to_update; \ + _UM_retval = virtual_id; \ + } else { \ + _UM_retval = null; \ + } \ + } \ + _UM_retval; \ +}) + +#define DESC_TO_VIRTUAL_FILE(desc) \ + DESC_TO_VIRTUAL(desc, MPI_FILE_NULL, MPI_File) +#define VIRTUAL_TO_DESC_FILE(id) \ + VIRTUAL_TO_DESC(id, MPI_FILE_NULL, file_desc_t) #define VIRTUAL_TO_REAL_FILE(id) \ - MpiFileList::instance("MpiFile", MPI_FILE_NULL).virtualToReal(id) + VIRTUAL_TO_REAL(id, MPI_FILE_NULL, MPI_File, file_desc_t) #define ADD_NEW_FILE(id) \ - MpiFileList::instance("MpiFile", MPI_FILE_NULL).onCreate(id) + ADD_NEW(id, MPI_FILE_NULL, MPI_File, file_desc_t, FILE_MASK) #define REMOVE_OLD_FILE(id) \ - MpiFileList::instance("MpiFile", MPI_FILE_NULL).onRemove(id) + REMOVE_OLD(id, MPI_FILE_NULL, file_desc_t, MPI_File) #define UPDATE_FILE_MAP(v, r) \ - MpiFileList::instance("MpiFile", MPI_FILE_NULL).updateMapping(v, r) + UPDATE_MAP(v, r, MPI_FILE_NULL, file_desc_t, MPI_File) -#define REAL_TO_VIRTUAL_COMM(id) \ - MpiCommList::instance("MpiComm", MPI_COMM_NULL).realToVirtual(id) +#define DESC_TO_VIRTUAL_COMM(desc) \ + DESC_TO_VIRTUAL(desc, MPI_COMM_NULL, MPI_Comm) +#define VIRTUAL_TO_DESC_COMM(id) \ + VIRTUAL_TO_DESC(id, MPI_COMM_NULL, comm_desc_t) #define 
VIRTUAL_TO_REAL_COMM(id) \ - MpiCommList::instance("MpiComm", MPI_COMM_NULL).virtualToReal(id) + VIRTUAL_TO_REAL(id, MPI_COMM_NULL, MPI_Comm, comm_desc_t) #define ADD_NEW_COMM(id) \ - MpiCommList::instance("MpiComm", MPI_COMM_NULL).onCreate(id) + ADD_NEW(id, MPI_COMM_NULL, MPI_Comm, comm_desc_t, COMM_MASK) #define REMOVE_OLD_COMM(id) \ - MpiCommList::instance("MpiComm", MPI_COMM_NULL).onRemove(id) + REMOVE_OLD(id, MPI_COMM_NULL, comm_desc_t, MPI_Comm) #define UPDATE_COMM_MAP(v, r) \ - MpiCommList::instance("MpiComm", MPI_COMM_NULL).updateMapping(v, r) + UPDATE_MAP(v, r, MPI_COMM_NULL, comm_desc_t, MPI_Comm) -#define REAL_TO_VIRTUAL_GROUP(id) \ - MpiGroupList::instance("MpiGroup", MPI_GROUP_NULL).realToVirtual(id) +#define DESC_TO_VIRTUAL_GROUP(desc) \ + DESC_TO_VIRTUAL(desc, MPI_GROUP_NULL, MPI_Group) +#define VIRTUAL_TO_DESC_GROUP(id) \ + VIRTUAL_TO_DESC(id, MPI_GROUP_NULL, group_desc_t) #define VIRTUAL_TO_REAL_GROUP(id) \ - MpiGroupList::instance("MpiGroup", MPI_GROUP_NULL).virtualToReal(id) + VIRTUAL_TO_REAL(id, MPI_GROUP_NULL, MPI_Group, group_desc_t) #define ADD_NEW_GROUP(id) \ - MpiGroupList::instance("MpiGroup", MPI_GROUP_NULL).onCreate(id) + ADD_NEW(id, MPI_GROUP_NULL, MPI_Group, group_desc_t, GROUP_MASK) #define REMOVE_OLD_GROUP(id) \ - MpiGroupList::instance("MpiGroup", MPI_GROUP_NULL).onRemove(id) + REMOVE_OLD(id, MPI_GROUP_NULL, group_desc_t, MPI_Group) #define UPDATE_GROUP_MAP(v, r) \ - MpiGroupList::instance("MpiGroup", MPI_GROUP_NULL).updateMapping(v, r) + UPDATE_MAP(v, r, MPI_GROUP_NULL, group_desc_t, MPI_Group) -#define REAL_TO_VIRTUAL_TYPE(id) \ - MpiTypeList::instance("MpiType", MPI_DATATYPE_NULL).realToVirtual(id) +#define DESC_TO_VIRTUAL_TYPE(desc) \ + DESC_TO_VIRTUAL(desc, MPI_DATATYPE_NULL, MPI_Datatype) +#define VIRTUAL_TO_DESC_TYPE(id) \ + VIRTUAL_TO_DESC(id, MPI_DATATYPE_NULL, datatype_desc_t) #define VIRTUAL_TO_REAL_TYPE(id) \ - MpiTypeList::instance("MpiType", MPI_DATATYPE_NULL).virtualToReal(id) + VIRTUAL_TO_REAL(id, MPI_DATATYPE_NULL, 
MPI_Datatype, datatype_desc_t) #define ADD_NEW_TYPE(id) \ - MpiTypeList::instance("MpiType", MPI_DATATYPE_NULL).onCreate(id) + ADD_NEW(id, MPI_DATATYPE_NULL, MPI_Datatype, datatype_desc_t, DATATYPE_MASK) #define REMOVE_OLD_TYPE(id) \ - MpiTypeList::instance("MpiType", MPI_DATATYPE_NULL).onRemove(id) + REMOVE_OLD(id, MPI_DATATYPE_NULL, datatype_desc_t, MPI_Datatype) #define UPDATE_TYPE_MAP(v, r) \ - MpiTypeList::instance("MpiType", MPI_DATATYPE_NULL).updateMapping(v, r) + UPDATE_MAP(v, r, MPI_DATATYPE_NULL, datatype_desc_t, MPI_Datatype) -#define REAL_TO_VIRTUAL_OP(id) \ - MpiOpList::instance("MpiOp", MPI_OP_NULL).realToVirtual(id) +#define DESC_TO_VIRTUAL_OP(desc) \ + DESC_TO_VIRTUAL(desc, MPI_OP_NULL, MPI_Op) +#define VIRTUAL_TO_DESC_OP(id) \ + VIRTUAL_TO_DESC(id, MPI_OP_NULL, op_desc_t) #define VIRTUAL_TO_REAL_OP(id) \ - MpiOpList::instance("MpiOp", MPI_OP_NULL).virtualToReal(id) + VIRTUAL_TO_REAL(id, MPI_OP_NULL, MPI_Op, op_desc_t) #define ADD_NEW_OP(id) \ - MpiOpList::instance("MpiOp", MPI_OP_NULL).onCreate(id) + ADD_NEW(id, MPI_OP_NULL, MPI_Op, op_desc_t, OP_MASK) #define REMOVE_OLD_OP(id) \ - MpiOpList::instance("MpiOp", MPI_OP_NULL).onRemove(id) + REMOVE_OLD(id, MPI_OP_NULL, op_desc_t, MPI_Op) #define UPDATE_OP_MAP(v, r) \ - MpiOpList::instance("MpiOp", MPI_OP_NULL).updateMapping(v, r) + UPDATE_MAP(v, r, MPI_OP_NULL, op_desc_t, MPI_Op) -#define REAL_TO_VIRTUAL_COMM_KEYVAL(id) \ - MpiOpList::instance("MpiCommKeyval", 0).realToVirtual(id) +// FIXME: Earlier, I was under the impression that we didn't need to virtualize +// communicator keyvals anymore, but without these, VASP5 DBG will not run. So, +// what is the case? 
+#define DESC_TO_VIRTUAL_COMM_KEYVAL(desc) \ + DESC_TO_VIRTUAL(desc, 0, int) +#define VIRTUAL_TO_DESC_COMM_KEYVAL(id) \ + VIRTUAL_TO_DESC(id, 0, comm_keyval_desc_t) #define VIRTUAL_TO_REAL_COMM_KEYVAL(id) \ - MpiOpList::instance("MpiCommKeyval", 0).virtualToReal(id) + VIRTUAL_TO_REAL(id, 0, int, comm_keyval_desc_t) #define ADD_NEW_COMM_KEYVAL(id) \ - MpiOpList::instance("MpiCommKeyval", 0).onCreate(id) + ADD_NEW(id, 0, int, comm_keyval_desc_t, COMM_KEYVAL_MASK) #define REMOVE_OLD_COMM_KEYVAL(id) \ - MpiOpList::instance("MpiCommKeyval", 0).onRemove(id) + REMOVE_OLD(id, 0, comm_keyval_desc_t, int) #define UPDATE_COMM_KEYVAL_MAP(v, r) \ - MpiOpList::instance("MpiCommKeyval", 0).updateMapping(v, r) + UPDATE_MAP(v, r, 0, comm_keyval_desc_t, int) -#if 1 -#define REAL_TO_VIRTUAL_REQUEST(id) \ - MpiRequestList::instance("MpiRequest", MPI_REQUEST_NULL).realToVirtual(id) +#define DESC_TO_VIRTUAL_REQUEST(desc) \ + DESC_TO_VIRTUAL(desc, MPI_REQUEST_NULL, MPI_Request) +#define VIRTUAL_TO_DESC_REQUEST(id) \ + VIRTUAL_TO_DESC(id, MPI_REQUEST_NULL, request_desc_t) #define VIRTUAL_TO_REAL_REQUEST(id) \ - MpiRequestList::instance("MpiRequest", MPI_REQUEST_NULL).virtualToReal(id) + VIRTUAL_TO_REAL(id, MPI_REQUEST_NULL, MPI_Request, request_desc_t) #define ADD_NEW_REQUEST(id) \ - MpiRequestList::instance("MpiRequest", MPI_REQUEST_NULL).onCreate(id) + ADD_NEW(id, MPI_REQUEST_NULL, MPI_Request, request_desc_t, REQUEST_MASK) #define REMOVE_OLD_REQUEST(id) \ - MpiRequestList::instance("MpiRequest", MPI_REQUEST_NULL).onRemove(id) + REMOVE_OLD(id, MPI_REQUEST_NULL, request_desc_t, MPI_Request) #define UPDATE_REQUEST_MAP(v, r) \ - MpiRequestList::instance("MpiRequest", MPI_REQUEST_NULL).updateMapping(v, r) -#else -#define VIRTUAL_TO_REAL_REQUEST(id) id -#define ADD_NEW_REQUEST(id) id -#define UPDATE_REQUEST_MAP(v, r) r -#endif - -namespace dmtcp_mpi -{ - - template - class MpiVirtualization - { - public: -#ifdef JALIB_ALLOCATOR - static void* operator new(size_t nbytes, void* p) { return p; 
} - static void* operator new(size_t nbytes) { JALLOC_HELPER_NEW(nbytes); } - static void operator delete(void* p) { JALLOC_HELPER_DELETE(p); } -#endif - static MpiVirtualization& instance(const char *name, T nullId) - { - // FIXME: - // dmtcp_mpi::MpiVirtualization::instance("MpiGroup", 1) - // ._vIdTable.printMaps(true) - // to access _virTableMpiGroup in GDB. - // We need a cleaner way to access it. - if (strcmp(name, "MpiOp") == 0) { - static MpiVirtualization _virTableMpiOp(name, nullId); - return _virTableMpiOp; - } else if (strcmp(name, "MpiComm") == 0) { - static MpiVirtualization _virTableMpiComm(name, nullId); - return _virTableMpiComm; - } else if (strcmp(name, "MpiGroup") == 0) { - static MpiVirtualization _virTableMpiGroup(name, nullId); - return _virTableMpiGroup; - } else if (strcmp(name, "MpiType") == 0) { - static MpiVirtualization _virTableMpiType(name, nullId); - return _virTableMpiType; - } else if (strcmp(name, "MpiCommKeyval") == 0) { - static MpiVirtualization _virTableMpiCommKeyval(name, nullId); - return _virTableMpiCommKeyval; - } else if (strcmp(name, "MpiRequest") == 0) { - static MpiVirtualization _virTableMpiRequest(name, nullId); - return _virTableMpiRequest; - } else if (strcmp(name, "MpiFile") == 0) { - static MpiVirtualization _virTableMpiFile(name, nullId); - return _virTableMpiFile; - } - JWARNING(false)(name)(nullId).Text("Unhandled type"); - static MpiVirtualization _virTableNoSuchObject(name, nullId); - return _virTableNoSuchObject; - } - - T virtualToReal(T virt) - { - // Don't need to virtualize the null id - if (virt == _nullId) { - return virt; - } - // DMTCP virtual id table already does the lock around the table. - // FIXME: Even with an empty map, we are seeing 1 microsecond overhead. - return _vIdTable.virtualToReal(virt); - } - - T realToVirtual(T real) - { - // Don't need to virtualize the null id - if (real == _nullId) { - return real; - } - // DMTCP virtual id table already does the lock around the table. 
- return _vIdTable.realToVirtual(real); - } - - // Adds the given real id to the virtual id table and creates a new - // corresponding virtual id. - // Returns the new virtual id on success, null id otherwise. - T onCreate(T real) - { - T vId = _nullId; - // Don't need to virtualize the null id - if (real == _nullId) { - return vId; - } - // DMTCP virtual id table already does the lock around the table. - if (_vIdTable.realIdExists(real)) { - // Adding a existing real id is a legal operation and - // we should not report warning/error. - // For example, MPI_Comm_group accesses the group associated with - // given communicator. It can be called multiple times from - // different localtions. They should get the same virtual id and - // real id of the same group. - // JWARNING(false)(real)(_vIdTable.getTypeStr()) - // (_vIdTable.realToVirtual(real)) - // .Text("Real id exists. Will overwrite existing mapping"); - vId = _vIdTable.realToVirtual(real); - } else { - if (!_vIdTable.getNewVirtualId(&vId)) { - JWARNING(false)(real)(_vIdTable.getTypeStr()) - .Text("Failed to create a new vId"); - } else { - _vIdTable.updateMapping(vId, real); - } - } - return vId; - } - - // Removes virtual id from table and returns the real id corresponding - // to the virtual id; if the virtual id does not exist in the table, - // returns null id. - T onRemove(T virt) - { - T realId = _nullId; - // Don't need to virtualize the null id - if (virt == _nullId) { - return realId; - } - // DMTCP virtual id table already does the lock around the table. - if (_vIdTable.virtualIdExists(virt)) { - realId = _vIdTable.virtualToReal(virt); - _vIdTable.erase(virt); - } else { - JWARNING(false)(virt)(_vIdTable.getTypeStr()) - .Text("Cannot delete non-existent virtual id"); - } - return realId; - } - - // Updates the mapping for the given virtual id to the given real id. 
- // Returns virtual id on success, null-id otherwise - T updateMapping(T virt, T real) - { - // If the virt is the null id, then return it directly. - // Don't need to virtualize the null id - if (virt == _nullId) { - return _nullId; - } - // DMTCP virtual id table already does the lock around the table. - if (!_vIdTable.virtualIdExists(virt)) { - JWARNING(false)(virt)(real)(_vIdTable.getTypeStr()) - (_vIdTable.realToVirtual(real)) - .Text("Cannot update mapping for a non-existent virt. id"); - return _nullId; - } - _vIdTable.updateMapping(virt, real); - return virt; - } - - private: - // Pvt. constructor - MpiVirtualization(const char *name, T nullId) - : _vIdTable(name, (T)0, (size_t)999999), - _nullId(nullId) - { - } - - // Virtual Ids Table - dmtcp::VirtualIdTable _vIdTable; - // Default "NULL" value for id - T _nullId; - }; // class MpiId - - // FIXME: The new name should be: GlobalIdOfSimiliarComm - class VirtualGlobalCommId { - public: - unsigned int createGlobalId(MPI_Comm comm) { - if (comm == MPI_COMM_NULL) { - return comm; - } - unsigned int gid = 0; - int worldRank, commSize; - int realComm = VIRTUAL_TO_REAL_COMM(comm); - MPI_Comm_rank(MPI_COMM_WORLD, &worldRank); - MPI_Comm_size(comm, &commSize); - int rbuf[commSize]; - // FIXME: Use MPI_Group_translate_ranks instead of Allgather. - // MPI_Group_translate_ranks only executes locally. So we can avoid - // the cost of collective communication - // FIXME: cray cc complains "catastrophic error" that can't find - // split-process.h -#if 1 - DMTCP_PLUGIN_DISABLE_CKPT(); - JUMP_TO_LOWER_HALF(lh_info.fsaddr); - NEXT_FUNC(Allgather)(&worldRank, 1, MPI_INT, - rbuf, 1, MPI_INT, realComm); - RETURN_TO_UPPER_HALF(); - DMTCP_PLUGIN_ENABLE_CKPT(); -#else - MPI_Allgather(&worldRank, 1, MPI_INT, rbuf, 1, MPI_INT, comm); -#endif - for (int i = 0; i < commSize; i++) { - gid ^= hash(rbuf[i] + 1); - } - // FIXME: We assume the hash collision between communicators who - // have different members is low. 
- // FIXME: We want to prune virtual communicators to avoid long - // restart time. - // FIXME: In VASP we observed that for the same virtual communicator - // (adding 1 to each new communicator with the same rank members), - // the virtual group can change over time, using: - // virtual Comm -> real Comm -> real Group -> virtual Group - // We don't understand why since vasp does not seem to free groups. -#if 0 - // FIXME: Some code can create new communicators during execution, - // and so hash conflict may occur later. - // if the new gid already exists in the map, add one and test again - while (1) { - bool found = false; - for (std::pair idPair : globalIdTable) { - if (idPair.second == gid) { - found = true; - break; - } - } - if (found) { - gid++; - } else { - break; - } - } -#endif - globalIdTable[comm] = gid; - return gid; - } - - unsigned int getGlobalId(MPI_Comm comm) { - std::map::iterator it = - globalIdTable.find(comm); - JASSERT(it != globalIdTable.end())(comm) - .Text("Can't find communicator in the global id table"); - return it->second; - } - - static VirtualGlobalCommId& instance() { - static VirtualGlobalCommId _vGlobalId; - return _vGlobalId; - } - - private: - VirtualGlobalCommId() - { - globalIdTable[MPI_COMM_NULL] = MPI_COMM_NULL; - globalIdTable[MPI_COMM_WORLD] = MPI_COMM_WORLD; - } - - void printMap(bool flag = false) { - for (std::pair idPair : globalIdTable) { - if (flag) { - printf("virtual comm: %x, real comm: %x, global id: %x\n", - idPair.first, VIRTUAL_TO_REAL_COMM(idPair.first), - idPair.second); - fflush(stdout); - } else { - JTRACE("Print global id mapping")((void*) (uint64_t) idPair.first) - ((void*) (uint64_t) VIRTUAL_TO_REAL_COMM(idPair.first)) - ((void*) (uint64_t) idPair.second); - } - } - } - // from https://stackoverflow.com/questions/664014/ - // what-integer-hash-function-are-good-that-accepts-an-integer-hash-key - int hash(int i) { - return i * 2654435761 % ((unsigned long)1 << 32); - } - std::map globalIdTable; - }; -}; 
// namespace dmtcp_mpi + UPDATE_MAP(v, r, MPI_REQUEST_NULL, request_desc_t, MPI_Request) + +struct ggid_desc_t { + int ggid; // hashing results of communicator members + + unsigned long seq_num; + + unsigned long target_num; + +}; + +struct comm_desc_t { + MPI_Comm real_id; // Real MPI communicator in the lower-half + int handle; // A copy of the int type handle generated from the address of this struct + int size; // Size of this communicator + int local_rank; // local rank number of this communicator + int *global_ranks; // list of ranks of the group. + ggid_desc_t* ggid_desc; + + // struct virt_group_t *group; // Or should this field be a pointer to virt_group_t? +}; + +struct group_desc_t { + MPI_Group real_id; // Real MPI group in the lower-half + int handle; // A copy of the int type handle generated from the address of this struct + int size; // The size of this group in ranks. + int *global_ranks; // list of ranks of the group. + // unsigned int ggid; // Global Group ID +}; + +enum mpi_request_kind { + COLLECTIVE, + PEER_TO_PEER +}; + +struct request_desc_t { + MPI_Request real_id; // Real MPI request in the lower-half + int handle; // A copy of the int type handle generated from the address of this struct + mpi_request_kind request_kind; // P2P request or collective request + MPI_Status* status; // Real MPI status in the lower-half +}; + +struct op_desc_t { + MPI_Op real_id; // Real MPI operator in the lower-half + int handle; // A copy of the int type handle generated from the address of this struct + MPI_User_function *user_fn; // Function pointer to the user defined op function + int commute; // True if op is commutative. +}; + +struct datatype_desc_t { + MPI_Datatype real_id; // Real MPI type in the lower-half + int handle; // A copy of the int type handle generated from the address of this struct + // Components of user-defined datatype. 
+ int num_integers; + int *integers; + int num_addresses; + MPI_Aint *addresses; + int num_large_counts; + int *large_counts; + int num_datatypes; + MPI_Datatype *datatypes; // hmmm.. hierarchical restore? + int combiner; + // if is_freed is true, then we should not update the descriptor on a checkpoint. + bool is_freed; +}; + +struct file_desc_t { + MPI_File real_id; + int handle; +}; + +struct comm_keyval_desc_t { + int real_id; + int handle; +}; + +// FIXME: Some of these structs (request_desc_t, file_desc_t, +// comm_keyval_desc_t) Are just very thin wrappers around a real id, with no +// other information. So, should these be virtualized after all? In an "#else" +// branch of an "#if 1" in the main code, VIRTUAL_TO_REAL_REQUEST(id) is +// defined as just id, for instance. +union id_desc_t { + comm_desc_t comm; + group_desc_t group; + request_desc_t request; + op_desc_t op; + datatype_desc_t datatype; + file_desc_t file; + comm_keyval_desc_t comm_keyval; + + operator comm_desc_t () const { return comm; } + operator group_desc_t () const { return group; } + operator request_desc_t () const { return request; } + operator op_desc_t () const { return op; } + operator datatype_desc_t () const { return datatype; } + operator file_desc_t () const { return file; } + operator comm_keyval_desc_t () const { return comm_keyval; } +}; + +extern std::map idDescriptorTable; +extern std::map ggidDescriptorTable; +extern std::map lh_constants_map; +extern int base; +extern int nextvId; +extern MPI_Group g_world_group; +typedef typename std::map::iterator id_desc_iterator; +typedef std::pair id_desc_pair; +typedef typename std::map::iterator ggid_desc_iterator; +typedef std::pair ggid_desc_pair; + +id_desc_t* virtualToDescriptor(int virtId); + +int getggid(MPI_Comm comm); +int hash(int i); + +datatype_desc_t* init_datatype_desc_t(MPI_Datatype realType); +op_desc_t* init_op_desc_t(MPI_Op realOp); +request_desc_t* init_request_desc_t(MPI_Request realReq); +group_desc_t* 
init_group_desc_t(MPI_Group realGroup); +comm_desc_t* init_comm_desc_t(MPI_Comm realComm); +file_desc_t* init_file_desc_t(MPI_File realFile); + +void destroy_datatype_desc_t(datatype_desc_t* datatype); +void destroy_op_desc_t(op_desc_t* op); +void destroy_request_desc_t(request_desc_t* request); +void destroy_group_desc_t(group_desc_t* group); +void destroy_comm_desc_t(comm_desc_t* comm); +void destroy_file_desc_t(file_desc_t* file); + +void init_comm_world(); +void grant_ggid(MPI_Comm virtualComm); + +void update_datatype_desc_t(datatype_desc_t* datatype); +void update_op_desc_t(op_desc_t* op, MPI_User_function* user_fn, int commute); +void update_request_desc_t(request_desc_t* request); +void update_group_desc_t(group_desc_t* group); +void update_comm_desc_t(comm_desc_t* comm); +void update_file_desc_t(file_desc_t* file); + +void reconstruct_with_datatype_desc_t(datatype_desc_t* datatype); +void reconstruct_with_op_desc_t(op_desc_t* op); +void reconstruct_with_request_desc_t(request_desc_t* request); +void reconstruct_with_group_desc_t(group_desc_t* group); +void reconstruct_with_comm_desc_t(comm_desc_t* comm); +void reconstruct_with_file_desc_t(file_desc_t* file); + +void update_descriptors(); +void reconstruct_with_descriptors(); + +void destroy_g_world_group(); +void write_g_world_group(); + +void init_lh_constants_map(); #endif // ifndef MPI_VIRTUAL_IDS_H diff --git a/restart_plugin/mtcp_restart_plugin.h b/restart_plugin/mtcp_restart_plugin.h index 6c3a8da1b..c57e1af34 100644 --- a/restart_plugin/mtcp_restart_plugin.h +++ b/restart_plugin/mtcp_restart_plugin.h @@ -48,6 +48,8 @@ typedef struct LowerHalfInfo void *g_appContext; void *lh_dlsym; void *getRankFptr; + void *lh_mpi_constants; + #ifdef SINGLE_CART_REORDER void *getCoordinatesFptr; void *getCartesianCommunicatorFptr; diff --git a/restart_plugin/mtcp_split_process.c b/restart_plugin/mtcp_split_process.c index 8751ed496..947eb21ac 100644 --- a/restart_plugin/mtcp_split_process.c +++ 
b/restart_plugin/mtcp_split_process.c @@ -371,8 +371,8 @@ initializeLowerHalf(RestoreInfo *rinfo) auxvec = (ElfW(auxv_t) *) evp; } // update vDSO linkmap entry to the temporary address - updateVdsoLinkmapEntry(rinfo->currentVdsoStart, - rinfo->pluginInfo.vdsoLdAddrInLinkMap); + // updateVdsoLinkmapEntry(rinfo->currentVdsoStart, + // rinfo->pluginInfo.vdsoLdAddrInLinkMap); JUMP_TO_LOWER_HALF(rinfo->pluginInfo.fsaddr); (*resetMmaps)(); // Set the auxiliary vector to correspond to the values of the lower half