From c9b5919bee6308e63eca8dd5cfd56b06aa6174a5 Mon Sep 17 00:00:00 2001 From: Gene Cooperman Date: Thu, 17 Aug 2023 13:18:04 -0400 Subject: [PATCH 1/3] Improve documentation of reserving lh_memRange --- mpi-proxy-split/lower-half/mmap64.c | 22 ++++++---------------- 1 file changed, 6 insertions(+), 16 deletions(-) diff --git a/mpi-proxy-split/lower-half/mmap64.c b/mpi-proxy-split/lower-half/mmap64.c index 324673cb2..d82a6dda2 100644 --- a/mpi-proxy-split/lower-half/mmap64.c +++ b/mpi-proxy-split/lower-half/mmap64.c @@ -42,12 +42,6 @@ // memory regions using mmaps[] _and_ shms[]. // FIXME: We do not catch any changes in /proc/PID/maps due to ioctl. -// FIXME: When we know this works, remove this preprocessor variable and make -// this permanent. Right now, this causes us to reserve the lh_mem_range -// in two places (during launch and during restart). We should refactor -// to do this in only one place. -#define RESERVE_LH_MEM_RANGE - #define HAS_MAP_FIXED_NOREPLACE LINUX_VERSION_CODE >= KERNEL_VERSION(4, 17, 0) #ifdef __NR_mmap2 @@ -141,22 +135,19 @@ resetMmappedList() } numRegions = 0; -#ifdef RESERVE_LH_MEM_RANGE - // We will reserve the lh_memRange memory, so no one can steal it from us. + /************************************************************************** + * Next, We reserve the lh_memRange memory, so no one can steal it from us. + * The upper half (../mtcp_split_process.c) sets lh_memRange.start to + * 0x2aab00000000, and we assume that no one else uses that address range. + * NPTE: We reserve the lh_memRange memory in two places: launch and restart. + **************************************************************************/ size_t length = lh_memRange.end - lh_memRange.start; #if HAS_MAP_FIXED_NOREPLACE // Use inline syscall here, to avoid our __mmap64 wrapper. - // FIXME: Keep only the "dev/zero" '#else' when it's been tested. -# if 0 - void *rc = LH_MMAP_CALL(lh_memRange.start, length, - PROT_WRITE, - MAP_PRIVATE|MAP_ANONYMOUS|MAP_FIXED_NOREPLACE, -1, 0); -# else int fd = open("/dev/zero", O_RDONLY); void *rc = LH_MMAP_CALL(lh_memRange.start, length, PROT_NONE, MAP_SHARED|MAP_ANONYMOUS|MAP_FIXED_NOREPLACE, fd, 0); -# endif if (rc == MAP_FAILED) { char msg[] = "*** Panic: MANA lower half: can't reserve lh_memRange" " using MAP_FIXED_NOREPLACE\n"; @@ -181,7 +172,6 @@ resetMmappedList() // only later, when mmap64 takes a piece.of it. // memset(lh_memRange.start, CANARY_BYTE, length); mprotect(lh_memRange.start, length, PROT_NONE); -#endif } int From 4aec641ab01dc882ee3ca622185bed052385d00a Mon Sep 17 00:00:00 2001 From: Gene Cooperman Date: Thu, 17 Aug 2023 14:09:45 -0400 Subject: [PATCH 2/3] numRegions->numMmapRegions; mmaps[]: Add sentinel --- mpi-proxy-split/lower-half/mmap64.c | 49 ++++++++++++++-------- mpi-proxy-split/lower-half/mmap_internal.h | 5 +-- mpi-proxy-split/lower-half/munmap.c | 2 +- mpi-proxy-split/lower-half/shmat.c | 14 +++++-- 4 files changed, 44 insertions(+), 26 deletions(-) diff --git a/mpi-proxy-split/lower-half/mmap64.c b/mpi-proxy-split/lower-half/mmap64.c index d82a6dda2..a8b10299b 100644 --- a/mpi-proxy-split/lower-half/mmap64.c +++ b/mpi-proxy-split/lower-half/mmap64.c @@ -94,7 +94,8 @@ // 2. Make the size of list dynamic // Number of valid objects in the 'mmaps' list below -int numRegions = 0; +// For i >= numMmapRegions, mmaps[i].addr == MAP_FAILED. +int numMmapRegions = 0; #ifdef LIBMMAP_SO // Lower-half memory range to use. @@ -113,7 +114,7 @@ MmapInfo_t* getMmappedList(int **num) { if (!num) return NULL; - *num = &numRegions; + *num = &numMmapRegions; return mmaps; } @@ -130,10 +131,14 @@ void resetMmappedList() { // Alternatively, just do: memset((void *)mmaps, 0, sizeof(mmaps)); - for (int i = 0; i < numRegions; i++) { + for (int i = 0; i < numMmapRegions; i++) { memset(&mmaps[i], 0, sizeof(mmaps[i])); } - numRegions = 0; + numMmapRegions = 0; + // All mmaps[] now unused. Mark all mmaps with MAP_FAILED sentinel, for GDB + for (int i = 0; i < MAX_TRACK; i++) { + mmaps[i].addr = MAP_FAILED; + } /************************************************************************** * Next, We reserve the lh_memRange memory, so no one can steal it from us. @@ -177,7 +182,7 @@ resetMmappedList() int getMmapIdx(const void *addr) { - for (int i = 0; i < numRegions; i++) { + for (int i = 0; i < numMmapRegions; i++) { if (mmaps[i].addr == addr) { return i; } @@ -217,7 +222,7 @@ getNextAddr(size_t len) static int extendExistingMmap(const void *addr) { - for (int i = 0; i < numRegions; i++) { + for (int i = 0; i < numMmapRegions; i++) { if (addr >= mmaps[i].addr && addr <= mmaps[i].addr + mmaps[i].len) { return i; } @@ -391,28 +396,36 @@ __mmap64 (void *addr, size_t len, int prot, int flags, int fd, __off_t offset) mmaps[idx].unmapped = 0; mmaps[idx].dontuse = 0; } else { + // This tries to find an earlier unmapped region, and re-use it. int idx2 = extendExistingMmap(ret); if (idx2 != -1) { size_t length = ROUND_UP(len) + ((char*)ret - (char*)mmaps[idx2].addr); mmaps[idx2].len = length > mmaps[idx2].len ? length : mmaps[idx2].len; mmaps[idx2].unmapped = 0; idx = idx2; - } else { - mmaps[numRegions].addr = ret; - mmaps[numRegions].len = ROUND_UP(len); - mmaps[numRegions].unmapped = 0; - idx = numRegions; - numRegions = (numRegions + 1) % MAX_TRACK; - // FIXME: A better fix, below, is to set nextFreeAddr back to the start, - // and then to test the size of any gap. That can be done - // by checking each known region. Hopefully, there are not many. - // Or at least, we can re-use any unmapped regions. - if (numRegions == 0) { + } else { // Else use a new region at end. + mmaps[numMmapRegions].addr = ret; + mmaps[numMmapRegions].len = ROUND_UP(len); + mmaps[numMmapRegions].unmapped = 0; + idx = numMmapRegions; + numMmapRegions = numMmapRegions + 1; + // FIXME: Since lh_memRange is located at an address range that doesn't + // conflict with other addresses, a better fix would be to use + // LH_MMAP_CALL(lh_memRange.end, additional_length, PROT_NONE, + // MAP_SHARED|MAP_ANONYMOUS|MAP_FIXED_NOREPLACE, fd, 0) + // for fd pointing to /dev/zero, and so to contiguously extend + // our reserved mmaps[] region, and then adjust MAX_TRACK. + if (numMmapRegions >= MAX_TRACK - 2) { // Keep a MAP_FAILED sentinel char msg[] = "*** Panic: MANA lower half: no more space for mmap.\n"; - write(2, msg, sizeof(msg)); assert(numRegions > 0); + write(2, msg, sizeof(msg)); } + assert(numMmapRegions < MAX_TRACK - 2); // MAX_TRACK-2 for guard page } } + if (mmaps[idx+1].addr != MAP_FAILED) { + // Create an obvious sentinel for ease of debugging in GDB + mmaps[idx+1].addr = mmaps[idx+2].addr = MAP_FAILED; + } if (extraGuardPage) { mmaps[idx].guard = 1; void *lastPage = (char*)ret + ROUND_UP(totalLen) - PAGE_SIZE; diff --git a/mpi-proxy-split/lower-half/mmap_internal.h b/mpi-proxy-split/lower-half/mmap_internal.h index 54691c95a..05ca5a721 100644 --- a/mpi-proxy-split/lower-half/mmap_internal.h +++ b/mpi-proxy-split/lower-half/mmap_internal.h @@ -191,8 +191,7 @@ static uint64_t page_unit; /* * Note that many of these functions/variables have a corresponding * function/variable in the upper half. The correspondences are: - * FIXME: rename numRegions to numMmapRegions - * int numRegions -> int *g_numMmaps + * int numMmapRegions -> int *g_numMmaps * MmapInfo_t mmaps[MAX_TRACK] -> MmapInfo_t *g_list * FIXME: Multiple changes needed; Also rename it to: lh_core_regions_list * LhCoreRegions_t lh_regions_list[MAX_LH_REGIONS] @@ -235,7 +234,7 @@ void* getNextAddr(size_t ); // TODO: // 1. Make the size of list dynamic #define MAX_TRACK 1000 -extern int numRegions; +extern int numMmapRegions; extern MmapInfo_t mmaps[MAX_TRACK]; extern void *nextFreeAddr; diff --git a/mpi-proxy-split/lower-half/munmap.c b/mpi-proxy-split/lower-half/munmap.c index 80dd3db6e..4f795f7f4 100644 --- a/mpi-proxy-split/lower-half/munmap.c +++ b/mpi-proxy-split/lower-half/munmap.c @@ -51,7 +51,7 @@ alignedWithLastPage(const void *haystackStart, static int getOverlappingRegion(void *addr, size_t len) { - for (int i = 0; i < numRegions; i++) { + for (int i = 0; i < numMmapRegions; i++) { void *rstart = mmaps[i].addr; void *rend = (char*)mmaps[i].addr + mmaps[i].len; void *ustart = addr; diff --git a/mpi-proxy-split/lower-half/shmat.c b/mpi-proxy-split/lower-half/shmat.c index d2121bb71..1d0fe3128 100644 --- a/mpi-proxy-split/lower-half/shmat.c +++ b/mpi-proxy-split/lower-half/shmat.c @@ -49,10 +49,16 @@ __wrap_shmat(int shmid, const void *shmaddr, int shmflg) mmaps[idx].len = len; mmaps[idx].unmapped = 0; } else { - mmaps[numRegions].addr = ret; - mmaps[numRegions].len = len; - mmaps[numRegions].unmapped = 0; - numRegions = (numRegions + 1) % MAX_TRACK; + mmaps[numMmapRegions].addr = ret; + mmaps[numMmapRegions].len = len; + mmaps[numMmapRegions].unmapped = 0; + numMmapRegions = numMmapRegions + 1; + // FIXME: shmat() is not creating a guard page, unlike mmap(). + if (numMmapRegions >= MAX_TRACK - 2) { // Keep MAP_FAILED sentinel + char msg[] = "*** Panic: MANA lower half: no more space for mmap.\n"; + write(2, msg, sizeof(msg)); + } + assert(numMmapRegions < MAX_TRACK - 2); //MAX_TRACK-2 in case guard page } } return ret; From 2a5cfbf5be23669b4bb8bc26155e151cfff63ef8 Mon Sep 17 00:00:00 2001 From: Gene Cooperman Date: Thu, 17 Aug 2023 14:35:33 -0400 Subject: [PATCH 3/3] Add JASSERT of lh regions before computeUnion...; --- mpi-proxy-split/mpi_plugin.cpp | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/mpi-proxy-split/mpi_plugin.cpp b/mpi-proxy-split/mpi_plugin.cpp index 0141322e8..a86dcf7e9 100644 --- a/mpi-proxy-split/mpi_plugin.cpp +++ b/mpi-proxy-split/mpi_plugin.cpp @@ -1101,6 +1101,7 @@ mpi_plugin_event_hook(DmtcpEvent_t event, DmtcpEventData_t *data) } case DMTCP_EVENT_PRESUSPEND: { + // User threads are not yet suspended. mana_state = CKPT_COLLECTIVE; // preSuspendBarrier() will send coord response and get worker state. // FIXME: See commant at: dmtcpplugin.cpp:'case DMTCP_EVENT_PRESUSPEND' @@ -1110,6 +1111,7 @@ mpi_plugin_event_hook(DmtcpEvent_t event, DmtcpEventData_t *data) } case DMTCP_EVENT_PRECHECKPOINT: { + // User threads are now suspended. recordMpiInitMaps(); recordOpenFds(); dmtcp_local_barrier("MPI:GetLocalLhMmapList"); @@ -1123,6 +1125,13 @@ mpi_plugin_event_hook(DmtcpEvent_t event, DmtcpEventData_t *data) registerLocalSendsAndRecvs(); // p2p_drain_send_recv.cpp dmtcp_global_barrier("MPI:Drain-Send-Recv"); drainSendRecv(); // p2p_drain_send_recv.cpp + // Pointers to lh for g_list and g_numMmaps were set by getLhMmapList() + JASSERT(g_list[*g_numMmaps-1].addr != MAP_FAILED && + ( g_list[*g_numMmaps].addr == MAP_FAILED /* sentinel */ || + ( g_list[*g_numMmaps].guard == 1 && + g_list[*g_numMmaps].addr == MAP_FAILED ))) + (*g_numMmaps) (g_list[*g_numMmaps-1].addr) + (g_list[*g_numMmaps].addr) (g_list[*g_numMmaps].addr); computeUnionOfCkptImageAddresses(); dmtcp_global_barrier("MPI:save-mana-header-and-mpi-files"); const char *file = get_mana_header_file_name();