From e5f33b588a80a201e8b389ed364b318d0329c633 Mon Sep 17 00:00:00 2001 From: Patrick Lavin Date: Tue, 2 Apr 2024 08:31:29 -0700 Subject: [PATCH] Add ability for Ariel to trace MPI applications --- src/sst/elements/ariel/Makefile.am | 3 + src/sst/elements/ariel/api/Makefile | 4 +- src/sst/elements/ariel/api/arielapi.c | 66 +++++ src/sst/elements/ariel/api/arielapi.h | 5 + src/sst/elements/ariel/arielcpu.h | 4 + .../elements/ariel/frontend/pin3/fesimple.cc | 114 ++++++++- .../ariel/frontend/pin3/pin3frontend.cc | 83 ++++++- .../ariel/frontend/pin3/pin3frontend.h | 10 + .../refFiles/test_Ariel_ariel_snb_mlm.out | 100 ++++---- src/sst/elements/ariel/mpi/.gitignore | 6 + src/sst/elements/ariel/mpi/Makefile | 28 +++ src/sst/elements/ariel/mpi/ariel-reduce.py | 117 +++++++++ src/sst/elements/ariel/mpi/arielapi.h | 33 +++ src/sst/elements/ariel/mpi/demo1.py | 101 ++++++++ src/sst/elements/ariel/mpi/fakepin.cc | 47 ++++ src/sst/elements/ariel/mpi/hello.cc | 75 ++++++ src/sst/elements/ariel/mpi/libarielapi.c | 35 +++ src/sst/elements/ariel/mpi/mpilauncher.cc | 197 +++++++++++++++ src/sst/elements/ariel/mpi/reduce.cc | 103 ++++++++ .../elements/ariel/tests/testMPI/.gitignore | 5 + src/sst/elements/ariel/tests/testMPI/Makefile | 18 ++ .../elements/ariel/tests/testMPI/fakepin.cc | 47 ++++ src/sst/elements/ariel/tests/testMPI/hello.cc | 76 ++++++ .../elements/ariel/tests/testMPI/reduce.cc | 122 +++++++++ .../elements/ariel/tests/testMPI/test-mpi.py | 130 ++++++++++ .../ariel/tests/testsuite_default_Ariel.py | 5 +- .../ariel/tests/testsuite_mpi_Ariel.py | 231 ++++++++++++++++++ .../ariel/tests/testsuite_testio_Ariel.py | 2 +- 28 files changed, 1712 insertions(+), 55 deletions(-) create mode 100644 src/sst/elements/ariel/mpi/.gitignore create mode 100644 src/sst/elements/ariel/mpi/Makefile create mode 100644 src/sst/elements/ariel/mpi/ariel-reduce.py create mode 100644 src/sst/elements/ariel/mpi/arielapi.h create mode 100644 src/sst/elements/ariel/mpi/demo1.py create mode 100644 src/sst/elements/ariel/mpi/fakepin.cc create mode 100644 src/sst/elements/ariel/mpi/hello.cc create mode 100644 src/sst/elements/ariel/mpi/libarielapi.c create mode 100644 src/sst/elements/ariel/mpi/mpilauncher.cc create mode 100644 src/sst/elements/ariel/mpi/reduce.cc create mode 100644 src/sst/elements/ariel/tests/testMPI/.gitignore create mode 100644 src/sst/elements/ariel/tests/testMPI/Makefile create mode 100644 src/sst/elements/ariel/tests/testMPI/fakepin.cc create mode 100644 src/sst/elements/ariel/tests/testMPI/hello.cc create mode 100644 src/sst/elements/ariel/tests/testMPI/reduce.cc create mode 100644 src/sst/elements/ariel/tests/testMPI/test-mpi.py create mode 100644 src/sst/elements/ariel/tests/testsuite_mpi_Ariel.py diff --git a/src/sst/elements/ariel/Makefile.am b/src/sst/elements/ariel/Makefile.am index a664f834ad..0726342b85 100644 --- a/src/sst/elements/ariel/Makefile.am +++ b/src/sst/elements/ariel/Makefile.am @@ -76,6 +76,8 @@ EXTRA_DIST = \ frontend/simple/examples/stream/tests/refFiles/test_Ariel_runstreamNB.out \ frontend/simple/examples/stream/tests/refFiles/test_Ariel_runstreamSt.out \ tests/testsuite_default_Ariel.py \ + tests/testsuite_testio_Ariel.py \ + tests/testsuite_mpi_Ariel.py \ tests/testopenMP/ompmybarrier/ompmybarrier.c \ tests/testopenMP/ompmybarrier/Makefile @@ -104,6 +106,7 @@ libariel_la_CPPFLAGS = \ -DARIEL_TRACE_LIB=$(libdir)/sst \ -DARIEL_TOOL_DIR="$(libexecdir)" \ -DPINTOOL_EXECUTABLE="$(PINTOOL_RUNTIME)" \ + -DMPILAUNCHER_EXECUTABLE="$(CURDIR)/mpi/mpilauncher" \ $(AM_CPPFLAGS) 
$(CPPFLAGS) sstdir = $(includedir)/sst/elements/ariel diff --git a/src/sst/elements/ariel/api/Makefile b/src/sst/elements/ariel/api/Makefile index 18e9a6ffd2..584a966ef3 100644 --- a/src/sst/elements/ariel/api/Makefile +++ b/src/sst/elements/ariel/api/Makefile @@ -1,10 +1,10 @@ all: libarielapi.so libarielapi.a libarielapi.so: arielapi.c arielapi.h - $(CC) -fPIC -shared -o libarielapi.so arielapi.c + $(CC) -fPIC -shared -o libarielapi.so arielapi.c -fopenmp arielapi.o: arielapi.c arielapi.h - $(CC) -c -o arielapi.o arielapi.c + $(CC) -c -o arielapi.o arielapi.c -fopenmp libarielapi.a: arielapi.o ar rcs $@ $^ diff --git a/src/sst/elements/ariel/api/arielapi.c b/src/sst/elements/ariel/api/arielapi.c index 11e75f26e1..ee5e1976d6 100644 --- a/src/sst/elements/ariel/api/arielapi.c +++ b/src/sst/elements/ariel/api/arielapi.c @@ -15,7 +15,12 @@ #include "arielapi.h" #include +#include #include +#if __has_include() +#include +#define HAVE_MPI_H +#endif /* These definitions are replaced during simulation */ @@ -23,6 +28,10 @@ void ariel_enable() { printf("ARIEL: ENABLE called in Ariel API.\n"); } +void ariel_disable() { + printf("ARIEL: DISABLE called in Ariel API.\n"); +} + void ariel_fence() { printf("ARIEL: FENCE called in Ariel API.\n"); } @@ -38,3 +47,60 @@ void ariel_output_stats() { void ariel_malloc_flag(int64_t id, int count, int level) { printf("ARIEL: flagging next %d mallocs at id %" PRId64 "\n", count, id); } + +// To ensure that the Pintool (fesimple.cc) numbers our application's OpenMP threads +// from 0..N-1, we need to run an OpenMP parallel region before calling MPI Init. +// Otherwise, some MPI threads which aren't used for our application will be +// numbered 1 and 2. +void omp_parallel_region() { + volatile int x = 0; +#if defined(_OPENMP) +#pragma omp parallel + { +#pragma omp critical + { + x += 1; + } + } +#else + printf("ERROR: libarielapi.c: libarielapi was compiled without OpenMP enabled\n"); + exit(1); +#endif +} + +// This function only exists to get mapped by the frontend. It should only be called +// from MPI_Init or MPI_Init_thread to allow the frontend to distinguish between our +// custom versions of of those functions and the normal MPI library's versions. +int _api_mpi_init() { + printf("notifying fesimple\n"); +} + +// Custom version of MPI_Init. We override the normal version in order to call an +// OpenMP parallel region to ensure threads are numbered properly by the frontend. +int MPI_Init(int *argc, char ***argv) { +#ifdef HAVE_MPI_H + // Communicate to the frontend that we have replaced the nomal MPI_Init with + // the one in the Ariel API + _api_mpi_init(); + omp_parallel_region(); + return PMPI_Init(argc, argv); +#else + printf("Error: arielapi.c: MPI_Init called in arielapi.c but this file was compiled without MPI. Please recompile the API with `CC=mpicc make`.\n"); + exit(1); +#endif +} + +// Custom version of MPI_Init_thread. We override the normal verison in order to call an +// OpenMP parallel region to ensure threads are numbered properly by the frontend. +int MPI_Init_thread(int *argc, char ***argv, int required, int *provided) { +#ifdef HAVE_MPI_H + // Communicate to the frontend that we have replaced the nomal MPI_Init_thread with + // the one in the Ariel API + _api_mpi_init(); + omp_parallel_region(); + return PMPI_Init_thread(argc, argv, required, provided); +#else + printf("Error: arielapi.c: MPI_Init_thread called in arielapi.c but this file was compiled without MPI. 
Please recompile the API with `CC=mpicc make`.\n"); + exit(1); +#endif +} diff --git a/src/sst/elements/ariel/api/arielapi.h b/src/sst/elements/ariel/api/arielapi.h index 0dbbc700b7..9fdb978c70 100644 --- a/src/sst/elements/ariel/api/arielapi.h +++ b/src/sst/elements/ariel/api/arielapi.h @@ -32,6 +32,11 @@ extern "C" { */ void ariel_enable(); +/* Disable simulation when this fucntion is encountered. + * Works regardless of the 'arielmode' parameter. + */ +void ariel_disable(); + /* Execute a fence */ void ariel_fence(); diff --git a/src/sst/elements/ariel/arielcpu.h b/src/sst/elements/ariel/arielcpu.h index 1231720ccb..b083f306df 100644 --- a/src/sst/elements/ariel/arielcpu.h +++ b/src/sst/elements/ariel/arielcpu.h @@ -63,6 +63,10 @@ class ArielCPU : public SST::Component { {"appstderrappend", "If appstderr is set, set this to 1 to append the file intead of overwriting", "0"}, {"launchparamcount", "Number of parameters supplied for the launch tool", "0" }, {"launchparam%(launchparamcount)d", "Set the parameter to the launcher", "" }, + {"mpimode", "Whether to use to to launch in order to trace MPI-enabled applications.", "0"}, + {"mpilauncher", "Specify a launcher to be used for MPI executables in conjuction with ", STRINGIZE(MPILAUNCHER_EXECUTABLE)}, + {"mpiranks", "Number of ranks to be launched by . Only will be traced by .", "1" }, + {"mpitracerank", "Rank to be traced by .", "0" }, {"envparamcount", "Number of environment parameters to supply to the Ariel executable, default=-1 (use SST environment)", "-1"}, {"envparamname%(envparamcount)d", "Sets the environment parameter name", ""}, {"envparamval%(envparamcount)d", "Sets the environment parameter value", ""}, diff --git a/src/sst/elements/ariel/frontend/pin3/fesimple.cc b/src/sst/elements/ariel/frontend/pin3/fesimple.cc index f74e0a1b34..cf43627242 100644 --- a/src/sst/elements/ariel/frontend/pin3/fesimple.cc +++ b/src/sst/elements/ariel/frontend/pin3/fesimple.cc @@ -33,6 +33,12 @@ #include "builtin_types.h" #endif +#if __has_include() +#include +#define HAVE_MPI_H +#endif + + // TODO add check for PinCRT compatible libz and try to pick that up /*#ifdef HAVE_PINCRT_LIBZ @@ -130,11 +136,15 @@ GpuDataTunnel *tunnelD = NULL; // Time function interception struct timeval offset_tv; +int timer_initialized = 0; #if !defined(__APPLE__) struct timespec offset_tp_mono; struct timespec offset_tp_real; #endif +// MPI +int api_mpi_init_used = 0; + /****************************************************************/ /********************** SHADOW STACK ****************************/ /* Used by 'sieve' to associate mallocs to the code they */ @@ -367,6 +377,7 @@ VOID WriteInstructionRead(ADDRINT* address, UINT32 readSize, THREADID thr, ADDRI ac.inst.instClass = instClass; ac.inst.simdElemCount = simdOpWidth; + //printf("fesimple.cc: patrick: writing end instuction marker\n"); tunnel->writeMessage(thr, ac); } @@ -409,6 +420,7 @@ VOID WriteStartInstructionMarker(UINT32 thr, ADDRINT ip, UINT32 instClass, UINT3 ac.instPtr = (uint64_t) ip; ac.inst.simdElemCount = simdOpWidth; ac.inst.instClass = instClass; + //printf("fesimple.cc: patrick: writing start instuction marker\n"); tunnel->writeMessage(thr, ac); } @@ -417,6 +429,7 @@ VOID WriteEndInstructionMarker(UINT32 thr, ADDRINT ip) ArielCommand ac; ac.command = ARIEL_END_INSTRUCTION; ac.instPtr = (uint64_t) ip; + //printf("fesimple.cc: patrick: writing end instuction marker\n"); tunnel->writeMessage(thr, ac); } @@ -664,6 +677,57 @@ void mapped_ariel_enable() return; } + // Setup timers to count start time 
+ elapsed simulated time + // Only do this the first time Ariel is enabled + if (!timer_initialized) { + timer_initialized = 1; + struct timeval tvsim; + gettimeofday(&offset_tv, NULL); + tunnel->getTime(&tvsim); + offset_tv.tv_sec -= tvsim.tv_sec; + offset_tv.tv_usec -= tvsim.tv_usec; +#if ! defined(__APPLE__) + struct timespec tpsim; + tunnel->getTimeNs(&tpsim); + offset_tp_mono.tv_sec = tvsim.tv_sec - tpsim.tv_sec; + offset_tp_mono.tv_nsec = (tvsim.tv_usec * 1000) - tpsim.tv_nsec; + offset_tp_real.tv_sec = tvsim.tv_sec - tpsim.tv_sec; + offset_tp_real.tv_nsec = (tvsim.tv_usec * 1000) - tpsim.tv_nsec; +#endif + /* ENABLE */ + } + enable_output = true; + + /* UNLOCK */ + PIN_ReleaseLock(&mainLock); + + fprintf(stderr, "ARIEL: Enabling memory and instruction tracing from program control at simulated Ariel cycle %" PRIu64 ".\n", + tunnel->getCycles()); + fflush(stdout); + fflush(stderr); +} + +/* Intercept ariel_disable() in application & start simulating instructions */ +void mapped_ariel_disable() +{ + + // Note + // By adding clock offset calculation, this function now has visible side-effects when called more than once + // In most cases won't matter -> ariel_disable() called once or close together in time so offsets will stabilize quickly + // In some cases could cause a big jump in time in the middle of simulation -> ariel_disable() left in app but mode is always-on + // So, update ariel_disable & offsets in lock & don't update if already enabled + + /* LOCK */ + THREADID thr = PIN_ThreadId(); + PIN_GetLock(&mainLock, thr); + + if (!enable_output) { + PIN_ReleaseLock(&mainLock); + return; + } + + // TODO: I just copied these timers from enable. Need to figure out what to do with them here + // so that we can re-enable properly later. // Setup timers to count start time + elapsed simulated time struct timeval tvsim; gettimeofday(&offset_tv, NULL); @@ -678,13 +742,13 @@ void mapped_ariel_enable() offset_tp_real.tv_sec = tvsim.tv_sec - tpsim.tv_sec; offset_tp_real.tv_nsec = (tvsim.tv_usec * 1000) - tpsim.tv_nsec; #endif - /* ENABLE */ - enable_output = true; + /* DISABLE */ + enable_output = false; /* UNLOCK */ PIN_ReleaseLock(&mainLock); - fprintf(stderr, "ARIEL: Enabling memory and instruction tracing from program control at simulated Ariel cycle %" PRIu64 ".\n", + fprintf(stderr, "ARIEL: Disabling memory and instruction tracing from program control at simulated Ariel cycle %" PRIu64 ".\n", tunnel->getCycles()); fflush(stdout); fflush(stderr); @@ -787,6 +851,18 @@ void mapped_ariel_fence(void *virtualAddress) WriteFenceInstructionMarker(thr, ip); } +void mapped_api_mpi_init() { + api_mpi_init_used = 1; +} + +int check_for_api_mpi_init() { + if (!api_mpi_init_used && !getenv("ARIEL_DISABLE_MPI_INIT_CHECK")) { + fprintf(stderr, "Error: fesimple.cc: The Ariel API verion of MPI_Init_{thread} was not used, which can result in errors when used in conjunction with OpenMP. 
Please link against the Ariel API (included in this distribution at src/sst/elements/ariel/api) or disable this message by setting the environment variable `ARIEL_DISABLE_MPI_INIT_CHECK`\n"); + exit(1); + } + return 0; +} + int ariel_mlm_memcpy(void* dest, void* source, size_t size) { #ifdef ARIEL_DEBUG fprintf(stderr, "Perform a mlm_memcpy from Ariel from %p to %p length %llu\n", @@ -1664,6 +1740,11 @@ VOID InstrumentRoutine(RTN rtn, VOID* args) enable_output = false; } return; + } else if (RTN_Name(rtn) == "ariel_disable" || RTN_Name(rtn) == "_ariel_disable" || RTN_Name(rtn) == "__arielfort_MOD_ariel_disable") { + fprintf(stderr,"Identified routine: ariel_disable, replacing with Ariel equivalent...\n"); + RTN_Replace(rtn, (AFUNPTR) mapped_ariel_disable); + fprintf(stderr,"Replacement complete.\n"); + return; } else if (RTN_Name(rtn) == "gettimeofday" || RTN_Name(rtn) == "_gettimeofday") { fprintf(stderr,"Identified routine: gettimeofday, replacing with Ariel equivalent...\n"); RTN_Replace(rtn, (AFUNPTR) mapped_gettimeofday); @@ -1674,6 +1755,24 @@ VOID InstrumentRoutine(RTN rtn, VOID* args) RTN_Replace(rtn, (AFUNPTR) mapped_ariel_cycles); fprintf(stderr, "Replacement complete\n"); return; + } else if (RTN_Name(rtn) == "MPI_Init" || RTN_Name(rtn) == "_MPI_Init") { + fprintf(stderr, "Identified routine: MPI_Init. Instrumenting.\n"); + RTN_Open(rtn); + RTN_InsertCall(rtn, IPOINT_AFTER, (AFUNPTR) check_for_api_mpi_init, IARG_END); + RTN_Close(rtn); + fprintf(stderr, "Instrumentation complete\n"); + } else if (RTN_Name(rtn) == "MPI_Init_thread" || RTN_Name(rtn) == "_MPI_Init_thread") { + fprintf(stderr, "Identified routine: MPI_Init_thread. Instrumenting.\n"); + RTN_Open(rtn); + RTN_InsertCall(rtn, IPOINT_AFTER, (AFUNPTR) check_for_api_mpi_init, IARG_END); + RTN_Close(rtn); + fprintf(stderr, "Instrumentation complete\n"); + } else if (RTN_Name(rtn) == "api_mpi_init" || RTN_Name(rtn) == "_api_mpi_init") { + fprintf(stderr, "Replacing api_mpi_init with mapped_api_mpi_init.\n"); + RTN_Replace(rtn, (AFUNPTR) mapped_api_mpi_init); + fprintf(stderr, "Replacement complete\n"); + return; + return; #if ! defined(__APPLE__) } else if (RTN_Name(rtn) == "clock_gettime" || RTN_Name(rtn) == "_clock_gettime" || RTN_Name(rtn) == "__clock_gettime") { @@ -1836,6 +1935,11 @@ VOID InstrumentRoutine(RTN rtn, VOID* args) } } +void fork_disable_child_output(THREADID threadid, const CONTEXT *ctx, VOID *v) { + fprintf(stderr, "Warning: fesimple cannot trace forked processes. 
Disabling Pin for pid %d\n", getpid()); + PIN_Detach(); +} + void loadFastMemLocations() { std::ifstream infile(UseMallocMap.Value().c_str()); @@ -1913,6 +2017,7 @@ int main(int argc, char *argv[]) // Pin version specific tunnel attach tunnelmgr = new SST::Core::Interprocess::MMAPChild_Pin3(SSTNamedPipe.Value()); tunnel = tunnelmgr->getTunnel(); + //printf("fesimple.cc: patrick : got tunnel, %s\n", SSTNamedPipe.Value().c_str()); #ifdef HAVE_CUDA tunnelRmgr = new SST::Core::Interprocess::MMAPChild_Pin3(SSTNamedPipe2.Value()); tunnelDmgr = new SST::Core::Interprocess::MMAPChild_Pin3(SSTNamedPipe3.Value()); @@ -1998,6 +2103,9 @@ int main(int argc, char *argv[]) } } + // Fork callback + PIN_AddForkFunction(FPOINT_AFTER_IN_CHILD, (FORK_CALLBACK) fork_disable_child_output, NULL); + fprintf(stderr, "ARIEL: Starting program.\n"); fflush(stdout); PIN_StartProgram(); diff --git a/src/sst/elements/ariel/frontend/pin3/pin3frontend.cc b/src/sst/elements/ariel/frontend/pin3/pin3frontend.cc index c9803ef0ff..a50ad4d073 100644 --- a/src/sst/elements/ariel/frontend/pin3/pin3frontend.cc +++ b/src/sst/elements/ariel/frontend/pin3/pin3frontend.cc @@ -123,23 +123,89 @@ Pin3Frontend::Pin3Frontend(ComponentId_t id, Params& params, uint32_t cores, uin output->verbose(CALL_INFO, 1, 0, "Base pipe name: %s\n", shmem_region_name3.c_str()); #endif + // MPI Launcher options + mpimode = params.find("mpimode", 0); + if (mpimode) { + mpilauncher = params.find("mpilauncher", ARIEL_STRINGIZE(MPILAUNCHER_EXECUTABLE)); + mpiranks = params.find("mpiranks", 1); + mpitracerank = params.find("mpitracerank", 0); + } + + // MPI Launcher error checking + if (mpimode == 1) { + if (mpilauncher.compare("") == 0) { + output->fatal(CALL_INFO, -1, "mpimode=1 was specified but parameter `mpilauncher` is an empty string"); + } + if (redirect_info.stdin_file.compare("") != 0 || redirect_info.stdout_file.compare("") != 0 || redirect_info.stderr_file.compare("") != 0) { + output->fatal(CALL_INFO, -1, "Using an MPI launcher and redirected I/O is not supported.\n"); + } +#ifdef HAVE_CUDA + output->fatal(CALL_INFO, -1, "Using an MPI launcher and CUDA is not supported.\n"); +#endif + if (mpiranks < 1) { + output->fatal(CALL_INFO, -1, "You must specify a positive number for `mpiranks` when using an MPI launhcer. 
Got %d.\n", mpiranks); + } + if (mpitracerank < 0 || mpitracerank >= mpiranks) { + output->fatal(CALL_INFO, -1, "The value of `mpitracerank` must be in [0,mpiranks) Got %d.\n", mpitracerank); + } + + } + + if (mpimode == 1) { + output->verbose(CALL_INFO, 1, 0, "Ariel-MPI: MPI launcher: %s\n", mpilauncher.c_str()); + output->verbose(CALL_INFO, 1, 0, "Ariel-MPI: MPI ranks: %d\n", mpiranks); + output->verbose(CALL_INFO, 1, 0, "Ariel-MPI: MPI trace rank: %d\n", mpitracerank); + } + + appLauncher = params.find("launcher", PINTOOL_EXECUTABLE); const uint32_t launch_param_count = (uint32_t) params.find("launchparamcount", 0); const uint32_t pin_arg_count = 37 + launch_param_count; - execute_args = (char**) malloc(sizeof(char*) * (pin_arg_count + app_argc)); + uint32_t mpi_args = 0; + if (mpimode == 1) { + // We need one argument for the launcher, one for the number of ranks, + // and one for the rank to trace + mpi_args = 3; + } + + execute_args = (char**) malloc(sizeof(char*) * (mpi_args + pin_arg_count + app_argc)); + uint32_t arg = 0; // Track current arg + + if (mpimode == 1) { + // Prepend mpilauncher to execute_args + output->verbose(CALL_INFO, 1, 0, "Processing mpilauncher arguments...\n"); + std::string mpiranks_str = std::to_string(mpiranks); + std::string mpitracerank_str = std::to_string(mpitracerank); + + size_t mpilauncher_size = sizeof(char) * (mpilauncher.size() + 2); + execute_args[arg] = (char*) malloc(mpilauncher_size); + snprintf(execute_args[arg], mpilauncher_size, "%s", mpilauncher.c_str()); + arg++; + + size_t mpiranks_str_size = sizeof(char) * (mpiranks_str.size() + 2); + execute_args[arg] = (char*) malloc(mpiranks_str_size); + snprintf(execute_args[arg], mpiranks_str_size, "%s", mpiranks_str.c_str()); + arg++; + + size_t mpitracerank_str_size = sizeof(char) * (mpitracerank_str.size() + 2); + execute_args[arg] = (char*) malloc(mpitracerank_str_size); + snprintf(execute_args[arg], mpitracerank_str_size, "%s", mpitracerank_str.c_str()); + arg++; + } const uint32_t profileFunctions = (uint32_t) params.find("profilefunctions", 0); output->verbose(CALL_INFO, 1, 0, "Processing application arguments...\n"); - uint32_t arg = 0; size_t execute_args_size = sizeof(char) * (appLauncher.size() + 2); - execute_args[0] = (char*) malloc(execute_args_size); - snprintf(execute_args[0], execute_args_size, "%s", appLauncher.c_str()); + execute_args[arg] = (char*) malloc(execute_args_size); + snprintf(execute_args[arg], execute_args_size, "%s", appLauncher.c_str()); arg++; + + #if 0 execute_args[arg++] = const_cast("-pause_tool"); execute_args[arg++] = const_cast("15"); @@ -282,7 +348,12 @@ void Pin3Frontend::init(unsigned int phase) // Init the child_pid = 0, this prevents problems in emergencyShutdown() // if forkPINChild() calls fatal (i.e. the child_pid would not be set) child_pid = 0; - child_pid = forkPINChild(appLauncher.c_str(), execute_args, execute_env, redirect_info); + if (mpimode == 1) { + // Ariel will fork the MPI launcher which will itself fork pin + child_pid = forkPINChild(mpilauncher.c_str(), execute_args, execute_env, redirect_info); + } else { + child_pid = forkPINChild(appLauncher.c_str(), execute_args, execute_env, redirect_info); + } output->verbose(CALL_INFO, 1, 0, "Returned from launching PIN. Waiting for child to attach.\n"); tunnel->waitForChild(); @@ -295,7 +366,7 @@ void Pin3Frontend::finish() { // may still be executing. It will become a zombie if we do not // kill it. 
if (child_pid != 0) { - kill(child_pid, SIGKILL); + kill(child_pid, SIGTERM); } } diff --git a/src/sst/elements/ariel/frontend/pin3/pin3frontend.h b/src/sst/elements/ariel/frontend/pin3/pin3frontend.h index 6b3dbbd073..43ccaa203b 100644 --- a/src/sst/elements/ariel/frontend/pin3/pin3frontend.h +++ b/src/sst/elements/ariel/frontend/pin3/pin3frontend.h @@ -64,6 +64,10 @@ class Pin3Frontend : public ArielFrontend { {"appstderrappend", "If appstderr is set, set this to 1 to append the file intead of overwriting", "0"}, {"launchparamcount", "Number of parameters supplied for the launch tool", "0" }, {"launchparam%(launchparamcount)d", "Set the parameter to the launcher", "" }, + {"mpimode", "Whether to use to to launch in order to trace MPI-enabled applications.", "0"}, + {"mpilauncher", "Specify a launcher to be used for MPI executables in conjuction with ", STRINGIZE(MPILAUNCHER_EXECUTABLE)}, + {"mpiranks", "Number of ranks to be launched by . Only will be traced by .", "1" }, + {"mpitracerank", "Rank to be traced by .", "0" }, {"envparamcount", "Number of environment parameters to supply to the Ariel executable, default=-1 (use SST environment)", "-1"}, {"envparamname%(envparamcount)d", "Sets the environment parameter name", ""}, {"envparamval%(envparamcount)d", "Sets the environment parameter value", ""}, @@ -115,6 +119,12 @@ class Pin3Frontend : public ArielFrontend { std::string appLauncher; redirect_info_t redirect_info; + int mpimode; + std::string mpilauncher; + int mpiranks; + int mpitracerank; + bool use_mpilauncher; + char **execute_args; std::map execute_env; diff --git a/src/sst/elements/ariel/frontend/simple/examples/stream/tests/refFiles/test_Ariel_ariel_snb_mlm.out b/src/sst/elements/ariel/frontend/simple/examples/stream/tests/refFiles/test_Ariel_ariel_snb_mlm.out index 127f5a88a6..7ccbcaad5e 100644 --- a/src/sst/elements/ariel/frontend/simple/examples/stream/tests/refFiles/test_Ariel_ariel_snb_mlm.out +++ b/src/sst/elements/ariel/frontend/simple/examples/stream/tests/refFiles/test_Ariel_ariel_snb_mlm.out @@ -33,69 +33,87 @@ Creating L3 cache block: 2 in group: 3 Creating L3 cache block: 3 in group: 3 Creating L3 cache block: 4 in group: 3 Completed configuring the SST Sandy Bridge model -ArielComponent[arielcpu.cc:38:ArielCPU] Creating Ariel component... -ArielComponent[arielcpu.cc:44:ArielCPU] Configuring for 8 cores... -ArielComponent[arielcpu.cc:47:ArielCPU] Configuring for check addresses = no +ArielComponent[arielcpu.cc:37:ArielCPU] Creating Ariel component... +ArielComponent[arielcpu.cc:43:ArielCPU] Configuring for 8 cores... +ArielComponent[arielcpu.cc:46:ArielCPU] Configuring for check addresses = no ArielComponent[arielcpu.cc:120:ArielCPU] Loaded memory manager: A0:memmgr ArielComponent[arielcpu.cc:134:ArielCPU] Memory manager construction is completed. -Pin3Frontend[frontend/pin3/pin3frontend.cc:75:Pin3Frontend] Model specifies that there are 0 application arguments -Pin3Frontend[frontend/pin3/pin3frontend.cc:85:Pin3Frontend] Interception and instrumentation of multi-level memory and malloc/free calls is ENABLED. -Pin3Frontend[frontend/pin3/pin3frontend.cc:90:Pin3Frontend] Tracking the stack and dumping on malloc calls is DISABLED. -Pin3Frontend[frontend/pin3/pin3frontend.cc:97:Pin3Frontend] Malloc map file is ENABLED, using file 'malloc.txt' -Pin3Frontend[frontend/pin3/pin3frontend.cc:104:Pin3Frontend] Base pipe name: /tmp/sst_shmem_37338-0-1804289383 -Pin3Frontend[frontend/pin3/pin3frontend.cc:128:Pin3Frontend] Processing application arguments... 
-Pin3Frontend[frontend/pin3/pin3frontend.cc:258:Pin3Frontend] Completed processing application arguments. -Pin3Frontend[frontend/pin3/pin3frontend.cc:263:Pin3Frontend] Completed initialization of the Ariel CPU. +Pin3Frontend[frontend/pin3/pin3frontend.cc:82:Pin3Frontend] Model specifies that there are 0 application arguments +Pin3Frontend[frontend/pin3/pin3frontend.cc:92:Pin3Frontend] Interception and instrumentation of multi-level memory and malloc/free calls is ENABLED. +Pin3Frontend[frontend/pin3/pin3frontend.cc:97:Pin3Frontend] Tracking the stack and dumping on malloc calls is DISABLED. +Pin3Frontend[frontend/pin3/pin3frontend.cc:104:Pin3Frontend] Malloc map file is ENABLED, using file 'malloc.txt' +Pin3Frontend[frontend/pin3/pin3frontend.cc:111:Pin3Frontend] Base pipe name: /tmp/sst_shmem_4046714-0-1804289383 +Pin3Frontend[frontend/pin3/pin3frontend.cc:200:Pin3Frontend] Processing application arguments... +Pin3Frontend[frontend/pin3/pin3frontend.cc:335:Pin3Frontend] Completed processing application arguments. +Pin3Frontend[frontend/pin3/pin3frontend.cc:340:Pin3Frontend] Completed initialization of the Ariel CPU. ArielComponent[arielcpu.cc:170:ArielCPU] Registering ArielCPU clock at 2660MHz ArielComponent[arielcpu.cc:174:ArielCPU] Clocks registered. ArielComponent[arielcpu.cc:176:ArielCPU] Creating core to cache links... ArielComponent[arielcpu.cc:178:ArielCPU] Creating processor cores and cache links... ArielComponent[arielcpu.cc:180:ArielCPU] Configuring cores and cache links... -ArielComponent[arielcpu.cc:239:ArielCPU] Completed initialization of the Ariel CPU. -SSTARIEL: Loading Ariel Tool to connect to SST on pipe: /tmp/sst_shmem_37338-0-1804289383 max core count: 8 +ArielComponent[arielcpu.cc:235:ArielCPU] Completed initialization of the Ariel RTL Link. +ArielComponent[arielcpu.cc:235:ArielCPU] Completed initialization of the Ariel RTL Link. +ArielComponent[arielcpu.cc:235:ArielCPU] Completed initialization of the Ariel RTL Link. +ArielComponent[arielcpu.cc:235:ArielCPU] Completed initialization of the Ariel RTL Link. +ArielComponent[arielcpu.cc:235:ArielCPU] Completed initialization of the Ariel RTL Link. +ArielComponent[arielcpu.cc:235:ArielCPU] Completed initialization of the Ariel RTL Link. +ArielComponent[arielcpu.cc:235:ArielCPU] Completed initialization of the Ariel RTL Link. +ArielComponent[arielcpu.cc:235:ArielCPU] Completed initialization of the Ariel RTL Link. +ArielComponent[arielcpu.cc:247:ArielCPU] Completed initialization of the Ariel CPU. +SSTARIEL: Loading Ariel Tool to connect to SST on pipe: /tmp/sst_shmem_4046714-0-1804289383 max core count: 8 SSTARIEL: Function profiling is disabled. +Allocating arrays of size 2000 elements. +Done allocating arrays. +Perfoming the fast_c compute loop... +Sum of arrays is: 6999500.000000 +Freeing arrays... +Done. +SSTARIEL: Execution completed, shutting down. +Pin3Frontend[frontend/pin3/pin3frontend.cc:347:init] Launching PIN... +Pin3Frontend[frontend/pin3/pin3frontend.cc:415:forkPINChild] Executing PIN command: /home/prlavin/DELETEME-sst-tests/pin-3.28/pin -follow_execv -ifeellucky -t /home/prlavin/DELETEME-sst-tests/install/libexec/fesimple.so -w 0 -E 1 -p /tmp/sst_shmem_4046714-0-1804289383 -v 1 -t 0 -c 8 -s 1 -m 1 -k 0 -u malloc.txt -d 0 -- /home/prlavin/DELETEME-sst-tests/sst-elements/src/sst/elements/ariel/frontend/simple/examples/stream/stream_mlm +Pin3Frontend[frontend/pin3/pin3frontend.cc:357:init] Returned from launching PIN. Waiting for child to attach. 
+Pin3Frontend[frontend/pin3/pin3frontend.cc:360:init] Child has attached! +CORE ID: 0 PROCESSED AN EXIT EVENT +ArielComponent[arielcpu.cc:265:finish] Ariel Processor Information: +ArielComponent[arielcpu.cc:266:finish] Completed at: 3191303 nanoseconds. +ArielComponent[arielcpu.cc:267:finish] Ariel Component Statistics (By Core) + +Ariel Memory Management Statistics: +--------------------------------------------------------------------- +Page Table Sizes: +- Demand map entries at level 0 328 +- Demand map entries at level 1 0 +Page Table Coverages: +- Demand bytes at level 0 1343488 +- Demand bytes at level 1 0 +Simulation is complete, simulated time: 3.1913 ms ARIEL-SST: Did not find ARIEL_OVERRIDE_POOL in the environment, no override applies. ARIEL-SST PIN tool activating with 8 threads ARIEL: Default memory pool set to 0 ARIEL: Tool is configured to begin with profiling immediately. ARIEL: Starting program. -Identified routine: malloc/_malloc, replacing with Ariel equivalent... -Identified routine: malloc/_malloc, replacing with Ariel equivalent... -Identified routine: free/_free, replacing with Ariel equivalent... Identified routine: clock_gettime, replacing with Ariel equivalent... Replacement complete. -Pin3Frontend[frontend/pin3/pin3frontend.cc:270:init] Launching PIN... -Pin3Frontend[frontend/pin3/pin3frontend.cc:326:forkPINChild] Executing PIN command: /usr/local/module-pkgs/pin/pin-3.22-98547-g7a303a835-gcc-linux/pin -follow_execv -ifeellucky -t /ascldap/users/grvosku/dev/build/sst-elements/libexec/fesimple.so -w 0 -E 1 -p /tmp/sst_shmem_37338-0-1804289383 -v 1 -t 0 -c 8 -s 1 -m 1 -k 0 -u malloc.txt -d 0 -- ./stream_mlm -Pin3Frontend[frontend/pin3/pin3frontend.cc:275:init] Returned from launching PIN. Waiting for child to attach. -Pin3Frontend[frontend/pin3/pin3frontend.cc:278:init] Child has attached! Identified routine: ariel_enable, replacing with Ariel equivalent... Replacement complete. +Identified routine: ariel_disable, replacing with Ariel equivalent... +Replacement complete. +Identified routine: ariel_fence, replacing with Ariel equivalent.. +Replacement complete Identified routine: ariel_cycles, replacing with Ariel equivalent.. Replacement complete Identified routine: ariel_output_stats, replacing with Ariel equivalent.. Replacement complete Identified routine: ariel_malloc_flag, replacing with Ariel equivalent.. +Replacing api_mpi_init with mapped_api_mpi_init. +Replacement complete +Identified routine: MPI_Init. Instrumenting. +Instrumentation complete +Identified routine: MPI_Init_thread. Instrumenting. +Instrumentation complete Identified routine: malloc/_malloc, replacing with Ariel equivalent... +Identified routine: free/_free, replacing with Ariel equivalent... Identified routine: clock_gettime, replacing with Ariel equivalent... Replacement complete. -Allocating arrays of size 2000 elements. -Done allocating arrays. -Perfoming the fast_c compute loop... -Sum of arrays is: 6999500.000000 -Freeing arrays... -Done. -SSTARIEL: Execution completed, shutting down. -CORE ID: 0 PROCESSED AN EXIT EVENT -ArielComponent[arielcpu.cc:257:finish] Ariel Processor Information: -ArielComponent[arielcpu.cc:258:finish] Completed at: 1651433 nanoseconds. 
-ArielComponent[arielcpu.cc:259:finish] Ariel Component Statistics (By Core) - -Ariel Memory Management Statistics: ---------------------------------------------------------------------- -Page Table Sizes: -- Demand map entries at level 0 251 -- Demand map entries at level 1 0 -Page Table Coverages: -- Demand bytes at level 0 1028096 -- Demand bytes at level 1 0 -Simulation is complete, simulated time: 1.65143 ms +Identified routine: MPI_Init_thread. Instrumenting. +Instrumentation complete diff --git a/src/sst/elements/ariel/mpi/.gitignore b/src/sst/elements/ariel/mpi/.gitignore new file mode 100644 index 0000000000..0cd642e8d6 --- /dev/null +++ b/src/sst/elements/ariel/mpi/.gitignore @@ -0,0 +1,6 @@ +*.o +*.so +reduce +hello +mpilauncher +fakepin diff --git a/src/sst/elements/ariel/mpi/Makefile b/src/sst/elements/ariel/mpi/Makefile new file mode 100644 index 0000000000..fa1ab49bcb --- /dev/null +++ b/src/sst/elements/ariel/mpi/Makefile @@ -0,0 +1,28 @@ +CC=gcc +CXX=g++ +MPICXX=mpic++ + +all: mpilauncher hello fakepin reduce libarielapi.so + +clean: + rm -rf mpilauncher mpilauncher hello fakepin reduce *.o *.so + +mpilauncher: mpilauncher.cc + g++ -g -o mpilauncher mpilauncher.cc + +hello: hello.cc + mpic++ -fopenmp -g -o hello hello.cc +reduce: reduce.cc libarielapi.so + mpic++ -fopenmp -g -o reduce reduce.cc -L. -larielapi + +fakepin: fakepin.cc + g++ -g -o fakepin fakepin.cc + +libarielapi.so: libarielapi.o + gcc -shared -o libarielapi.so libarielapi.o + +libarielapi.o: libarielapi.c + gcc -o libarielapi.o -fPIC -I./ -c libarielapi.c + +.PHONY: all clean + diff --git a/src/sst/elements/ariel/mpi/ariel-reduce.py b/src/sst/elements/ariel/mpi/ariel-reduce.py new file mode 100644 index 0000000000..41a4796e60 --- /dev/null +++ b/src/sst/elements/ariel/mpi/ariel-reduce.py @@ -0,0 +1,117 @@ +import sst +import sys +import os + +# Detect if we will use MPI mode or not +mpi_mode = True +ncores= 1 +mpiranks = 1 +tracerank = 0 +size = 1024 +#size = 2048000 + +if (len(sys.argv) > 1): + mpiranks = int(sys.argv[1]) +if (len(sys.argv) > 2): + ncores = int(sys.argv[2]) +if (len(sys.argv) > 3): + size = int(sys.argv[3]) + +print(f'Running with {mpiranks} ranks and {ncores} threads per rank. Tracing rank {tracerank}. Size {size}') + +os.environ['OMP_NUM_THREADS'] = str(ncores) + +######################################################################### +## Define SST core options +######################################################################### +# If this demo gets to 100ms, something has gone very wrong! 
+sst.setProgramOption("stop-at", "200ms") + +######################################################################### +## Declare components +######################################################################### +core = sst.Component("core", "ariel.ariel") +cache = [sst.Component("cache_"+str(i), "memHierarchy.Cache") for i in range(ncores)] +memctrl = sst.Component("memory", "memHierarchy.MemController") +bus = sst.Component("bus", "memHierarchy.Bus") + +######################################################################### +## Set component parameters and fill subcomponent slots +######################################################################### +# Core: 2.4GHz, 2 accesses/cycle, STREAM (triad) pattern generator with 1000 elements per array +core.addParams({ + "clock" : "2.4GHz", + "verbose" : 1, + #"executable" : "./hello-nompi" + "executable" : "./reduce", + #"executable" : "/home/prlavin/projects/reference-paper-2024/apps/install/bin/amg", + "arielmode" : 0, + "corecount" : ncores, + "appargcount" : 1, + #"apparg0" : 500000000, + #"apparg0" : 250000000, + "apparg0" : size, +}) + +if mpi_mode: + core.addParams({ + "mpilauncher": "./mpilauncher", + "mpiranks": mpiranks, + "mpitracerank" : tracerank, + }) + +# Cache: L1, 2.4GHz, 2KB, 4-way set associative, 64B lines, LRU replacement, MESI coherence +for i in range(ncores): + cache[i].addParams({ + "L1" : 1, + "cache_frequency" : "2.4GHz", + "access_latency_cycles" : 2, + "cache_size" : "2KiB", + "associativity" : 4, + "replacement_policy" : "lru", + "coherence_policy" : "MESI", + "cache_line_size" : 64, + }) + +# Memory: 50ns access, 1GB +memctrl.addParams({ + "clock" : "1GHz", + "backing" : "none", # We're not using real memory values, just addresses + "addr_range_end" : 1024*1024*1024-1, +}) +memory = memctrl.setSubComponent("backend", "memHierarchy.simpleMem") +memory.addParams({ + "mem_size" : "1GiB", + "access_time" : "50ns", +}) + +bus.addParams({ + "bus_frequency": "2.0GHz", +}) + +######################################################################### +## Declare links +######################################################################### +core_cache = [sst.Link("core_to_cache_"+str(i)) for i in range(ncores)] +cache_bus = [sst.Link("cache_" + str(i) + "_to_bus") for i in range(ncores)] +bus_mem = sst.Link("bus_to_memory") + + +######################################################################### +## Connect components with the links +######################################################################### +[core_cache[i].connect( (core, "cache_link_"+str(i), "100ps"), (cache[i], "high_network_0", "100ps") ) for i in range(ncores)] +[cache_bus[i].connect( (cache[i], "low_network_0", "100ps"), (bus, "high_network_"+str(i), "100ps") ) for i in range(ncores)] +bus_mem.connect( (bus, "low_network_0", "100ps"), (memctrl, "direct_link", "100ps") ) + +sst.setStatisticOutput("sst.statoutputtxt") + +# Send the statistics to a fiel called 'stats.csv' +sst.setStatisticOutputOptions( { "filepath" : "stats.csv" }) + +# Print statistics of level 5 and below (0-5) +sst.setStatisticLoadLevel(5) + +# Enable statistics for all the component +sst.enableAllStatisticsForAllComponents() +################################ The End ################################ diff --git a/src/sst/elements/ariel/mpi/arielapi.h b/src/sst/elements/ariel/mpi/arielapi.h new file mode 100644 index 0000000000..4ef5ae367e --- /dev/null +++ b/src/sst/elements/ariel/mpi/arielapi.h @@ -0,0 +1,33 @@ +// Copyright 2009-2015 Sandia Corporation. 
Under the terms +// of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. +// Government retains certain rights in this software. +// +// Copyright (c) 2009-2015, Sandia Corporation +// All rights reserved. +// +// This file is part of the SST software package. For license +// information, see the LICENSE file in the top level directory of the +// distribution. + + +#ifndef _H_ARIEL_API +#define _H_ARIEL_API + +#include + +#if defined(c_plusplus) || defined(__cplusplus) +extern "C" { +#endif + +void ariel_enable(); +void ariel_disable(); +uint64_t ariel_cycles(); +void ariel_output_stats(); +void ariel_malloc_flag(int64_t id, int count, int level); + +#if defined(c_plusplus) || defined(__cplusplus) +} +#endif + +#endif + diff --git a/src/sst/elements/ariel/mpi/demo1.py b/src/sst/elements/ariel/mpi/demo1.py new file mode 100644 index 0000000000..406349363c --- /dev/null +++ b/src/sst/elements/ariel/mpi/demo1.py @@ -0,0 +1,101 @@ +import sst +import sys +import os + +# Detect if we will use MPI mode or not +mpi_mode = True +ncores= 2 +mpiranks = 4 + +os.environ['OMP_NUM_THREADS'] = str(ncores) + +######################################################################### +## Define SST core options +######################################################################### +# If this demo gets to 100ms, something has gone very wrong! +sst.setProgramOption("stop-at", "1000ms") + +######################################################################### +## Declare components +######################################################################### +core = sst.Component("core", "ariel.ariel") +cache = [sst.Component("cache_"+str(i), "memHierarchy.Cache") for i in range(ncores)] +memctrl = sst.Component("memory", "memHierarchy.MemController") +bus = sst.Component("bus", "memHierarchy.Bus") + +######################################################################### +## Set component parameters and fill subcomponent slots +######################################################################### +# Core: 2.4GHz, 2 accesses/cycle, STREAM (triad) pattern generator with 1000 elements per array +core.addParams({ + "clock" : "2.4GHz", + "verbose" : 1, + #"executable" : "./hello-nompi" + "executable" : "./hello", + #"executable" : "/home/prlavin/projects/reference-paper-2024/apps/install/bin/amg", + "arielmode" : 1, + "corecount" : ncores, +}) + +if mpi_mode: + core.addParams({ + "mpilauncher": "./mpilauncher", + "mpiranks": mpiranks, + "mpitracerank" : 0, + }) + +# Cache: L1, 2.4GHz, 2KB, 4-way set associative, 64B lines, LRU replacement, MESI coherence +for i in range(ncores): + cache[i].addParams({ + "L1" : 1, + "cache_frequency" : "2.4GHz", + "access_latency_cycles" : 2, + "cache_size" : "2KiB", + "associativity" : 4, + "replacement_policy" : "lru", + "coherence_policy" : "MESI", + "cache_line_size" : 64, + }) + +# Memory: 50ns access, 1GB +memctrl.addParams({ + "clock" : "1GHz", + "backing" : "none", # We're not using real memory values, just addresses + "addr_range_end" : 1024*1024*1024-1, +}) +memory = memctrl.setSubComponent("backend", "memHierarchy.simpleMem") +memory.addParams({ + "mem_size" : "1GiB", + "access_time" : "50ns", +}) + +bus.addParams({ + "bus_frequency": "2.0GHz", +}) + +######################################################################### +## Declare links +######################################################################### +core_cache = [sst.Link("core_to_cache_"+str(i)) for i in range(ncores)] +cache_bus = [sst.Link("cache_" + str(i) + "_to_bus") for i in 
range(ncores)] +bus_mem = sst.Link("bus_to_memory") + + +######################################################################### +## Connect components with the links +######################################################################### +[core_cache[i].connect( (core, "cache_link_"+str(i), "100ps"), (cache[i], "high_network_0", "100ps") ) for i in range(ncores)] +[cache_bus[i].connect( (cache[i], "low_network_0", "100ps"), (bus, "high_network_"+str(i), "100ps") ) for i in range(ncores)] +bus_mem.connect( (bus, "low_network_0", "100ps"), (memctrl, "direct_link", "100ps") ) + +sst.setStatisticOutput("sst.statoutputtxt") + +# Send the statistics to a fiel called 'stats.csv' +sst.setStatisticOutputOptions( { "filepath" : "stats.csv" }) + +# Print statistics of level 5 and below (0-5) +sst.setStatisticLoadLevel(5) + +# Enable statistics for all the component +sst.enableAllStatisticsForAllComponents() +################################ The End ################################ diff --git a/src/sst/elements/ariel/mpi/fakepin.cc b/src/sst/elements/ariel/mpi/fakepin.cc new file mode 100644 index 0000000000..1d49080eab --- /dev/null +++ b/src/sst/elements/ariel/mpi/fakepin.cc @@ -0,0 +1,47 @@ +#include +#include +#include +#include + +int main(int argc, char *argv[], char *envp[]) +{ + if (argc < 3) { + printf("Usage: ./fakepin -- [program args...]\n"); + exit(1); + } + + int prog_idx = 1; + + while (strcmp("--", argv[prog_idx])) { + prog_idx++; + } + prog_idx++; + + printf("prog_name: %s\n", argv[prog_idx]); + + // Make a copy of envp so we can add FAKEPIN=1 + char **envp_copy; + + int envp_len = 0; + while(envp[envp_len]!=NULL) { + envp_len++; + } + + envp_copy = (char**) malloc(sizeof(char*) * (envp_len + 1)); + for (int i = 0; i < envp_len - 1; i++) { + envp_copy[i] = (char*) malloc(sizeof(char) * (strlen(envp[i])+1)); + strcpy(envp_copy[i], envp[i]); + } + envp_copy[envp_len-1] = (char*) malloc(sizeof(char) * 10); + strcpy(envp_copy[envp_len-1], "FAKEPIN=1"); + envp_copy[envp_len] = NULL; + + + // Launch the program + char* _argv[] = {NULL}; + printf("Fakepin launching [%s]\n", argv[prog_idx]); + + if (execve(argv[prog_idx], &argv[prog_idx], envp_copy) == -1) { + perror("Could not execve"); + } +} diff --git a/src/sst/elements/ariel/mpi/hello.cc b/src/sst/elements/ariel/mpi/hello.cc new file mode 100644 index 0000000000..fc433f4e5e --- /dev/null +++ b/src/sst/elements/ariel/mpi/hello.cc @@ -0,0 +1,75 @@ +#include +#include +#include +#include + +int ariel_enable() { + printf("App: ariel_enable called\n"); + return 0; +} + +int main(int argc, char* argv[]) { + MPI_Init(&argc, &argv); + ariel_enable(); + int rank = 0; + int nranks = 0; + int ret = MPI_Comm_rank(MPI_COMM_WORLD, &rank); + if (ret != MPI_SUCCESS) { + printf("Error: MPI_Comm_rank retuned error: %d\n", ret); + exit(1); + } + ret = MPI_Comm_size(MPI_COMM_WORLD, &nranks); + if (ret != MPI_SUCCESS) { + printf("Error: MPI_Comm_rank retuned error: %d\n", ret); + exit(1); + } + +#pragma omp parallel + { + int thread = omp_get_thread_num(); + +#pragma omp critical + { + if (!std::getenv("FAKEPIN")) { + printf("Hello from rank %d/%d, thread %d!", rank, nranks, thread); + } else { + printf("Hello from rank %d/%d, thread %d! 
(Launched by fakepin)", rank, nranks, thread); + } + + for (int i = 1; i < argc; i++) { + printf(" -- %s", argv[i]); + } + printf("\n"); + } + } + + int compute = 0; + if (argc > 1) { + compute = atoi(argv[1]); + } + + if (compute) { + if (rank == 0) { + int *vec_a = (int*) malloc(sizeof(int) * compute); + for (int i = 0; i < compute; i++) { + vec_a[i] = 2; + } + for (int i = 0; i < compute; i++) { + for (int j = 0; j < i; j++) { + for (int k = 0; k < j; k++) { + vec_a[i] += vec_a[j] + vec_a[k] + (i % 7 == 0 ? 3 : 5); + } + } + } + printf("Rank 0: vec_a[%d] is %d\n", compute-1, vec_a[compute-1]); + } + } + + + MPI_Barrier(MPI_COMM_WORLD); + if (rank == 0) { + printf("Rank 0: Barrier complete.\n"); + } + + MPI_Finalize(); +} diff --git a/src/sst/elements/ariel/mpi/libarielapi.c b/src/sst/elements/ariel/mpi/libarielapi.c new file mode 100644 index 0000000000..4079d19eb8 --- /dev/null +++ b/src/sst/elements/ariel/mpi/libarielapi.c @@ -0,0 +1,35 @@ +// Copyright 2009-2015 Sandia Corporation. Under the terms +// of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. +// Government retains certain rights in this software. +// +// Copyright (c) 2009-2015, Sandia Corporation +// All rights reserved. +// +// This file is part of the SST software package. For license +// information, see the LICENSE file in the top level directory of the +// distribution. + + +#include +#include +#include + +void ariel_enable() { + printf("ARIEL-CLIENT: Library enabled.\n"); +} + +void ariel_disable() { + printf("ARIEL-CLIENT: Library disabled.\n"); +} + +uint64_t ariel_cycles() { + return 0; +} + +void ariel_output_stats() { + printf("ARIEL-CLIENT: Printing statistics.\n"); +} + +void ariel_malloc_flag(int64_t id, int count, int level) { + printf("ARIEL-CLIENT: flagging next %d mallocs at id %" PRId64 "\n", count, id); +} diff --git a/src/sst/elements/ariel/mpi/mpilauncher.cc b/src/sst/elements/ariel/mpi/mpilauncher.cc new file mode 100644 index 0000000000..85dc36959e --- /dev/null +++ b/src/sst/elements/ariel/mpi/mpilauncher.cc @@ -0,0 +1,197 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * SLURM-specific MPI launcher for Ariel simulations + * Ariel forks this process which initiates mpirun + * + * If one rank is found in the MPI allocation, all ranks + * will run there. If multiple ranks are found, a single rank + * will run on the node with SST, and the remaining + * ranks will be distributed on the other nodes. + */ + +int pid = 0; // global so we can use it in the signal handler + +// Catch SIGTERM so we can try and shut down the child process +void signalHandler(int signum) { + std::cout << "Caught signal " << signum << ", exiting gracefully." << std::endl; + if (pid != 0) { + kill(pid, signum); + } + exit(0); +} + +int main(int argc, char *argv[]) { + + if (argc < 4 || std::string(argv[1]).compare("-H") == 0) { + std::cout << "Usage: " << argv[0] << " [pin args] -- [program args]\n"; + exit(1); + } + + signal(SIGTERM, signalHandler); + + std::array buffer; + + // Get node that SST is running on. + // All processes will run on the same node. 
+ gethostname(buffer.data(), 128); + /* + std::string pin_host = buffer.data(); + size_t pos = pin_host.find('.'); + std::string pin_host = pin_host.substr(0,pos); + */ + std::string host = buffer.data(); + + int procs = atoi(argv[1]); + int tracerank = atoi(argv[2]); + + if (procs < 1) { + printf("Error: %s: must be positive\n", argv[0]); + exit(1); + } + + if (tracerank < 0 || tracerank >= procs) { + printf("Error: %s: must be in [0,nprocs)\n", argv[0]); + exit(1); + } + + // In order to trace the appropriate rank, determine how many + // should launch before the traced rank, and how many should launch after + int ranks_before = tracerank; + int ranks_after = procs - tracerank - 1; + if (ranks_after < 0) { + ranks_after = 0; + } + + // `pinstring` will contain the command to launch pin and all of its arguments + // `binary` will contain the traced program and all of its arguments + std::string pinstring = ""; + std::string binary = ""; + bool getbinary = false; + std::string arg; + for (int i = 3; i < argc; i++) { + arg = argv[i]; + + // Pin string + pinstring += arg; + pinstring += " "; + + // Binary string + if (getbinary) { + binary += arg; + binary += " "; + } + + if (arg == "--") + getbinary = true; + } + + // Build the mpirun command + std::string mpicmd = "mpirun --oversubscribe"; + + if (ranks_before > 0) { + mpicmd += " -H "; + mpicmd += host; + mpicmd += " -np "; + mpicmd += std::to_string(ranks_before); + mpicmd += " "; + mpicmd += binary; + mpicmd += " : "; + } + + mpicmd += " -H "; + mpicmd += host; + mpicmd += " -np "; + mpicmd += std::to_string(1); + mpicmd += " "; + mpicmd += pinstring; + + if (ranks_after > 0) { + mpicmd += " : -H "; + mpicmd += host; + mpicmd += " -np "; + mpicmd += std::to_string(ranks_after); + mpicmd += " "; + mpicmd += binary; + } + + int use_system = 0; + if (use_system) { + printf("Wrapper starting...\n"); + printf("Arg to system: %s\n", mpicmd.c_str()); + system(mpicmd.c_str()); + printf("Wrapper complete...\n"); + } else { + // Use execve to make sure that the child processes exits when killed by SST + // I am lazily assuming that there are no spaces in any of the arguments. 
+ + // Get a mutable copy + char * cmd_copy = new char[mpicmd.length() + 1]; + std::strcpy(cmd_copy, mpicmd.c_str()); + + // Calculate an upper bound for the number of arguments + const int MAX_ARGS = std::strlen(cmd_copy) / 2 + 2; + + // Allocate memory for the pointers + char** argv = new char*[MAX_ARGS]; + for (int i = 0;i < MAX_ARGS; i++) { + argv[i] = NULL; + } + + // Temporary variable to hold each word + char* word; + + // Counter for the number of words + int argc = 0; + + // Use strtok to split the string by spaces + word = std::strtok(cmd_copy, " "); + while (word != nullptr) { + // Allocate memory for the word and copy it + argv[argc] = new char[std::strlen(word) + 1]; + std::strcpy(argv[argc], word); + + // Move to the next word + word = std::strtok(nullptr, " "); + argc++; + } + + assert(argv[argc] == NULL); + + printf("MPI Command: %s\n", mpicmd.c_str()); + + // Forking child process so we can use the parent to kill it if we need to + pid = fork(); + if (pid == -1) { + printf("mpilauncher.cc: fork error: %d, %s\n", errno, strerror(errno)); + exit(-1); + } else if (pid > 1) { // Parent + int status; + waitpid(pid, &status, 0); + if (!WIFEXITED(status)) { + printf("Warning: mpilauncher.cc: Forked process did not exit normally.\n"); + } if (WEXITSTATUS(status) != 0) { + printf("Warning: mpilauncher.cc: Forked process has non-zero exit code: %d\n", WEXITSTATUS(status)); + } + exit(0); + } else { // Child + int ret = execvp(argv[0], argv); + printf("Error: mpilauncher.cc: This should be unreachable. execvp error: %d, %s\n", errno, strerror(errno)); + exit(1); + } + + } +} diff --git a/src/sst/elements/ariel/mpi/reduce.cc b/src/sst/elements/ariel/mpi/reduce.cc new file mode 100644 index 0000000000..81ea10f484 --- /dev/null +++ b/src/sst/elements/ariel/mpi/reduce.cc @@ -0,0 +1,103 @@ +#include +#include +#include +#include +#include +#include +#include "arielapi.h" + +#define DEBUG 0 + +int main(int argc, char* argv[]) { + + MPI_Init(&argc, &argv); + + if (argc < 2) { + printf("Too few args\n"); + exit(1); + } + + int len = atoi(argv[1]); + + if (len < 1) { + printf("Please specify positive len\n"); + exit(1); + } + + + int rank = 0; + int nranks = 0; + + int ret = MPI_Comm_rank(MPI_COMM_WORLD, &rank); + if (ret != MPI_SUCCESS) { + printf("Error: MPI_Comm_rank retuned error: %d\n", ret); + exit(1); + } + ret = MPI_Comm_size(MPI_COMM_WORLD, &nranks); + if (ret != MPI_SUCCESS) { + printf("Error: MPI_Comm_rank retuned error: %d\n", ret); + exit(1); + } + + if (len % nranks != 0) { + printf("MPI ranks must divide vector length (len = %d, nranks = %d)\n", len, nranks); + exit(1); + } + + len = len / nranks; + + int nthreads = omp_get_max_threads(); + +#if DEBUG + printf("Running on %d ranks, %d threads per rank\n", nranks, nthreads); +#endif + + + // Initialize + int *vec = (int*) malloc(sizeof(int) * len); + for (int i = 0; i < len; i++) { + vec[i] = rank*len + i; + } + + ariel_enable(); + auto begin = std::chrono::high_resolution_clock::now(); + long int sum = 0; + #pragma omp parallel for reduction(+:sum) + for (int i = 0; i < len; i++) { + sum += vec[i]; + } + auto end = std::chrono::high_resolution_clock::now(); + auto duration = std::chrono::duration_cast(end-begin).count(); + if (rank == 0) { + std::cout << nranks << " " << nthreads << " " << duration/1000000 << "\n"; + } + ariel_disable(); + +#if DEBUG + printf("Rank %d: sum is %ld\n", rank, sum); +#endif + + long int tot = 0; + MPI_Allreduce( + &sum, + &tot, + 1, + MPI_LONG, + MPI_SUM, + MPI_COMM_WORLD); + + +#if DEBUG + 
printf("Rank %d: tot is %ld\n", rank, tot); +#endif + + + MPI_Barrier(MPI_COMM_WORLD); + if (rank == 0) { +#if DEBUG + printf("Rank 0: Barrier complete.\n"); +#endif + } + + MPI_Finalize(); +} diff --git a/src/sst/elements/ariel/tests/testMPI/.gitignore b/src/sst/elements/ariel/tests/testMPI/.gitignore new file mode 100644 index 0000000000..7451a50799 --- /dev/null +++ b/src/sst/elements/ariel/tests/testMPI/.gitignore @@ -0,0 +1,5 @@ +fakepin +hello +reduce +stats.csv +tmp.out* diff --git a/src/sst/elements/ariel/tests/testMPI/Makefile b/src/sst/elements/ariel/tests/testMPI/Makefile new file mode 100644 index 0000000000..c9c967083f --- /dev/null +++ b/src/sst/elements/ariel/tests/testMPI/Makefile @@ -0,0 +1,18 @@ +CC := mpic++ +API_DIR := ../../api +CFLAGS := -fopenmp -I$(API_DIR) +LDFLAGS := -L$(API_DIR) -larielapi +TARGETS := hello fakepin reduce + +all: $(TARGETS) + +%: %.cc libarielapi + $(CC) $(CFLAGS) $< -o $@ $(LDFLAGS) + +libarielapi: + CC=mpicc $(MAKE) -C $(API_DIR) + +clean: + rm -rf $(TARGETS) + +.PHONY: clean libarielapi diff --git a/src/sst/elements/ariel/tests/testMPI/fakepin.cc b/src/sst/elements/ariel/tests/testMPI/fakepin.cc new file mode 100644 index 0000000000..1d49080eab --- /dev/null +++ b/src/sst/elements/ariel/tests/testMPI/fakepin.cc @@ -0,0 +1,47 @@ +#include +#include +#include +#include + +int main(int argc, char *argv[], char *envp[]) +{ + if (argc < 3) { + printf("Usage: ./fakepin -- [program args...]\n"); + exit(1); + } + + int prog_idx = 1; + + while (strcmp("--", argv[prog_idx])) { + prog_idx++; + } + prog_idx++; + + printf("prog_name: %s\n", argv[prog_idx]); + + // Make a copy of envp so we can add FAKEPIN=1 + char **envp_copy; + + int envp_len = 0; + while(envp[envp_len]!=NULL) { + envp_len++; + } + + envp_copy = (char**) malloc(sizeof(char*) * (envp_len + 1)); + for (int i = 0; i < envp_len - 1; i++) { + envp_copy[i] = (char*) malloc(sizeof(char) * (strlen(envp[i])+1)); + strcpy(envp_copy[i], envp[i]); + } + envp_copy[envp_len-1] = (char*) malloc(sizeof(char) * 10); + strcpy(envp_copy[envp_len-1], "FAKEPIN=1"); + envp_copy[envp_len] = NULL; + + + // Launch the program + char* _argv[] = {NULL}; + printf("Fakepin launching [%s]\n", argv[prog_idx]); + + if (execve(argv[prog_idx], &argv[prog_idx], envp_copy) == -1) { + perror("Could not execve"); + } +} diff --git a/src/sst/elements/ariel/tests/testMPI/hello.cc b/src/sst/elements/ariel/tests/testMPI/hello.cc new file mode 100644 index 0000000000..a1ecbce3ea --- /dev/null +++ b/src/sst/elements/ariel/tests/testMPI/hello.cc @@ -0,0 +1,76 @@ +#include +#include +#include +#include +#include +#include + +// Useage: ./hello [output-file] +// If running with multiple ranks, each will output to its own file +int main(int argc, char* argv[]) { + + MPI_Init(&argc, &argv); + + int rank = 0; + int nranks = 0; + int ret = MPI_Comm_rank(MPI_COMM_WORLD, &rank); + if (ret != MPI_SUCCESS) { + printf("Error: MPI_Comm_rank retuned error: %d\n", ret); + exit(1); + } + ret = MPI_Comm_size(MPI_COMM_WORLD, &nranks); + if (ret != MPI_SUCCESS) { + printf("Error: MPI_Comm_rank retuned error: %d\n", ret); + exit(1); + } + + // Redirect output to a file if an argument was given. Append the rank to each filename. 
+    FILE *output = stdout;
+    if (argc > 1) {
+        int len = strlen(argv[1]) + 12;  // Space for underscore, plus up to a 10 digit integer, plus the null character
+        char *outfile = (char*)malloc(len);
+        if (!outfile) {
+            printf("Error allocating space for filename\n");
+        }
+        snprintf(outfile, len, "%s_%d", argv[1], rank);
+        output = fopen(outfile, "w");
+        if (!output) {
+            printf("Error opening %s\n", outfile);
+            exit(1);
+        }
+        free(outfile);
+    }
+
+
+    ariel_enable();
+#pragma omp parallel
+    {
+        int thread = omp_get_thread_num();
+#pragma omp critical
+        {
+            // ./fakepin sets the FAKEPIN environment variable. This is useful for debugging but
+            // not needed for our Ariel MPI testsuite.
+            // We only want output from the traced process
+            if (std::getenv("FAKEPIN")) {
+                fprintf(output, "Hello from rank %d of %d, thread %d! (Launched by fakepin)\n", rank, nranks, thread);
+            } else if (std::getenv("PIN_CRT_TZDATA") || std::getenv("PIN_APP_LD_LIBRARY_PATH")) {
+                fprintf(output, "Hello from rank %d of %d, thread %d! (Launched by pin)\n", rank, nranks, thread);
+            } else {
+                fprintf(output, "Hello from rank %d of %d, thread %d!\n", rank, nranks, thread);
+            }
+        }
+    }
+
+    // This is here just to make sure it doesn't crash when the processes try to communicate.
+    MPI_Barrier(MPI_COMM_WORLD);
+    ariel_disable();
+    if (rank == 0) {
+        printf("Rank 0: Barrier complete.\n");
+    }
+
+    if (argc > 1) {
+        fclose(output);
+    }
+
+    MPI_Finalize();
+}
diff --git a/src/sst/elements/ariel/tests/testMPI/reduce.cc b/src/sst/elements/ariel/tests/testMPI/reduce.cc
new file mode 100644
index 0000000000..a889167dc1
--- /dev/null
+++ b/src/sst/elements/ariel/tests/testMPI/reduce.cc
@@ -0,0 +1,122 @@
+#include <mpi.h>
+#include <omp.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <chrono>
+#include <iostream>
+#include "arielapi.h"
+
+#define DEBUG 0
+#define TIMING 1
+
+int main(int argc, char* argv[]) {
+
+    int prov;
+    MPI_Init_thread(&argc, &argv, MPI_THREAD_FUNNELED, &prov);
+
+    if (argc < 2) {
+        printf("Too few args\n");
+        exit(1);
+    }
+
+    int len = atoi(argv[1]);
+
+    if (len < 1) {
+        printf("Please specify a positive len\n");
+        exit(1);
+    }
+
+    int rank = 0;
+    int nranks = 0;
+
+    int ret = MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    if (ret != MPI_SUCCESS) {
+        printf("Error: MPI_Comm_rank returned error: %d\n", ret);
+        exit(1);
+    }
+    ret = MPI_Comm_size(MPI_COMM_WORLD, &nranks);
+    if (ret != MPI_SUCCESS) {
+        printf("Error: MPI_Comm_size returned error: %d\n", ret);
+        exit(1);
+    }
+
+    if (len % nranks != 0) {
+        printf("MPI ranks must divide vector length (len = %d, nranks = %d)\n", len, nranks);
+        exit(1);
+    }
+
+    len = len / nranks;
+
+    FILE *output = stdout;
+
+    if (argc > 2) {
+        int len = strlen(argv[2]) + 12;  // Space for underscore, plus up to a 10 digit integer, plus the null character
+        char *outfile = (char*)malloc(len);
+        if (!outfile) {
+            printf("Error allocating space for filename\n");
+        }
+        snprintf(outfile, len, "%s_%d", argv[2], rank);
+
+        output = fopen(outfile, "w");
+        if (!output) {
+            printf("Unable to open %s\n", outfile);
+            exit(1);
+        }
+    }
+
+
+    int nthreads = omp_get_max_threads();
+
+#if DEBUG
+    printf("Running on %d ranks, %d threads per rank\n", nranks, nthreads);
+#endif
+
+    // Initialize
+    int *vec = (int*) malloc(sizeof(int) * len);
+    for (int i = 0; i < len; i++) {
+        vec[i] = rank*len + i;
+    }
+
+    ariel_enable();
+
+#if TIMING
+    auto begin = std::chrono::high_resolution_clock::now();
+#endif
+
+    long int sum = 0;
+    #pragma omp parallel for reduction(+:sum)
+    for (int i = 0; i < len; i++) {
+        sum += vec[i];
+    }
+
+#if TIMING
+    auto end = std::chrono::high_resolution_clock::now();
+    auto duration = std::chrono::duration_cast<std::chrono::nanoseconds>(end-begin).count();
+    if (rank == 0) {
+        std::cout << nranks << " " << nthreads << " " << duration/1000000 << "\n";
+    }
+#endif
+
+    ariel_disable();
+
+    long int tot = 0;
+    MPI_Allreduce(
+        &sum,
+        &tot,
+        1,
+        MPI_LONG,
+        MPI_SUM,
+        MPI_COMM_WORLD);
+
+    fprintf(output, "Rank %d partial sum is %ld, total sum is %ld\n", rank, sum, tot);
+
+    MPI_Barrier(MPI_COMM_WORLD);
+    if (rank == 0) {
+#if DEBUG
+        printf("Rank 0: Barrier complete.\n");
+#endif
+    }
+
+    MPI_Finalize();
+}
diff --git a/src/sst/elements/ariel/tests/testMPI/test-mpi.py b/src/sst/elements/ariel/tests/testMPI/test-mpi.py
new file mode 100644
index 0000000000..5c70fa70b2
--- /dev/null
+++ b/src/sst/elements/ariel/tests/testMPI/test-mpi.py
@@ -0,0 +1,130 @@
+import sst
+import sys
+import os
+import argparse
+
+parser = argparse.ArgumentParser(
+    prog='sst [sst-args] test-mpi.py --',
+    description='Used for testing Ariel\'s MPI features')
+
+parser.add_argument('program', help='Which program to run. Either "hello" or "reduce".')
+parser.add_argument('-r', dest='ranks', default=1, help='How many ranks of the traced program to run.')
+parser.add_argument('-a', dest='tracerank', default=0, help='Which of the MPI ranks will be traced.')
+parser.add_argument('-t', dest='threads', default=1, help='The number of OpenMP threads to use per rank.')
+parser.add_argument('-s', dest='size', default=2048, help='The input value for the "reduce" program')
+parser.add_argument('-o', dest='output', help='Optional argument to both programs to change stdout')
+
+args = parser.parse_args()
+
+ncores = int(args.threads)
+mpiranks = int(args.ranks)
+tracerank = int(args.tracerank)
+size = int(args.size)
+
+if args.program not in ['hello', 'reduce']:
+    print('Error: supported values for `program` are "hello" and "reduce".')
+    sys.exit(1)
+
+program_string = f'./{args.program}'
+if args.program == 'reduce':
+    program_string += f' (size={size})'
+
+print(f'test-mpi.py: Running {program_string} with {mpiranks} rank(s) and {ncores} thread(s) per rank. Tracing rank {tracerank}')
+
+os.environ['OMP_NUM_THREADS'] = str(ncores)
+
+
+#########################################################################
+## Declare components
+#########################################################################
+core = sst.Component("core", "ariel.ariel")
+cache = [sst.Component("cache_"+str(i), "memHierarchy.Cache") for i in range(ncores)]
+memctrl = sst.Component("memory", "memHierarchy.MemController")
+bus = sst.Component("bus", "memHierarchy.Bus")
+
+#########################################################################
+## Set component parameters and fill subcomponent slots
+#########################################################################
+# 2.4GHz cores.
One for each omp thread +core.addParams({ + "clock" : "2.4GHz", + "verbose" : 1, + "executable" : f"./{args.program}", + "arielmode" : 0, # Disable tracing at start + "corecount" : ncores, + "mpimode" : 1, + "mpiranks" : mpiranks, + "mpitracerank" : tracerank, +}) + +# Set the size of the reduce vector and optionally set the output file +if args.program == "reduce": + if args.output is not None: + core.addParams({ + "appargcount" : 2, + "apparg0" : size, + "apparg1" : args.output, + }) + else: + core.addParams({ + "appargcount" : 1, + "apparg0" : size, + }) +# Set the output file for the hello program +elif args.output is not None: + core.addParams({ + "appargcount" : 1, + "apparg0" : args.output, + }) + + +# Cache: L1, 2.4GHz, 2KB, 4-way set associative, 64B lines, LRU replacement, MESI coherence +for i in range(ncores): + cache[i].addParams({ + "L1" : 1, + "cache_frequency" : "2.4GHz", + "access_latency_cycles" : 2, + "cache_size" : "2KiB", + "associativity" : 4, + "replacement_policy" : "lru", + "coherence_policy" : "MESI", + "cache_line_size" : 64, + }) + +# Memory: 50ns access, 1GB +memctrl.addParams({ + "clock" : "1GHz", + "backing" : "none", # We're not using real memory values, just addresses + "addr_range_end" : 1024*1024*1024-1, +}) +memory = memctrl.setSubComponent("backend", "memHierarchy.simpleMem") +memory.addParams({ + "mem_size" : "1GiB", + "access_time" : "50ns", +}) + +bus.addParams({ + "bus_frequency": "2.0GHz", +}) + +######################################################################### +## Declare links +######################################################################### +core_cache = [sst.Link("core_to_cache_"+str(i)) for i in range(ncores)] +cache_bus = [sst.Link("cache_" + str(i) + "_to_bus") for i in range(ncores)] +bus_mem = sst.Link("bus_to_memory") + +######################################################################### +## Connect components with the links +######################################################################### +[core_cache[i].connect( (core, "cache_link_"+str(i), "100ps"), (cache[i], "high_network_0", "100ps") ) for i in range(ncores)] +[cache_bus[i].connect( (cache[i], "low_network_0", "100ps"), (bus, "high_network_"+str(i), "100ps") ) for i in range(ncores)] +bus_mem.connect( (bus, "low_network_0", "100ps"), (memctrl, "direct_link", "100ps") ) + +######################################################################### +## Define SST core options +######################################################################### +sst.setProgramOption("stop-at", "200ms") +sst.setStatisticOutput("sst.statoutputtxt") +sst.setStatisticOutputOptions( { "filepath" : "stats.csv" }) +sst.setStatisticLoadLevel(5) +sst.enableAllStatisticsForAllComponents() diff --git a/src/sst/elements/ariel/tests/testsuite_default_Ariel.py b/src/sst/elements/ariel/tests/testsuite_default_Ariel.py index 52e14ccf7f..1934ef8156 100644 --- a/src/sst/elements/ariel/tests/testsuite_default_Ariel.py +++ b/src/sst/elements/ariel/tests/testsuite_default_Ariel.py @@ -157,8 +157,9 @@ def ariel_Template(self, testcase, app="", testtimeout=480): line_count_diff = abs(num_ref_lines - num_out_lines - num_err_lines) log_debug("Line Count diff = {0}".format(line_count_diff)) - if line_count_diff > 15: - self.assertFalse(line_count_diff > 15, "Line count between output file {0} does not match Reference File {1}; They contain {2} different lines".format(outfile, reffile, line_count_diff)) + delta = 15 + if line_count_diff > delta: + self.assertFalse(line_count_diff > 15, f"Test 
stdout ({outfile}) and stderr ({errfile}) contain {num_out_lines}+{num_err_lines}={num_out_lines+num_err_lines} lines. Expected this to be within {delta} of the reference file ({reffile}), which has {num_ref_lines} lines, but the difference is {line_count_diff} lines.") ####################### diff --git a/src/sst/elements/ariel/tests/testsuite_mpi_Ariel.py b/src/sst/elements/ariel/tests/testsuite_mpi_Ariel.py new file mode 100644 index 0000000000..97a50887e2 --- /dev/null +++ b/src/sst/elements/ariel/tests/testsuite_mpi_Ariel.py @@ -0,0 +1,231 @@ +# -*- coding: utf-8 -*- + +from sst_unittest import * +from sst_unittest_support import * +import os +import inspect +import subprocess + +################################################################################ +# Code to support a single instance module initialize, must be called setUp method + +module_init = 0 +module_sema = threading.Semaphore() + +def initializeTestModule_SingleInstance(class_inst): + global module_init + global module_sema + + module_sema.acquire() + if module_init != 1: + try: + # Put your single instance Init Code Here + class_inst._setup_ariel_test_files() + except: + pass + module_init = 1 + module_sema.release() +################################################################################ +# Functions to support parsing the output of the MPI tests + +def get_reduce_string(rank, ranks, size=1024): + return [f"Rank {rank} partial sum is {sum(range(int(rank*(size/ranks)), int((rank+1)*(size/ranks))))}, total sum is {sum(range(size))}\n"] + +def get_hello_string(rank, ranks, tracerank, threads): + if rank == tracerank: + return [f"Hello from rank {rank} of {ranks}, thread {i}! (Launched by pin)\n" for i in range(threads)] + else: + return [f"Hello from rank {rank} of {ranks}, thread {i}!\n" for i in range(threads)] +################################################################################ + +class testcase_Ariel(SSTTestCase): + + def initializeClass(self, testName): + super(type(self), self).initializeClass(testName) + # Put test based setup code here. it is called before testing starts + # NOTE: This method is called once for every test + + def setUp(self): + super(type(self), self).setUp() + initializeTestModule_SingleInstance(self) + # Put test based setup code here. it is called once before every test + + def tearDown(self): + # Put test based teardown code here. it is called once after every test + super(type(self), self).tearDown() + + # Test that the output contains the specified line. Because the programs are + # Multithreaded, we cannot know ahead of time which line will match. The + # programs use #pragma critical so we expect that the output from each thread + # will be on its own line. + def file_contains(self, filename, strings): + with open(filename, 'r') as file: + lines = file.readlines() + for s in strings: + self.assertTrue(s in lines, "Output {0} does not contain expected line {1}".format(filename, s)) + + # Test that the stats file `filename` has a non-zero value for statistics `stat`. + def assert_nonzero_stat(self, filename, stat): + with open(filename, 'r') as file: + lines = file.readlines() + for ln in lines: + l = ln.split(' ') + if l[0] == stat: + stat_value = int(l[12].split(';')[0]) + self.assertTrue(stat_value > 0, f"Statistics file `{filename}` did not have a positive value for stat `{stat}`. Line was:\n\t{ln}") + + pin_loaded = testing_is_PIN_loaded() + pin_error_msg = "Ariel: Requires PIN, but Env Var 'INTEL_PIN_DIRECTORY' is not found or path does not exist." 
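+    # These flags feed the @unittest.skipIf decorators below: the MPI tests require PIN,
+    # a single-rank sst invocation, and (for the multi-threaded variants) OpenMP support.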
+ + multi_rank = testing_check_get_num_ranks() > 1 + multi_rank_error_msg = "Ariel: Ariel MPI tests are not compatible with multi-rank sst runs." + + using_osx = host_os_is_osx() + osx_error_msg = "Ariel: OpenMP is not supported on macOS" + + @unittest.skipIf(not pin_loaded, pin_error_msg) + @unittest.skipIf(multi_rank, multi_rank_error_msg) + def test_Ariel_mpi_hello_01(self): + self.ariel_Template(threads=1, ranks=1) + + @unittest.skipIf(not pin_loaded, pin_error_msg) + @unittest.skipIf(multi_rank, multi_rank_error_msg) + def test_Ariel_mpi_hello_02(self): + self.ariel_Template(threads=1, ranks=2) + + @unittest.skipIf(not pin_loaded, pin_error_msg) + @unittest.skipIf(multi_rank, multi_rank_error_msg) + @unittest.skipIf(using_osx, osx_error_msg) + def test_Ariel_mpi_hello_03(self): + self.ariel_Template(threads=2, ranks=1) + + @unittest.skipIf(not pin_loaded, pin_error_msg) + @unittest.skipIf(multi_rank, multi_rank_error_msg) + def test_Ariel_mpi_hello_04(self): + self.ariel_Template(threads=1, ranks=2, tracerank=1) + + @unittest.skipIf(not pin_loaded, pin_error_msg) + @unittest.skipIf(multi_rank, multi_rank_error_msg) + @unittest.skipIf(using_osx, osx_error_msg) + def test_Ariel_mpi_hello_05(self): + self.ariel_Template(threads=2, ranks=3, tracerank=1) + + @unittest.skipIf(not pin_loaded, pin_error_msg) + @unittest.skipIf(multi_rank, multi_rank_error_msg) + @unittest.skipIf(using_osx, osx_error_msg) + def test_Ariel_mpi_hello_06(self): + self.ariel_Template(threads=2, ranks=2) + + @unittest.skipIf(not pin_loaded, pin_error_msg) + @unittest.skipIf(multi_rank, multi_rank_error_msg) + def test_Ariel_mpi_reduce_01(self): + self.ariel_Template(threads=1, ranks=1, program="reduce") + + @unittest.skipIf(not pin_loaded, pin_error_msg) + @unittest.skipIf(multi_rank, multi_rank_error_msg) + @unittest.skipIf(using_osx, osx_error_msg) + def test_Ariel_mpi_reduce_02(self): + self.ariel_Template(threads=2, ranks=2, program="reduce") + + @unittest.skipIf(not pin_loaded, pin_error_msg) + @unittest.skipIf(multi_rank, multi_rank_error_msg) + @unittest.skipIf(using_osx, osx_error_msg) + def test_Ariel_mpi_reduce_03(self): + self.ariel_Template(threads=2, ranks=4, program="reduce", tracerank=1) + + def ariel_Template(self, threads, ranks, program="hello", tracerank=0, testtimeout=60, size=8000): + # Set the paths to the various directories + testcase = inspect.stack()[1][3] # name the test after the calling function + + # Get the path to the test files + test_path = self.get_testsuite_dir() + outdir = self.get_test_output_run_dir() + tmpdir = self.get_test_output_tmp_dir() + + # Set paths + ArielElementDir = os.path.abspath("{0}/../".format(test_path)) + ArielElementAPIDir = "{0}/api".format(ArielElementDir) + ArielElementTestMPIDir = "{0}/tests/testMPI".format(ArielElementDir) + + libpath = os.environ.get("LD_LIBRARY_PATH") + if libpath: + os.environ["LD_LIBRARY_PATH"] = ArielElementAPIDir + ":" + libpath + else: + os.environ["LD_LIBRARY_PATH"] = ArielElementAPIDir + + # Set the various file paths + testDataFileName=("{0}".format(testcase)) + + sdlfile = "{0}/test-mpi.py".format(ArielElementTestMPIDir) + outfile = "{0}/{1}.out".format(outdir, testDataFileName) + errfile = "{0}/{1}.err".format(outdir, testDataFileName) + mpioutfiles = "{0}/{1}.testfile".format(outdir, testDataFileName) + statfile = f"{ArielElementTestMPIDir}/stats.csv" + program_output = f"{tmpdir}/ariel_testmpi_{testcase}.out" + other_args = f'--model-options="{program} -o {program_output} -r {ranks} -t {threads} -a {tracerank} -s {size}"' 
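+        # Everything inside --model-options is forwarded to test-mpi.py's argparse:
+        # the program name plus the -o/-r/-t/-a/-s options it defines.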
+
+        log_debug("testcase = {0}".format(testcase))
+        log_debug("sdl file = {0}".format(sdlfile))
+        log_debug("out file = {0}".format(outfile))
+        log_debug("err file = {0}".format(errfile))
+
+        # Run SST in the tests directory
+        self.run_sst(sdlfile, outfile, errfile, set_cwd=ArielElementTestMPIDir,
+                     mpi_out_files=mpioutfiles, timeout_sec=testtimeout, other_args=other_args)
+
+        # Each rank will have its own output file
+        # We will examine all of them.
+
+        # These programs are designed to output a separate file for each rank,
+        # and append the rank id to the argument
+        program_output_files = [f"{program_output}_{i}" for i in range(ranks)]
+
+        # Look for the word "FATAL" in the sst output file
+        cmd = 'grep "FATAL" {0} '.format(outfile)
+        grep_result = os.system(cmd) != 0
+        self.assertTrue(grep_result, "Output file {0} contains the word 'FATAL'...".format(outfile))
+
+        # Test for expected output
+        for i in range(ranks):
+            if program == "hello":
+                self.file_contains(f'{program_output}_{i}', get_hello_string(i, ranks, tracerank, threads))
+            else:
+                self.file_contains(f'{program_output}_{i}', get_reduce_string(i, ranks, size))
+
+        # Test to make sure that each core did some work and sent something to its L1
+        for i in range(threads):
+            self.assert_nonzero_stat(statfile, f"core.read_requests.{i}")
+            self.assert_nonzero_stat(statfile, f"cache_{i}.CacheMisses")
+
+
+#######################
+
+    def _setup_ariel_test_files(self):
+        # NOTE: This routine is called a single time at module startup, so it
+        # may have some redundant setup code
+        log_debug("_setup_ariel_test_files() Running")
+        test_path = self.get_testsuite_dir()
+        outdir = self.get_test_output_run_dir()
+
+        # Set the paths to the various directories
+        self.ArielElementDir = os.path.abspath("{0}/../".format(test_path))
+        self.ArielElementTestMPIDir = "{0}/tests/testMPI".format(self.ArielElementDir)
+
+        # Build the Ariel API library with mpicc
+        ArielApiDir = "{0}/api".format(self.ArielElementDir)
+        cmd = "make clean"
+        OSCommand(cmd, set_cwd=ArielApiDir).run()
+        cmd = "CC=mpicc make"
+        rtn0 = OSCommand(cmd, set_cwd=ArielApiDir).run()
+        log_debug("Ariel api/libarielapi.so Make result = {0}; output =\n{1}".format(rtn0.result(), rtn0.output()))
+        os.environ["ARIELAPI"] = ArielApiDir
+
+        # Build the test mpi programs
+        cmd = "make"
+        rtn1 = OSCommand(cmd, set_cwd=self.ArielElementTestMPIDir).run()
+        log_debug("Ariel tests/testMPI make result = {0}; output =\n{1}".format(rtn1.result(), rtn1.output()))
+
+        # Check that everything compiled OK
+        self.assertTrue(rtn0.result() == 0, "libarielapi failed to compile")
+        self.assertTrue(rtn1.result() == 0, "mpi test binaries failed to compile")
+
diff --git a/src/sst/elements/ariel/tests/testsuite_testio_Ariel.py b/src/sst/elements/ariel/tests/testsuite_testio_Ariel.py
index 2b0fc42203..3008429099 100644
--- a/src/sst/elements/ariel/tests/testsuite_testio_Ariel.py
+++ b/src/sst/elements/ariel/tests/testsuite_testio_Ariel.py
@@ -55,7 +55,7 @@ def file_contains(self, filename, strings):
 
     pin_loaded = testing_is_PIN_loaded()
-    pin_error_msg = "Ariel: Requires PIN, but Env Var 'INTEL_PIN_DIR' is not found or path does not exist."
+    pin_error_msg = "Ariel: Requires PIN, but Env Var 'INTEL_PIN_DIRECTORY' is not found or path does not exist."
 
     # This is not an exhausitve list of tests, but it covers most of the options.