Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(profiler): allow NUM_WORKERS=0 #587

Merged
merged 1 commit into from
May 16, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docs/setup/discopop.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ where `<CMAKE_FLAGS>` can consist of any combination of the following flags and
- In case you want to use a specific LLVM installation, specify the location via the `-DLLVM_DIST_PATH=<llvm_base_dir>` flag.
- In case your application uses PThreads, please specify `-DDP_PTHREAD_COMPATIBILITY_MODE=[0|1]`. Note, however, that this can influence the runtime of the profiling.
- In case you require a more verbose output of the runtime library, specify the `-DDP_RTLIB_VERBOSE=[0|1]` flag.
- In case you want to specify the number of Workers available for the profiling step, specify the `-DDP_NUM_WORKERS=<int>` flag.
- In case you want to specify the number of worker threads available for the profiling step, specify the `-DDP_NUM_WORKERS=<int>` flag. By default, `3` worker threads are used to analyze the observed memory accesses. Set the value to `0` to disable the creation of additional threads for the analysis.

## Testing the installation
To test the installation, it is possible to execute the provided set of unit tests.
Expand Down
4 changes: 1 addition & 3 deletions rtlib/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -60,9 +60,7 @@ if(DEFINED DP_RTLIB_VERBOSE)
endif()

if(DEFINED DP_NUM_WORKERS)
if(NOT ${DP_NUM_WORKERS} EQUAL 0)
target_compile_definitions(DiscoPoP_RT PUBLIC DP_NUM_WORKERS=${DP_NUM_WORKERS})
endif()
target_compile_definitions(DiscoPoP_RT PUBLIC DP_NUM_WORKERS=${DP_NUM_WORKERS})
endif()

# end of compiler flags
Expand Down
8 changes: 7 additions & 1 deletion rtlib/functions/dp_finalize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,13 @@ void __dp_finalize(LID lid) {
<< ", clearing up" << endl;
}

finalizeParallelization();
if(NUM_WORKERS > 0){
finalizeParallelization();
}
else{
finalizeSingleThreadedExecution();
}

outputLoops();
outputFuncs();
outputAllocations();
Expand Down
8 changes: 7 additions & 1 deletion rtlib/functions/dp_func_entry.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,13 @@ void __dp_func_entry(LID lid, int32_t isStart) {
cout << "DP initialized at LID " << std::dec << dputil::decodeLID(lid) << endl;
}
dpInited = true;
initParallelization();
if(NUM_WORKERS > 0){
initParallelization();
}
else{
initSingleThreadedExecution();
}

} else if (targetTerminated) {
if (DP_DEBUG) {
cout << "Entering function LID " << std::dec << dputil::decodeLID(lid);
Expand Down
9 changes: 9 additions & 0 deletions rtlib/functions/dp_read.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -87,9 +87,14 @@ void __dp_read(LID lid, ADDR addr, char *var) {
timers->stop_and_add(TimerRegion::STACK_CHECK_READ_ACCESS);
// !TEST

#if defined DP_NUM_WORKERS && DP_NUM_WORKERS == 0
AccessInfo current;
#else
int64_t workerID =
((addr - (addr % 4)) % (NUM_WORKERS * 4)) / 4; // implicit "floor"
AccessInfo &current = tempAddrChunks[workerID][tempAddrCount[workerID]++];

#endif
current.isRead = true;
current.lid = lid;
current.var = var;
Expand Down Expand Up @@ -158,6 +163,9 @@ void __dp_read(LID lid, ADDR addr, char *var) {
current.lid = current.lid | (((LID)0xFF) << 56);
}

#if defined DP_NUM_WORKERS && DP_NUM_WORKERS == 0
analyzeSingleAccess(singleThreadedExecutionSMem, current);
#else
if (tempAddrCount[workerID] == CHUNK_SIZE) {
pthread_mutex_lock(&addrChunkMutexes[workerID]);
addrChunkPresent[workerID] = true;
Expand All @@ -167,6 +175,7 @@ void __dp_read(LID lid, ADDR addr, char *var) {
tempAddrChunks[workerID] = new AccessInfo[CHUNK_SIZE];
tempAddrCount[workerID] = 0;
}
#endif
#ifdef DP_RTLIB_VERBOSE
cout << "exit __dp_read\n";
#endif
Expand Down
13 changes: 11 additions & 2 deletions rtlib/functions/dp_write.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -91,9 +91,13 @@ void __dp_write(LID lid, ADDR addr, char *var) {
timers->stop_and_add(TimerRegion::STACK_CHECK_WRITE_ACCESS);
// !TEST

int64_t workerID =
((addr - (addr % 4)) % (NUM_WORKERS * 4)) / 4; // implicit "floor"
#if defined DP_NUM_WORKERS && DP_NUM_WORKERS == 0
AccessInfo current;
#else
int64_t workerID = ((addr - (addr % 4)) % (NUM_WORKERS * 4)) / 4; // implicit "floor"
AccessInfo &current = tempAddrChunks[workerID][tempAddrCount[workerID]++];
#endif

current.isRead = false;
current.lid = lid;
current.var = var;
Expand Down Expand Up @@ -162,6 +166,9 @@ void __dp_write(LID lid, ADDR addr, char *var) {
current.lid = current.lid | (((LID)0xFF) << 56);
}

#if defined DP_NUM_WORKERS && DP_NUM_WORKERS == 0
analyzeSingleAccess(singleThreadedExecutionSMem, current);
#else
if (tempAddrCount[workerID] == CHUNK_SIZE) {
pthread_mutex_lock(&addrChunkMutexes[workerID]);
addrChunkPresent[workerID] = true;
Expand All @@ -171,6 +178,8 @@ void __dp_write(LID lid, ADDR addr, char *var) {
tempAddrChunks[workerID] = new AccessInfo[CHUNK_SIZE];
tempAddrCount[workerID] = 0;
}
#endif

#ifdef DP_RTLIB_VERBOSE
cout << "exit __dp_write\n";
#endif
Expand Down
139 changes: 91 additions & 48 deletions rtlib/iFunctions.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -651,6 +651,44 @@ void initParallelization() {
timers->stop_and_add(TimerRegion::INIT_PARALLELIZATION);
}

// Sets up the state needed when the profiler runs without worker threads
// (NUM_WORKERS == 0): allocates the single shadow memory that memory accesses
// are analyzed against and the calling thread's dependence map (myMap).
// Called instead of initParallelization() in that configuration.
void initSingleThreadedExecution() {
#ifdef DP_RTLIB_VERBOSE
  cout << "enter initSingleThreadedExecution\n";
#endif
  // Account this set-up phase to the initialization region, mirroring
  // initParallelization(). Every start() is paired with a stop_and_add() so
  // the region is never left running.
  timers->start(TimerRegion::INIT_PARALLELIZATION);

  // USE_PERFECT selects the PerfectShadow implementation; otherwise the
  // ShadowMemory implementation is used. Both receive the same signature
  // configuration.
  if (USE_PERFECT) {
    singleThreadedExecutionSMem = new PerfectShadow(SIG_ELEM_BIT, SIG_NUM_ELEM, SIG_NUM_HASH);
  } else {
    singleThreadedExecutionSMem = new ShadowMemory(SIG_ELEM_BIT, SIG_NUM_ELEM, SIG_NUM_HASH);
  }
  myMap = new depMap();

  timers->stop_and_add(TimerRegion::INIT_PARALLELIZATION);

#ifdef DP_RTLIB_VERBOSE
  cout << "exit initSingleThreadedExecution\n";
#endif
}

// Tears down the state created for single-threaded profiling
// (NUM_WORKERS == 0): releases the shadow memory and then calls mergeDeps().
// Called instead of finalizeParallelization() in that configuration.
void finalizeSingleThreadedExecution() {
#ifdef DP_RTLIB_VERBOSE
  cout << "enter finalizeSingleThreadedExecution\n";
#endif
  if (DP_DEBUG) {
    cout << "BEGIN: finalize Single Threaded Execution... \n";
  }

  delete singleThreadedExecutionSMem;
  // Do not leave the global dangling: a repeated finalize then deletes a null
  // pointer (a no-op) instead of freeing the same object twice.
  singleThreadedExecutionSMem = nullptr;

  mergeDeps();

  if (DP_DEBUG) {
    cout << "END: finalize Single Threaded Execution... \n";
  }
#ifdef DP_RTLIB_VERBOSE
  cout << "exit finalizeSingleThreadedExecution\n";
#endif
}

string getMemoryRegionIdFromAddr(string fallback, ADDR addr) {
timers->start(TimerRegion::GET_MEMORY_REGION_ID_FROM_ADDR);

Expand Down Expand Up @@ -722,6 +760,57 @@ void mergeDeps() {
pthread_mutex_unlock(&allDepsLock);
}

// Classifies one memory access against the given shadow memory and records
// the resulting data dependence, if any, via addDep():
//   - write, no earlier write recorded       -> INIT
//   - write, an earlier read is recorded     -> WAR (read entry is reset to 0)
//   - write, earlier write but no read       -> WAW
//   - read, an earlier write is recorded     -> RAW
// A read flagged with `skip` is only inserted into the read set.
// The whole call is accounted to TimerRegion::ANALYZE_SINGLE_ACCESS.
void analyzeSingleAccess(__dp::Shadow* SMem, __dp::AccessInfo& access){
  timers->start(TimerRegion::ANALYZE_SINGLE_ACCESS);

  // ---- write access ----
  if (!access.isRead) {
    // insertToWrite records this write and yields the previously stored entry.
    const sigElement previousWrite = SMem->insertToWrite(access.addr, access.lid);

    if (previousWrite == 0) {
      // INIT
      addDep(INIT, access.lid, 0, access.var, access.AAvar,
             access.isStackAccess, access.addr,
             access.addrIsOwnedByScope,
             access.positiveScopeChangeOccuredSinceLastAccess);
    } else {
      const sigElement previousRead = SMem->testInRead(access.addr);
      if (previousRead == 0) {
        // WAW
        addDep(WAW, access.lid, previousWrite, access.var, access.AAvar,
               access.isStackAccess, access.addr,
               access.addrIsOwnedByScope,
               access.positiveScopeChangeOccuredSinceLastAccess);
      } else {
        // WAR
        addDep(WAR, access.lid, previousRead, access.var, access.AAvar,
               access.isStackAccess, access.addr,
               access.addrIsOwnedByScope,
               access.positiveScopeChangeOccuredSinceLastAccess);
        // Clear intermediate read ops
        SMem->insertToRead(access.addr, 0);
      }
    }

    timers->stop_and_add(TimerRegion::ANALYZE_SINGLE_ACCESS);
    return;
  }

  // ---- read access ----
  // hybrid analysis: skipped reads are recorded but never produce a dependence
  if (access.skip) {
    SMem->insertToRead(access.addr, access.lid);
    timers->stop_and_add(TimerRegion::ANALYZE_SINGLE_ACCESS);
    return;
  }
  // End HA

  const sigElement previousWrite = SMem->testInWrite(access.addr);
  if (previousWrite != 0) {
    // RAW
    SMem->insertToRead(access.addr, access.lid);
    addDep(RAW, access.lid, previousWrite, access.var, access.AAvar,
           access.isStackAccess, access.addr,
           access.addrIsOwnedByScope,
           access.positiveScopeChangeOccuredSinceLastAccess);
  }

  timers->stop_and_add(TimerRegion::ANALYZE_SINGLE_ACCESS);
}

void *analyzeDeps(void *arg) {
timers->start(TimerRegion::ANALYZE_DEPS);

Expand Down Expand Up @@ -754,56 +843,10 @@ void *analyzeDeps(void *arg) {
AccessInfo access;

// analyze data dependences

for (unsigned short i = 0; i < CHUNK_SIZE; ++i) {
timers->start(TimerRegion::ANALYZE_DEPS_INNER);

access = accesses[i];

if (access.isRead) {
// hybrid analysis
if (access.skip) {
SMem->insertToRead(access.addr, access.lid);
timers->stop_and_add(TimerRegion::ANALYZE_DEPS_INNER);
continue;
}
// End HA
sigElement lastWrite = SMem->testInWrite(access.addr);
if (lastWrite != 0) {
// RAW
SMem->insertToRead(access.addr, access.lid);
addDep(RAW, access.lid, lastWrite, access.var, access.AAvar,
access.isStackAccess, access.addr,
access.addrIsOwnedByScope,
access.positiveScopeChangeOccuredSinceLastAccess);
}
} else {
sigElement lastWrite = SMem->insertToWrite(access.addr, access.lid);
if (lastWrite == 0) {
// INIT
addDep(INIT, access.lid, 0, access.var, access.AAvar,
access.isStackAccess, access.addr,
access.addrIsOwnedByScope,
access.positiveScopeChangeOccuredSinceLastAccess);
} else {
sigElement lastRead = SMem->testInRead(access.addr);
if (lastRead != 0) {
// WAR
addDep(WAR, access.lid, lastRead, access.var, access.AAvar,
access.isStackAccess, access.addr,
access.addrIsOwnedByScope,
access.positiveScopeChangeOccuredSinceLastAccess);
// Clear intermediate read ops
SMem->insertToRead(access.addr, 0);
} else {
// WAW
addDep(WAW, access.lid, lastWrite, access.var, access.AAvar,
access.isStackAccess, access.addr,
access.addrIsOwnedByScope,
access.positiveScopeChangeOccuredSinceLastAccess);
}
}
}
timers->stop_and_add(TimerRegion::ANALYZE_DEPS_INNER);
analyzeSingleAccess(SMem, access);
}

// delete the current chunk at the end
Expand Down
7 changes: 7 additions & 0 deletions rtlib/iFunctions.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@

#include "DPTypes.hpp"
#include "iFunctionsTypes.hpp"
#include "abstract_shadow.hpp"

#include <string>

Expand All @@ -36,13 +37,19 @@ void readRuntimeInfo();

void initParallelization();

void initSingleThreadedExecution();

void mergeDeps();

void *analyzeDeps(void *arg);

void analyzeSingleAccess(__dp::Shadow* SMem, __dp::AccessInfo& access);

std::string getMemoryRegionIdFromAddr(std::string fallback, ADDR addr);

void finalizeParallelization();

void finalizeSingleThreadedExecution();

void clearStackAccesses(ADDR stack_lower_bound, ADDR stack_upper_bound);
} // namespace __dp
2 changes: 2 additions & 0 deletions rtlib/iFunctionsGlobals.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,8 @@ int32_t NUM_WORKERS = DP_NUM_WORKERS;
int32_t NUM_WORKERS = 3; // default number of worker threads (multiple workers
// can potentially lead to non-deterministic results)
#endif
#pragma message "Profiler: set NUM_WORKERS to " XSTR(NUM_WORKERS)
// Definition of the shadow memory used when the analysis runs without worker
// threads. No `extern` here: an initialized declaration is already a
// definition, and `extern` with an initializer triggers a compiler warning.
Shadow* singleThreadedExecutionSMem = nullptr; // used if NUM_WORKERS==0
int32_t CHUNK_SIZE = 500; // default number of addresses in each chunk
std::queue<AccessInfo *> *chunks =
nullptr; // one queue of access info chunks for each worker thread
Expand Down
3 changes: 3 additions & 0 deletions rtlib/iFunctionsGlobals.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
#include "scope.hpp"
#include "MemoryRegionTree.hpp"
#include "../share/include/timer.hpp"
#include "shadow.hpp"

#include <pthread.h>

Expand Down Expand Up @@ -93,6 +94,7 @@ extern pthread_mutex_t allDepsLock;
extern pthread_t *workers; // worker threads

extern int32_t NUM_WORKERS;
extern Shadow* singleThreadedExecutionSMem;

extern int32_t CHUNK_SIZE; // default number of addresses in each chunk
extern std::queue<AccessInfo *> *chunks; // one queue of access info chunks for each worker thread
Expand All @@ -106,4 +108,5 @@ extern int32_t *tempAddrCount; // tempAddrCount[thread_id] denotes the current n
extern bool stop; // ONLY set stop to true if no more accessed addresses will
// be collected
extern thread_local depMap *myMap;

} // namespace __dp
4 changes: 2 additions & 2 deletions share/include/timer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ enum class TimerRegion : unsigned int {
GET_MEMORY_REGION_ID_FROM_ADDR,
MERGE_DEPS,
ANALYZE_DEPS,
ANALYZE_DEPS_INNER,
ANALYZE_SINGLE_ACCESS,
FINALIZE_PARALLELIZATION,
CLEAR_STACK_ACCESSES,

Expand Down Expand Up @@ -203,7 +203,7 @@ class Timers {
print(stream, " Add a dependency : ", TimerRegion::ADD_DEP);
print(stream, " Merge dendencies : ", TimerRegion::MERGE_DEPS);
print(stream, " Analyze the dependencies (incorrect! : ", TimerRegion::ANALYZE_DEPS); // Incorrect due to multithreading
print(stream, " Analyze the dependencies (inner) : ", TimerRegion::ANALYZE_DEPS_INNER);
print(stream, " Analyze single accesses : ", TimerRegion::ANALYZE_SINGLE_ACCESS);
stream << '\n';
print(stream, " Output the dependencies : ", TimerRegion::OUTPUT_DEPS);
print(stream, " Output the loops : ", TimerRegion::OUTPUT_LOOPS);
Expand Down
Loading