From 3839a78ff3ce01765f9e6d3529936317403fd9c7 Mon Sep 17 00:00:00 2001
From: d-netto
Date: Wed, 25 Sep 2024 15:24:18 -0300
Subject: [PATCH] all tasks profiler

---
 src/gc-alloc-profiler.cpp       |   2 +-
 src/gc-stock.c                  |  24 ++++-
 src/gf.c                        |   2 +-
 src/init.c                      |   4 +-
 src/julia_internal.h            |  20 +++-
 src/julia_threads.h             |   1 +
 src/mtarraylist.c               |   2 +-
 src/signal-handling.c           | 120 ++++++++++++++++++----
 src/signals-mach.c              | 176 +++++++++++++++++---------------
 src/signals-unix.c              | 170 +++++++++++++++++-------------
 src/signals-win.c               |  39 ++++---
 src/stackwalk.c                 |  57 ++++++++---
 src/task.c                      |  36 ++++++-
 stdlib/Profile/src/Profile.jl   |  77 ++++++++++----
 stdlib/Profile/test/runtests.jl |  61 +++++++++--
 15 files changed, 555 insertions(+), 236 deletions(-)

diff --git a/src/gc-alloc-profiler.cpp b/src/gc-alloc-profiler.cpp
index 5b462d48cd2de..cfac125e18632 100644
--- a/src/gc-alloc-profiler.cpp
+++ b/src/gc-alloc-profiler.cpp
@@ -60,7 +60,7 @@ jl_raw_backtrace_t get_raw_backtrace() JL_NOTSAFEPOINT {
         ptls->profiling_bt_buffer = shared_bt_data_buffer;
     }
 
-    size_t bt_size = rec_backtrace(shared_bt_data_buffer, JL_MAX_BT_SIZE, 2);
+    size_t bt_size = rec_backtrace(shared_bt_data_buffer, JL_MAX_BT_SIZE, 2, 0);
 
     // Then we copy only the needed bytes out of the buffer into our profile.
     size_t bt_bytes = bt_size * sizeof(jl_bt_element_t);
diff --git a/src/gc-stock.c b/src/gc-stock.c
index 541c5b4ecc5c2..43d219b9c2267 100644
--- a/src/gc-stock.c
+++ b/src/gc-stock.c
@@ -1025,7 +1025,22 @@ void gc_sweep_wait_for_all_stacks(void) JL_NOTSAFEPOINT
     }
 }
 
-void sweep_stack_pools(jl_ptls_t ptls) JL_NOTSAFEPOINT
+void sweep_mtarraylist_buffers(void) JL_NOTSAFEPOINT
+{
+    for (int i = 0; i < gc_n_threads; i++) {
+        jl_ptls_t ptls = gc_all_tls_states[i];
+        if (ptls == NULL) {
+            continue;
+        }
+        arraylist_t *buffers = &ptls->lazily_freed_mtarraylist_buffers;
+        void *buf;
+        while ((buf = arraylist_pop(buffers)) != NULL) {
+            free(buf);
+        }
+    }
+}
+
+void sweep_stack_pools_and_mtarraylist_buffers(jl_ptls_t ptls) JL_NOTSAFEPOINT
 {
     // initialize ptls index for parallel sweeping of stack pools
     assert(gc_n_threads);
@@ -1035,9 +1050,12 @@ void sweep_stack_pools(jl_ptls_t ptls) JL_NOTSAFEPOINT
     else
         jl_atomic_store_relaxed(&gc_stack_free_idx, stack_free_idx + 1);
     jl_atomic_store_release(&gc_ptls_sweep_idx, gc_n_threads - 1); // idx == gc_n_threads = release stacks to the OS so it's serial
+    uv_mutex_lock(&bt_data_prof_lock);
     gc_sweep_wake_all_stacks(ptls);
     sweep_stack_pool_loop();
     gc_sweep_wait_for_all_stacks();
+    sweep_mtarraylist_buffers();
+    uv_mutex_unlock(&bt_data_prof_lock);
 }
 
 static void gc_pool_sync_nfree(jl_gc_pagemeta_t *pg, jl_taggedvalue_t *last) JL_NOTSAFEPOINT
@@ -3084,7 +3102,7 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection)
         current_sweep_full = sweep_full;
         sweep_weak_refs();
         uint64_t stack_pool_time = jl_hrtime();
-        sweep_stack_pools(ptls);
+        sweep_stack_pools_and_mtarraylist_buffers(ptls);
         stack_pool_time = jl_hrtime() - stack_pool_time;
         gc_num.total_stack_pool_sweep_time += stack_pool_time;
         gc_num.stack_pool_sweep_time = stack_pool_time;
@@ -3453,6 +3471,8 @@ void jl_init_thread_heap(jl_ptls_t ptls)
     jl_atomic_store_relaxed(&q->bottom, 0);
     jl_atomic_store_relaxed(&q->array, wsa2);
     arraylist_new(&mq->reclaim_set, 32);
+    // Initialize `lazily_freed_mtarraylist_buffers`
+    arraylist_new(&ptls->lazily_freed_mtarraylist_buffers, 0);
 
     memset(&ptls->gc_tls_common.gc_num, 0, sizeof(ptls->gc_tls_common.gc_num));
     jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.allocd, -(int64_t)gc_num.interval);
diff --git a/src/gf.c b/src/gf.c
index 285942cd157c5..3e65072822fb3 100644
--- a/src/gf.c
+++ b/src/gf.c
@@ -2352,7 +2352,7 @@ static void JL_NORETURN jl_method_error_bare(jl_value_t *f, jl_value_t *args, si
     jl_static_show((JL_STREAM*)STDERR_FILENO,(jl_value_t*)f); jl_printf((JL_STREAM*)STDERR_FILENO," world %u\n", (unsigned)world);
     jl_static_show((JL_STREAM*)STDERR_FILENO,args); jl_printf((JL_STREAM*)STDERR_FILENO,"\n");
     jl_ptls_t ptls = jl_current_task->ptls;
-    ptls->bt_size = rec_backtrace(ptls->bt_data, JL_MAX_BT_SIZE, 0);
+    ptls->bt_size = rec_backtrace(ptls->bt_data, JL_MAX_BT_SIZE, 0, 0);
     jl_critical_error(0, 0, NULL, jl_current_task);
     abort();
 }
diff --git a/src/init.c b/src/init.c
index 413d4e8055e54..b29d3640ea336 100644
--- a/src/init.c
+++ b/src/init.c
@@ -744,6 +744,8 @@ JL_DLLEXPORT void julia_init(JL_IMAGE_SEARCH rel)
     // initialize symbol-table lock
     uv_mutex_init(&symtab_lock);
+    // initialize the profiler buffer lock
+    uv_mutex_init(&bt_data_prof_lock);
 
     // initialize backtraces
     jl_init_profile_lock();
@@ -758,7 +760,7 @@ JL_DLLEXPORT void julia_init(JL_IMAGE_SEARCH rel)
     // nongnu libunwind initialization is only threadsafe on architecture where the
     // author could access TSAN, per https://github.com/libunwind/libunwind/pull/109
     // so we need to do this once early (before threads)
-    rec_backtrace(NULL, 0, 0);
+    rec_backtrace(NULL, 0, 0, 0);
 #endif
     libsupport_init();
diff --git a/src/julia_internal.h b/src/julia_internal.h
index 8c4ee9fca36e0..873ecd960df28 100644
--- a/src/julia_internal.h
+++ b/src/julia_internal.h
@@ -211,6 +211,24 @@ JL_DLLEXPORT void jl_unlock_profile_wr(void) JL_NOTSAFEPOINT JL_NOTSAFEPOINT_LEA
 int jl_lock_stackwalk(void) JL_NOTSAFEPOINT JL_NOTSAFEPOINT_ENTER;
 void jl_unlock_stackwalk(int lockret) JL_NOTSAFEPOINT JL_NOTSAFEPOINT_LEAVE;
 
+arraylist_t *jl_get_all_tasks_arraylist(void) JL_NOTSAFEPOINT;
+typedef struct {
+    size_t bt_size;
+    int tid;
+} jl_record_backtrace_result_t;
+JL_DLLEXPORT jl_record_backtrace_result_t jl_record_backtrace(jl_task_t *t, struct _jl_bt_element_t *bt_data,
+                                                              size_t max_bt_size, int all_tasks_profiler) JL_NOTSAFEPOINT;
+extern volatile struct _jl_bt_element_t *profile_bt_data_prof;
+extern volatile size_t profile_bt_size_max;
+extern volatile size_t profile_bt_size_cur;
+extern volatile int profile_running;
+extern volatile int profile_all_tasks;
+extern uv_mutex_t bt_data_prof_lock;
+#define PROFILE_STATE_THREAD_NOT_SLEEPING (1)
+#define PROFILE_STATE_THREAD_SLEEPING (2)
+#define PROFILE_STATE_WALL_TIME_PROFILING (3)
+void jl_profile_task(void);
+
 // number of cycles since power-on
 static inline uint64_t cycleclock(void) JL_NOTSAFEPOINT
 {
@@ -1332,7 +1350,7 @@ typedef unw_cursor_t bt_cursor_t;
 typedef int bt_context_t;
 typedef int bt_cursor_t;
 #endif
-size_t rec_backtrace(jl_bt_element_t *bt_data, size_t maxsize, int skip) JL_NOTSAFEPOINT;
+size_t rec_backtrace(jl_bt_element_t *bt_data, size_t maxsize, int skip, int no_gcstack) JL_NOTSAFEPOINT;
 // Record backtrace from a signal handler. `ctx` is the context of the code
 // which was asynchronously interrupted.
 size_t rec_backtrace_ctx(jl_bt_element_t *bt_data, size_t maxsize, bt_context_t *ctx,
diff --git a/src/julia_threads.h b/src/julia_threads.h
index 67da2978b4267..14ebdaccae8b0 100644
--- a/src/julia_threads.h
+++ b/src/julia_threads.h
@@ -157,6 +157,7 @@ typedef struct _jl_tls_states_t {
     int finalizers_inhibited;
     jl_gc_tls_states_t gc_tls; // this is very large, and the offset of the first member is baked into codegen
     jl_gc_tls_states_common_t gc_tls_common; // common tls for both GCs
+    arraylist_t lazily_freed_mtarraylist_buffers;
     volatile sig_atomic_t defer_signal;
     _Atomic(struct _jl_task_t*) current_task;
     struct _jl_task_t *next_task;
diff --git a/src/mtarraylist.c b/src/mtarraylist.c
index 8bad44797dab4..7401f7f8d9dfa 100644
--- a/src/mtarraylist.c
+++ b/src/mtarraylist.c
@@ -37,7 +37,7 @@ static void mtarraylist_resizeto(small_mtarraylist_t *a, size_t len, size_t newl
         a->max = nm;
         if (olditems != (void*)&a->_space[0]) {
             jl_task_t *ct = jl_current_task;
-            jl_gc_add_quiescent(ct->ptls, (void**)olditems, free);
+            arraylist_push(&ct->ptls->lazily_freed_mtarraylist_buffers, olditems);
         }
     }
 }
diff --git a/src/signal-handling.c b/src/signal-handling.c
index ce7e8ba57af19..30d751059ec33 100644
--- a/src/signal-handling.c
+++ b/src/signal-handling.c
@@ -18,46 +18,47 @@ extern "C" {
 #include
 
 // Profiler control variables
-// Note: these "static" variables are also used in "signals-*.c"
-static volatile jl_bt_element_t *bt_data_prof = NULL;
-static volatile size_t bt_size_max = 0;
-static volatile size_t bt_size_cur = 0;
+uv_mutex_t bt_data_prof_lock;
+volatile jl_bt_element_t *profile_bt_data_prof = NULL;
+volatile size_t profile_bt_size_max = 0;
+volatile size_t profile_bt_size_cur = 0;
 static volatile uint64_t nsecprof = 0;
-static volatile int running = 0;
-static const uint64_t GIGA = 1000000000ULL;
+volatile int profile_running = 0;
+volatile int profile_all_tasks = 0;
+static const uint64_t GIGA = 1000000000ULL;
 // Timers to take samples at intervals
 JL_DLLEXPORT void jl_profile_stop_timer(void);
-JL_DLLEXPORT int jl_profile_start_timer(void);
+JL_DLLEXPORT int jl_profile_start_timer(uint8_t);
 
 ///////////////////////
 // Utility functions //
 ///////////////////////
 JL_DLLEXPORT int jl_profile_init(size_t maxsize, uint64_t delay_nsec)
 {
-    bt_size_max = maxsize;
+    profile_bt_size_max = maxsize;
     nsecprof = delay_nsec;
-    if (bt_data_prof != NULL)
-        free((void*)bt_data_prof);
-    bt_data_prof = (jl_bt_element_t*) calloc(maxsize, sizeof(jl_bt_element_t));
-    if (bt_data_prof == NULL && maxsize > 0)
+    if (profile_bt_data_prof != NULL)
+        free((void*)profile_bt_data_prof);
+    profile_bt_data_prof = (jl_bt_element_t*) calloc(maxsize, sizeof(jl_bt_element_t));
+    if (profile_bt_data_prof == NULL && maxsize > 0)
         return -1;
-    bt_size_cur = 0;
+    profile_bt_size_cur = 0;
     return 0;
 }
 
 JL_DLLEXPORT uint8_t *jl_profile_get_data(void)
 {
-    return (uint8_t*) bt_data_prof;
+    return (uint8_t*) profile_bt_data_prof;
 }
 
 JL_DLLEXPORT size_t jl_profile_len_data(void)
 {
-    return bt_size_cur;
+    return profile_bt_size_cur;
 }
 
 JL_DLLEXPORT size_t jl_profile_maxlen_data(void)
 {
-    return bt_size_max;
+    return profile_bt_size_max;
 }
 
 JL_DLLEXPORT uint64_t jl_profile_delay_nsec(void)
@@ -67,12 +68,12 @@ JL_DLLEXPORT uint64_t jl_profile_delay_nsec(void)
 
 JL_DLLEXPORT void jl_profile_clear_data(void)
 {
-    bt_size_cur = 0;
+    profile_bt_size_cur = 0;
 }
 
 JL_DLLEXPORT int jl_profile_is_running(void)
 {
-    return running;
+    return profile_running;
 }
 
 // Any function that acquires this lock must be either a unmanaged thread
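Note: the renamed `profile_*` globals above are what the C getters expose to Julia. A hedged sketch of querying them directly via `ccall` (all three entry points are `JL_DLLEXPORT`ed in this file; names match the diff):

    len     = ccall(:jl_profile_len_data, Csize_t, ())      # samples currently in the buffer
    maxlen  = ccall(:jl_profile_maxlen_data, Csize_t, ())   # buffer capacity set by jl_profile_init
    running = ccall(:jl_profile_is_running, Cint, ()) != 0  # whether a profile timer is armed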
JL_DLLEXPORT int jl_profile_is_buffer_full(void)
     // Declare buffer full if there isn't enough room to sample even just the
     // thread metadata and one max-sized frame. The `+ 6` is for the two block
     // terminator `0`'s plus the 4 metadata entries.
-    return bt_size_cur + ((JL_BT_MAX_ENTRY_SIZE + 1) + 6) > bt_size_max;
+    return profile_bt_size_cur + ((JL_BT_MAX_ENTRY_SIZE + 1) + 6) > profile_bt_size_max;
+}
+
+NOINLINE int failed_to_sample_task_fun(jl_bt_element_t *bt_data, size_t maxsize, int skip) JL_NOTSAFEPOINT;
+NOINLINE int failed_to_stop_thread_fun(jl_bt_element_t *bt_data, size_t maxsize, int skip) JL_NOTSAFEPOINT;
+
+#define PROFILE_TASK_DEBUG_FORCE_SAMPLING_FAILURE (0)
+#define PROFILE_TASK_DEBUG_FORCE_STOP_THREAD_FAILURE (0)
+
+void jl_profile_task(void)
+{
+    if (jl_profile_is_buffer_full()) {
+        // Buffer full: Delete the timer
+        jl_profile_stop_timer();
+        return;
+    }
+
+    uv_mutex_lock(&bt_data_prof_lock);
+    arraylist_t *tasks = jl_get_all_tasks_arraylist();
+    jl_task_t *t = NULL;
+    uint64_t seed = jl_rand();
+    const int n_max_random_attempts = (1 << 2);
+    // randomly select a task that is not done
+    for (int i = 0; i < n_max_random_attempts; i++) {
+        t = (jl_task_t*)tasks->items[cong(tasks->len, &seed)];
+        assert(t == NULL || jl_is_task(t));
+        if (t == NULL) {
+            continue;
+        }
+        int t_state = jl_atomic_load_relaxed(&t->_state);
+        if (t_state == JL_TASK_STATE_DONE) {
+            continue;
+        }
+        break;
+    }
+    arraylist_free(tasks);
+    free(tasks);
+
+    if (profile_running == 0) {
+        uv_mutex_unlock(&bt_data_prof_lock);
+        return;
+    }
+
+    jl_record_backtrace_result_t r = {0, INT16_MAX};
+    jl_bt_element_t *bt_data_prof = (jl_bt_element_t*)(profile_bt_data_prof + profile_bt_size_cur);
+    size_t bt_size_max = profile_bt_size_max - profile_bt_size_cur - 1;
+    if (t == NULL || PROFILE_TASK_DEBUG_FORCE_SAMPLING_FAILURE) {
+        // failed to find a task: record a marker backtrace instead
+        r.bt_size = failed_to_sample_task_fun(bt_data_prof, bt_size_max, 0);
+    }
+    else {
+        if (!PROFILE_TASK_DEBUG_FORCE_STOP_THREAD_FAILURE) {
+            r = jl_record_backtrace(t, bt_data_prof, bt_size_max, 1);
+        }
+        // we failed to get a backtrace (e.g. couldn't stop the thread running the task)
+        if (r.bt_size == 0) {
+            r.bt_size = failed_to_stop_thread_fun(bt_data_prof, bt_size_max, 0);
+        }
+    }
+
+    // update the profile buffer size
+    profile_bt_size_cur += r.bt_size;
+
+    // store threadid but add 1 as 0 is preserved to indicate end of block
+    profile_bt_data_prof[profile_bt_size_cur++].uintptr = (uintptr_t)r.tid + 1;
+
+    // store task id (may be NULL if we failed to sample a task)
+    profile_bt_data_prof[profile_bt_size_cur++].jlvalue = (jl_value_t*)t;
+
+    // store cpu cycle clock
+    profile_bt_data_prof[profile_bt_size_cur++].uintptr = cycleclock();
+
+    // the thread profiler uses this slot to record whether the thread is not sleeping (1) or sleeping (2);
+    // store PROFILE_STATE_WALL_TIME_PROFILING (3) here instead, so that readers can tell this block
+    // came from the task profiler and is not about a thread's sleep state
+    profile_bt_data_prof[profile_bt_size_cur++].uintptr = PROFILE_STATE_WALL_TIME_PROFILING;
+
+    // Mark the end of this block with two 0's
+    profile_bt_data_prof[profile_bt_size_cur++].uintptr = 0;
+    profile_bt_data_prof[profile_bt_size_cur++].uintptr = 0;
+
+    uv_mutex_unlock(&bt_data_prof_lock);
 }
 
 static uint64_t jl_last_sigint_trigger = 0;
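For orientation: each sample written by `jl_profile_task` above occupies one contiguous block in the profile buffer — the backtrace entries, four metadata words, then two `0` terminators. A hedged Julia sketch of scanning raw profile data for task-profiler blocks (`count_task_profile_blocks` is a hypothetical helper; `raw_data` would come from `Profile.retrieve()`):

    # layout per sample: [ips..., threadid+1, taskid, cycleclock, state, 0, 0]
    function count_task_profile_blocks(raw_data::Vector{UInt})
        n = 0
        for i in 6:length(raw_data)
            if raw_data[i] == 0 && raw_data[i-1] == 0 &&  # double-NULL block terminator
               raw_data[i-2] == 3                         # PROFILE_STATE_WALL_TIME_PROFILING
                n += 1
            end
        end
        return n
    end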
diff --git a/src/signals-mach.c b/src/signals-mach.c
index a939e4df71ae0..4e889b4b57fb4 100644
--- a/src/signals-mach.c
+++ b/src/signals-mach.c
@@ -724,6 +724,84 @@ void jl_unlock_stackwalk(int lockret)
     jl_unlock_profile_mach(1, lockret);
 }
 
+// assumes holding `jl_lock_profile_mach`
+void jl_profile_thread_mach(int tid)
+{
+    // if there is no space left, return early
+    if (jl_profile_is_buffer_full()) {
+        jl_profile_stop_timer();
+        return;
+    }
+    if (_dyld_dlopen_atfork_prepare != NULL && _dyld_dlopen_atfork_parent != NULL)
+        _dyld_dlopen_atfork_prepare();
+    if (_dyld_atfork_prepare != NULL && _dyld_atfork_parent != NULL)
+        _dyld_atfork_prepare(); // briefly acquire the dlsym lock
+    host_thread_state_t state;
+    int valid_thread = jl_thread_suspend_and_get_state2(tid, &state);
+    unw_context_t *uc = (unw_context_t*)&state;
+    if (_dyld_atfork_prepare != NULL && _dyld_atfork_parent != NULL)
+        _dyld_atfork_parent(); // quickly release the dlsym lock
+    if (_dyld_dlopen_atfork_prepare != NULL && _dyld_dlopen_atfork_parent != NULL)
+        _dyld_dlopen_atfork_parent();
+    if (!valid_thread)
+        return;
+    if (profile_running) {
+#ifdef LLVMLIBUNWIND
+        /*
+         *  Unfortunately compact unwind info is incorrectly generated for quite a number of
+         *  libraries by quite a large number of compilers. We can fall back to DWARF unwind info
+         *  in some cases, but in quite a number of cases (especially libraries not compiled in debug
+         *  mode, only the compact unwind info may be available). Even more unfortunately, there is no
+         *  way to detect such bogus compact unwind info (other than noticing the resulting segfault).
+         *  What we do here is ugly, but necessary until the compact unwind info situation improves.
+         *  We try to use the compact unwind info and if that results in a segfault, we retry with DWARF info.
+         *  Note that in a small number of cases this may result in bogus stack traces, but at least the topmost
+         *  entry will always be correct, and the number of cases in which this is an issue is rather small.
+         *  Other than that, this implementation is not incorrect as the other thread is paused while we are profiling
+         *  and during stack unwinding we only ever read memory, but never write it.
+         */
+
+        forceDwarf = 0;
+        unw_getcontext(&profiler_uc); // will resume from this point if the next lines segfault at any point
+
+        if (forceDwarf == 0) {
+            // Save the backtrace
+            profile_bt_size_cur += rec_backtrace_ctx((jl_bt_element_t*)profile_bt_data_prof + profile_bt_size_cur, profile_bt_size_max - profile_bt_size_cur - 1, uc, NULL);
+        }
+        else if (forceDwarf == 1) {
+            profile_bt_size_cur += rec_backtrace_ctx_dwarf((jl_bt_element_t*)profile_bt_data_prof + profile_bt_size_cur, profile_bt_size_max - profile_bt_size_cur - 1, uc, NULL);
+        }
+        else if (forceDwarf == -1) {
+            jl_safe_printf("WARNING: profiler attempt to access an invalid memory location\n");
+        }
+
+        forceDwarf = -2;
+#else
+        profile_bt_size_cur += rec_backtrace_ctx((jl_bt_element_t*)profile_bt_data_prof + profile_bt_size_cur, profile_bt_size_max - profile_bt_size_cur - 1, uc, NULL);
+#endif
+        jl_ptls_t ptls = jl_atomic_load_relaxed(&jl_all_tls_states)[tid];
+
+        // store threadid but add 1 as 0 is preserved to indicate end of block
+        profile_bt_data_prof[profile_bt_size_cur++].uintptr = ptls->tid + 1;
+
+        // store task id (never null)
+        profile_bt_data_prof[profile_bt_size_cur++].jlvalue = (jl_value_t*)jl_atomic_load_relaxed(&ptls->current_task);
+
+        // store cpu cycle clock
+        profile_bt_data_prof[profile_bt_size_cur++].uintptr = cycleclock();
+
+        // store whether the thread is sleeping (never encode a state as `0`, since `0` is reserved to indicate end of block)
+        int state = jl_atomic_load_relaxed(&ptls->sleep_check_state) == 0 ? PROFILE_STATE_THREAD_NOT_SLEEPING : PROFILE_STATE_THREAD_SLEEPING;
+        profile_bt_data_prof[profile_bt_size_cur++].uintptr = state;
+
+        // Mark the end of this block with two 0's
+        profile_bt_data_prof[profile_bt_size_cur++].uintptr = 0;
+        profile_bt_data_prof[profile_bt_size_cur++].uintptr = 0;
+    }
+    // We're done! Resume the thread.
+    jl_thread_resume(tid);
+}
+
 void *mach_profile_listener(void *arg)
 {
     (void)arg;
@@ -741,88 +819,21 @@ void *mach_profile_listener(void *arg)
         // sample each thread, round-robin style in reverse order
         // (so that thread zero gets notified last)
         int keymgr_locked = jl_lock_profile_mach(0);
-        int nthreads = jl_atomic_load_acquire(&jl_n_threads);
-        int *randperm = profile_get_randperm(nthreads);
-        for (int idx = nthreads; idx-- > 0; ) {
-            // Stop the threads in the random or reverse round-robin order.
-            int i = randperm[idx];
-            // if there is no space left, break early
-            if (jl_profile_is_buffer_full()) {
-                jl_profile_stop_timer();
-                break;
-            }
-
-            if (_dyld_dlopen_atfork_prepare != NULL && _dyld_dlopen_atfork_parent != NULL)
-                _dyld_dlopen_atfork_prepare();
-            if (_dyld_atfork_prepare != NULL && _dyld_atfork_parent != NULL)
-                _dyld_atfork_prepare(); // briefly acquire the dlsym lock
-            host_thread_state_t state;
-            int valid_thread = jl_thread_suspend_and_get_state2(i, &state);
-            unw_context_t *uc = (unw_context_t*)&state;
-            if (_dyld_atfork_prepare != NULL && _dyld_atfork_parent != NULL)
-                _dyld_atfork_parent(); // quickly release the dlsym lock
-            if (_dyld_dlopen_atfork_prepare != NULL && _dyld_dlopen_atfork_parent != NULL)
-                _dyld_dlopen_atfork_parent();
-            if (!valid_thread)
-                continue;
-            if (running) {
-#ifdef LLVMLIBUNWIND
-                /*
-                 *  Unfortunately compact unwind info is incorrectly generated for quite a number of
-                 *  libraries by quite a large number of compilers. We can fall back to DWARF unwind info
-                 *  in some cases, but in quite a number of cases (especially libraries not compiled in debug
-                 *  mode, only the compact unwind info may be available). Even more unfortunately, there is no
-                 *  way to detect such bogus compact unwind info (other than noticing the resulting segfault).
-                 *  What we do here is ugly, but necessary until the compact unwind info situation improves.
-                 *  We try to use the compact unwind info and if that results in a segfault, we retry with DWARF info.
-                 *  Note that in a small number of cases this may result in bogus stack traces, but at least the topmost
-                 *  entry will always be correct, and the number of cases in which this is an issue is rather small.
-                 *  Other than that, this implementation is not incorrect as the other thread is paused while we are profiling
-                 *  and during stack unwinding we only ever read memory, but never write it.
-                 */
-
-                forceDwarf = 0;
-                unw_getcontext(&profiler_uc); // will resume from this point if the next lines segfault at any point
-
-                if (forceDwarf == 0) {
-                    // Save the backtrace
-                    bt_size_cur += rec_backtrace_ctx((jl_bt_element_t*)bt_data_prof + bt_size_cur, bt_size_max - bt_size_cur - 1, uc, NULL);
-                }
-                else if (forceDwarf == 1) {
-                    bt_size_cur += rec_backtrace_ctx_dwarf((jl_bt_element_t*)bt_data_prof + bt_size_cur, bt_size_max - bt_size_cur - 1, uc, NULL);
-                }
-                else if (forceDwarf == -1) {
-                    jl_safe_printf("WARNING: profiler attempt to access an invalid memory location\n");
-                }
-
-                forceDwarf = -2;
-#else
-                bt_size_cur += rec_backtrace_ctx((jl_bt_element_t*)bt_data_prof + bt_size_cur, bt_size_max - bt_size_cur - 1, uc, NULL);
-#endif
-                jl_ptls_t ptls = jl_atomic_load_relaxed(&jl_all_tls_states)[i];
-
-                // META_OFFSET_THREADID store threadid but add 1 as 0 is preserved to indicate end of block
-                bt_data_prof[bt_size_cur++].uintptr = ptls->tid + 1;
-
-                // META_OFFSET_TASKID store task id (never null)
-                bt_data_prof[bt_size_cur++].jlvalue = (jl_value_t*)jl_atomic_load_relaxed(&ptls->current_task);
-
-                // META_OFFSET_CPUCYCLECLOCK store cpu cycle clock
-                bt_data_prof[bt_size_cur++].uintptr = cycleclock();
-
-                // META_OFFSET_SLEEPSTATE store whether thread is sleeping but add 1 as 0 is preserved to indicate end of block
-                bt_data_prof[bt_size_cur++].uintptr = jl_atomic_load_relaxed(&ptls->sleep_check_state) + 1;
-
-                // Mark the end of this block with two 0's
-                bt_data_prof[bt_size_cur++].uintptr = 0;
-                bt_data_prof[bt_size_cur++].uintptr = 0;
+        if (profile_all_tasks) {
+            // Don't take the stackwalk lock here since it's already taken in `jl_record_backtrace`
+            jl_profile_task();
+        }
+        else {
+            int *randperm = profile_get_randperm(nthreads);
+            for (int idx = nthreads; idx-- > 0; ) {
+                // Stop the threads in random order.
+                int i = randperm[idx];
+                jl_profile_thread_mach(i);
             }
-            // We're done! Resume the thread.
-            jl_thread_resume(i);
         }
         jl_unlock_profile_mach(0, keymgr_locked);
-        if (running) {
+        if (profile_running) {
             jl_check_profile_autostop();
             // Reset the alarm
             kern_return_t ret = clock_alarm(clk, TIME_RELATIVE, timerprof, profile_port);
@@ -831,7 +842,8 @@ void *mach_profile_listener(void *arg)
     }
 }
 
-JL_DLLEXPORT int jl_profile_start_timer(void)
+
+JL_DLLEXPORT int jl_profile_start_timer(uint8_t all_tasks)
 {
     kern_return_t ret;
     if (!profile_started) {
@@ -860,7 +872,8 @@ JL_DLLEXPORT int jl_profile_start_timer(void)
 
     timerprof.tv_sec = nsecprof/GIGA;
     timerprof.tv_nsec = nsecprof%GIGA;
-    running = 1;
+    profile_running = 1;
+    profile_all_tasks = all_tasks;
     // ensure the alarm is running
     ret = clock_alarm(clk, TIME_RELATIVE, timerprof, profile_port);
     HANDLE_MACH_ERROR("clock_alarm", ret);
@@ -870,5 +883,6 @@ JL_DLLEXPORT int jl_profile_start_timer(void)
 
 JL_DLLEXPORT void jl_profile_stop_timer(void)
 {
-    running = 0;
+    profile_running = 0;
+    profile_all_tasks = 0;
 }
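Each platform's `jl_profile_start_timer` now takes the `all_tasks` flag. A hedged sketch of how the Julia side drives it — this mirrors the `Profile.start_timer` wrapper changed further down in this patch:

    status = ccall(:jl_profile_start_timer, Cint, (Bool,), true)  # true => wall-time / all-tasks mode
    status < 0 && error("failed to start profile timer")
    # ... run the workload to be sampled ...
    ccall(:jl_profile_stop_timer, Cvoid, ())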
diff --git a/src/signals-unix.c b/src/signals-unix.c
index caf0e977929c5..c589bf9f932f6 100644
--- a/src/signals-unix.c
+++ b/src/signals-unix.c
@@ -9,6 +9,10 @@
 #include
 #include
 #include
+
+#include "julia.h"
+#include "julia_internal.h"
+
 #if defined(_OS_DARWIN_) && !defined(MAP_ANONYMOUS)
 #define MAP_ANONYMOUS MAP_ANON
 #endif
@@ -646,7 +650,7 @@ int timer_graceperiod_elapsed(void)
 static timer_t timerprof;
 static struct itimerspec itsprof;
 
-JL_DLLEXPORT int jl_profile_start_timer(void)
+JL_DLLEXPORT int jl_profile_start_timer(uint8_t all_tasks)
 {
     struct sigevent sigprof;
 
@@ -655,10 +659,12 @@ JL_DLLEXPORT int jl_profile_start_timer(void)
     sigprof.sigev_notify = SIGEV_SIGNAL;
     sigprof.sigev_signo = SIGUSR1;
     sigprof.sigev_value.sival_ptr = &timerprof;
-    // Because SIGUSR1 is multipurpose, set `running` before so that we know that the first SIGUSR1 came from the timer
-    running = 1;
+    // Because SIGUSR1 is multipurpose, set `profile_running` before so that we know that the first SIGUSR1 came from the timer
+    profile_running = 1;
+    profile_all_tasks = all_tasks;
     if (timer_create(CLOCK_REALTIME, &sigprof, &timerprof) == -1) {
-        running = 0;
+        profile_running = 0;
+        profile_all_tasks = 0;
         return -2;
     }
 
@@ -668,7 +674,8 @@ JL_DLLEXPORT int jl_profile_start_timer(void)
     itsprof.it_value.tv_sec = nsecprof / GIGA;
     itsprof.it_value.tv_nsec = nsecprof % GIGA;
     if (timer_settime(timerprof, 0, &itsprof, NULL) == -1) {
-        running = 0;
+        profile_running = 0;
+        profile_all_tasks = 0;
         return -3;
     }
     return 0;
@@ -676,11 +683,13 @@ JL_DLLEXPORT int jl_profile_start_timer(void)
 
 JL_DLLEXPORT void jl_profile_stop_timer(void)
 {
-    if (running) {
+    uv_mutex_lock(&bt_data_prof_lock);
+    if (profile_running) {
         timer_delete(timerprof);
         last_timer_delete_time = jl_hrtime();
-        running = 0;
+        profile_running = 0;
     }
+    uv_mutex_unlock(&bt_data_prof_lock);
 }
 
 #elif defined(__OpenBSD__)
@@ -797,7 +806,7 @@ void trigger_profile_peek(void)
     jl_safe_printf("\n======================================================================================\n");
     jl_safe_printf("Information request received. A stacktrace will print followed by a %.1f second profile\n", profile_peek_duration);
     jl_safe_printf("======================================================================================\n");
-    if (bt_size_max == 0){
+    if (profile_bt_size_max == 0){
         // If the buffer hasn't been initialized, initialize with default size
         // Keep these values synchronized with Profile.default_init()
         if (jl_profile_init(10000000, 1000000) == -1) {
@@ -805,13 +814,62 @@ void trigger_profile_peek(void)
             return;
         }
     }
-    bt_size_cur = 0; // clear profile buffer
-    if (jl_profile_start_timer() < 0)
+    profile_bt_size_cur = 0; // clear profile buffer
+    if (jl_profile_start_timer(0) < 0)
         jl_safe_printf("ERROR: Could not start profile timer\n");
     else
         profile_autostop_time = jl_hrtime() + (profile_peek_duration * 1e9);
 }
 
+// assumes holding `jl_lock_stackwalk`
+void jl_profile_thread_unix(int tid, bt_context_t *signal_context)
+{
+    if (jl_profile_is_buffer_full()) {
+        // Buffer full: Delete the timer
+        jl_profile_stop_timer();
+        return;
+    }
+    // notify thread to stop
+    if (!jl_thread_suspend_and_get_state(tid, 1, signal_context))
+        return;
+    // unwinding can fail, so keep track of the current state
+    // and restore from the SEGV handler if anything happens.
+    jl_jmp_buf *old_buf = jl_get_safe_restore();
+    jl_jmp_buf buf;
+
+    jl_set_safe_restore(&buf);
+    if (jl_setjmp(buf, 0)) {
+        jl_safe_printf("WARNING: profiler attempt to access an invalid memory location\n");
+    } else {
+        // Get backtrace data
+        profile_bt_size_cur += rec_backtrace_ctx((jl_bt_element_t*)profile_bt_data_prof + profile_bt_size_cur,
+                profile_bt_size_max - profile_bt_size_cur - 1, signal_context, NULL);
+    }
+    jl_set_safe_restore(old_buf);
+
+    jl_ptls_t ptls2 = jl_atomic_load_relaxed(&jl_all_tls_states)[tid];
+
+    // store threadid but add 1 as 0 is preserved to indicate end of block
+    profile_bt_data_prof[profile_bt_size_cur++].uintptr = ptls2->tid + 1;
+
+    // store task id (never null)
+    profile_bt_data_prof[profile_bt_size_cur++].jlvalue = (jl_value_t*)jl_atomic_load_relaxed(&ptls2->current_task);
+
+    // store cpu cycle clock
+    profile_bt_data_prof[profile_bt_size_cur++].uintptr = cycleclock();
+
+    // store whether the thread is sleeping (never encode a state as `0`, since `0` is reserved to indicate end of block)
+    int state = jl_atomic_load_relaxed(&ptls2->sleep_check_state) == 0 ? PROFILE_STATE_THREAD_NOT_SLEEPING : PROFILE_STATE_THREAD_SLEEPING;
+    profile_bt_data_prof[profile_bt_size_cur++].uintptr = state;
+
+    // Mark the end of this block with two 0's
+    profile_bt_data_prof[profile_bt_size_cur++].uintptr = 0;
+    profile_bt_data_prof[profile_bt_size_cur++].uintptr = 0;
+
+    // notify thread to resume
+    jl_thread_resume(tid);
+}
+
 static void *signal_listener(void *arg)
 {
     static jl_bt_element_t bt_data[JL_MAX_BT_SIZE + 1];
@@ -911,13 +969,13 @@ static void *signal_listener(void *arg)
         int doexit = critical;
 #ifdef SIGINFO
         if (sig == SIGINFO) {
-            if (running != 1)
+            if (profile_running != 1)
                 trigger_profile_peek();
             doexit = 0;
         }
 #else
         if (sig == SIGUSR1) {
-            if (running != 1 && timer_graceperiod_elapsed())
+            if (profile_running != 1 && timer_graceperiod_elapsed())
                 trigger_profile_peek();
             doexit = 0;
         }
@@ -951,78 +1009,46 @@ static void *signal_listener(void *arg)
         bt_size = 0;
 #if !defined(JL_DISABLE_LIBUNWIND)
         bt_context_t signal_context;
-        // sample each thread, round-robin style in reverse order
-        // (so that thread zero gets notified last)
-        if (critical || profile) {
+        if (critical) {
             int lockret = jl_lock_stackwalk();
-            int *randperm;
-            if (profile)
-                randperm = profile_get_randperm(nthreads);
-            for (int idx = nthreads; idx-- > 0; ) {
-                // Stop the threads in the random or reverse round-robin order.
-                int i = profile ? randperm[idx] : idx;
+            // sample each thread, round-robin style in reverse order
+            // (so that thread zero gets notified last)
+            for (int i = nthreads; i-- > 0; ) {
                 // notify thread to stop
                 if (!jl_thread_suspend_and_get_state(i, 1, &signal_context))
                     continue;
 
                 // do backtrace on thread contexts for critical signals
                 // this part must be signal-handler safe
-                if (critical) {
-                    bt_size += rec_backtrace_ctx(bt_data + bt_size,
-                            JL_MAX_BT_SIZE / nthreads - 1,
-                            &signal_context, NULL);
-                    bt_data[bt_size++].uintptr = 0;
-                }
-
-                // do backtrace for profiler
-                if (profile && running) {
-                    if (jl_profile_is_buffer_full()) {
-                        // Buffer full: Delete the timer
-                        jl_profile_stop_timer();
-                    }
-                    else {
-                        // unwinding can fail, so keep track of the current state
-                        // and restore from the SEGV handler if anything happens.
-                        jl_jmp_buf *old_buf = jl_get_safe_restore();
-                        jl_jmp_buf buf;
-
-                        jl_set_safe_restore(&buf);
-                        if (jl_setjmp(buf, 0)) {
-                            jl_safe_printf("WARNING: profiler attempt to access an invalid memory location\n");
-                        } else {
-                            // Get backtrace data
-                            bt_size_cur += rec_backtrace_ctx((jl_bt_element_t*)bt_data_prof + bt_size_cur,
-                                    bt_size_max - bt_size_cur - 1, &signal_context, NULL);
-                        }
-                        jl_set_safe_restore(old_buf);
-
-                        jl_ptls_t ptls2 = jl_atomic_load_relaxed(&jl_all_tls_states)[i];
-
-                        // META_OFFSET_THREADID store threadid but add 1 as 0 is preserved to indicate end of block
-                        bt_data_prof[bt_size_cur++].uintptr = ptls2->tid + 1;
-
-                        // META_OFFSET_TASKID store task id (never null)
-                        bt_data_prof[bt_size_cur++].jlvalue = (jl_value_t*)jl_atomic_load_relaxed(&ptls2->current_task);
-
-                        // META_OFFSET_CPUCYCLECLOCK store cpu cycle clock
-                        bt_data_prof[bt_size_cur++].uintptr = cycleclock();
-
-                        // META_OFFSET_SLEEPSTATE store whether thread is sleeping but add 1 as 0 is preserved to indicate end of block
-                        bt_data_prof[bt_size_cur++].uintptr = jl_atomic_load_relaxed(&ptls2->sleep_check_state) + 1;
-
-                        // Mark the end of this block with two 0's
-                        bt_data_prof[bt_size_cur++].uintptr = 0;
-                        bt_data_prof[bt_size_cur++].uintptr = 0;
-                    }
-                }
-
-                // notify thread to resume
+                bt_size += rec_backtrace_ctx(bt_data + bt_size,
+                        JL_MAX_BT_SIZE / nthreads - 1,
+                        &signal_context, NULL);
+                bt_data[bt_size++].uintptr = 0;
                 jl_thread_resume(i);
             }
             jl_unlock_stackwalk(lockret);
         }
+        else if (profile) {
+            if (profile_all_tasks) {
+                // Don't take the stackwalk lock here since it's already taken in `jl_record_backtrace`
+                jl_profile_task();
+            }
+            else {
+                int lockret = jl_lock_stackwalk();
+                int *randperm = profile_get_randperm(nthreads);
+                for (int idx = nthreads; idx-- > 0; ) {
+                    // Stop the threads in random order.
+                    int i = randperm[idx];
+                    // do backtrace for profiler
+                    if (profile && profile_running) {
+                        jl_profile_thread_unix(i, &signal_context);
+                    }
+                }
+                jl_unlock_stackwalk(lockret);
+            }
+        }
 
 #ifndef HAVE_MACH
-        if (profile && running) {
+        if (profile && profile_running) {
             jl_check_profile_autostop();
 #if defined(HAVE_TIMER)
             timer_settime(timerprof, 0, &itsprof, NULL);
diff --git a/src/signals-win.c b/src/signals-win.c
index b5f8dd8bd79d9..2a594bc92b9b7 100644
--- a/src/signals-win.c
+++ b/src/signals-win.c
@@ -404,12 +404,16 @@ static DWORD WINAPI profile_bt( LPVOID lparam )
     while (1) {
         DWORD timeout_ms = nsecprof / (GIGA / 1000);
         Sleep(timeout_ms > 0 ? timeout_ms : 1);
-        if (running) {
+        if (profile_running) {
             if (jl_profile_is_buffer_full()) {
                 jl_profile_stop_timer(); // does not change the thread state
                 SuspendThread(GetCurrentThread());
                 continue;
             }
+            else if (profile_all_tasks) {
+                // Don't take the stackwalk lock here since it's already taken in `jl_record_backtrace`
+                jl_profile_task();
+            }
             else {
                 // TODO: bring this up to parity with other OS by adding loop over tid here
                 int lockret = jl_lock_stackwalk();
@@ -421,26 +425,27 @@ static DWORD WINAPI profile_bt( LPVOID lparam )
                     break;
                 }
                 // Get backtrace data
-                bt_size_cur += rec_backtrace_ctx((jl_bt_element_t*)bt_data_prof + bt_size_cur,
-                        bt_size_max - bt_size_cur - 1, &ctxThread, NULL);
+                profile_bt_size_cur += rec_backtrace_ctx((jl_bt_element_t*)profile_bt_data_prof + profile_bt_size_cur,
+                        profile_bt_size_max - profile_bt_size_cur - 1, &ctxThread, NULL);
 
                 jl_ptls_t ptls = jl_atomic_load_relaxed(&jl_all_tls_states)[0]; // given only profiling hMainThread
 
                 // META_OFFSET_THREADID store threadid but add 1 as 0 is preserved to indicate end of block
-                bt_data_prof[bt_size_cur++].uintptr = ptls->tid + 1;
+                profile_bt_data_prof[profile_bt_size_cur++].uintptr = ptls->tid + 1;
 
                 // META_OFFSET_TASKID store task id (never null)
-                bt_data_prof[bt_size_cur++].jlvalue = (jl_value_t*)jl_atomic_load_relaxed(&ptls->current_task);
+                profile_bt_data_prof[profile_bt_size_cur++].jlvalue = (jl_value_t*)jl_atomic_load_relaxed(&ptls->current_task);
 
                 // META_OFFSET_CPUCYCLECLOCK store cpu cycle clock
-                bt_data_prof[bt_size_cur++].uintptr = cycleclock();
+                profile_bt_data_prof[profile_bt_size_cur++].uintptr = cycleclock();
 
-                // META_OFFSET_SLEEPSTATE store whether thread is sleeping but add 1 as 0 is preserved to indicate end of block
-                bt_data_prof[bt_size_cur++].uintptr = jl_atomic_load_relaxed(&ptls->sleep_check_state) + 1;
+                // store whether the thread is sleeping (never encode a state as `0`, since `0` is reserved to indicate end of block)
+                int state = jl_atomic_load_relaxed(&ptls->sleep_check_state) == 0 ? PROFILE_STATE_THREAD_NOT_SLEEPING : PROFILE_STATE_THREAD_SLEEPING;
+                profile_bt_data_prof[profile_bt_size_cur++].uintptr = state;
 
                 // Mark the end of this block with two 0's
-                bt_data_prof[bt_size_cur++].uintptr = 0;
-                bt_data_prof[bt_size_cur++].uintptr = 0;
+                profile_bt_data_prof[profile_bt_size_cur++].uintptr = 0;
+                profile_bt_data_prof[profile_bt_size_cur++].uintptr = 0;
 
                 jl_unlock_stackwalk(lockret);
                 jl_thread_resume(0);
                 jl_check_profile_autostop();
@@ -455,7 +460,7 @@ static DWORD WINAPI profile_bt( LPVOID lparam )
 
 static volatile TIMECAPS timecaps;
 
-JL_DLLEXPORT int jl_profile_start_timer(void)
+JL_DLLEXPORT int jl_profile_start_timer(uint8_t all_tasks)
 {
     if (hBtThread == NULL) {
@@ -483,20 +488,24 @@ JL_DLLEXPORT int jl_profile_start_timer(void)
             return -2;
         }
     }
-    if (running == 0) {
+    if (profile_running == 0) {
         // Failure to change the timer resolution is not fatal. However, it is important to
         // ensure that the timeBeginPeriod/timeEndPeriod is paired.
         if (TIMERR_NOERROR != timeBeginPeriod(timecaps.wPeriodMin))
             timecaps.wPeriodMin = 0;
     }
-    running = 1; // set `running` finally
+    profile_all_tasks = all_tasks;
+    profile_running = 1; // set `profile_running` finally
     return 0;
 }
 
 JL_DLLEXPORT void jl_profile_stop_timer(void)
 {
-    if (running && timecaps.wPeriodMin)
+    uv_mutex_lock(&bt_data_prof_lock);
+    if (profile_running && timecaps.wPeriodMin)
         timeEndPeriod(timecaps.wPeriodMin);
-    running = 0;
+    profile_running = 0;
+    profile_all_tasks = 0;
+    uv_mutex_unlock(&bt_data_prof_lock);
 }
 
 void jl_install_default_signal_handlers(void)
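When sampling fails, `jl_profile_task` records a backtrace through the marker functions `failed_to_sample_task_fun` / `failed_to_stop_thread_fun` (defined in stackwalk.c below), so failures show up in reports as ordinary frames. A hedged Julia sketch of counting them after a run (assumes the marker frames resolve under those names):

    data, lidict = Profile.retrieve()
    failed = count(frames -> any(fr -> occursin("failed_to", string(fr.func)), frames),
                   values(lidict))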
diff --git a/src/stackwalk.c b/src/stackwalk.c
index 770daa8bf17a6..681707a745af2 100644
--- a/src/stackwalk.c
+++ b/src/stackwalk.c
@@ -208,7 +208,7 @@ NOINLINE size_t rec_backtrace_ctx(jl_bt_element_t *bt_data, size_t maxsize,
 //
 // The first `skip` frames are omitted, in addition to omitting the frame from
 // `rec_backtrace` itself.
-NOINLINE size_t rec_backtrace(jl_bt_element_t *bt_data, size_t maxsize, int skip)
+NOINLINE size_t rec_backtrace(jl_bt_element_t *bt_data, size_t maxsize, int skip, int no_gcstack) JL_NOTSAFEPOINT
 {
     bt_context_t context;
     memset(&context, 0, sizeof(context));
@@ -218,12 +218,22 @@ NOINLINE size_t rec_backtrace(jl_bt_element_t *bt_data, size_t maxsize, int skip
     bt_cursor_t cursor;
     if (!jl_unw_init(&cursor, &context) || maxsize == 0)
         return 0;
-    jl_gcframe_t *pgcstack = jl_pgcstack;
+    jl_gcframe_t *pgcstack = no_gcstack ? NULL : jl_pgcstack;
     size_t bt_size = 0;
     jl_unw_stepn(&cursor, bt_data, &bt_size, NULL, maxsize, skip + 1, &pgcstack, 0);
     return bt_size;
 }
 
+NOINLINE int failed_to_sample_task_fun(jl_bt_element_t *bt_data, size_t maxsize, int skip) JL_NOTSAFEPOINT
+{
+    return rec_backtrace(bt_data, maxsize, skip, 1);
+}
+
+NOINLINE int failed_to_stop_thread_fun(jl_bt_element_t *bt_data, size_t maxsize, int skip) JL_NOTSAFEPOINT
+{
+    return rec_backtrace(bt_data, maxsize, skip, 1);
+}
+
 static jl_value_t *array_ptr_void_type JL_ALWAYS_LEAFTYPE = NULL;
 // Return backtrace information as an svec of (bt1, bt2, [sp])
 //
@@ -1225,24 +1235,34 @@
     return 0;
 #endif
 }
 
-JL_DLLEXPORT size_t jl_record_backtrace(jl_task_t *t, jl_bt_element_t *bt_data, size_t max_bt_size) JL_NOTSAFEPOINT
+JL_DLLEXPORT jl_record_backtrace_result_t jl_record_backtrace(jl_task_t *t, jl_bt_element_t *bt_data,
+        size_t max_bt_size, int all_tasks_profiler) JL_NOTSAFEPOINT
 {
-    jl_task_t *ct = jl_current_task;
-    jl_ptls_t ptls = ct->ptls;
+    int16_t tid = INT16_MAX;
+    jl_record_backtrace_result_t result = {0, tid};
+    jl_task_t *ct = NULL;
+    jl_ptls_t ptls = NULL;
+    if (!all_tasks_profiler) {
+        ct = jl_current_task;
+        ptls = ct->ptls;
+        ptls->bt_size = 0;
+        tid = ptls->tid;
+    }
     if (t == ct) {
-        return rec_backtrace(bt_data, max_bt_size, 0);
+        result.bt_size = rec_backtrace(bt_data, max_bt_size, 0, 0);
+        result.tid = tid;
+        return result;
     }
     bt_context_t *context = NULL;
     bt_context_t c;
     int16_t old;
-    for (old = -1; !jl_atomic_cmpswap(&t->tid, &old, ptls->tid) && old != ptls->tid; old = -1) {
+    for (old = -1; !jl_atomic_cmpswap(&t->tid, &old, tid) && old != tid; old = -1) {
         int lockret = jl_lock_stackwalk();
         // if this task is already running somewhere, we need to stop the thread it is running on and query its state
         if (!jl_thread_suspend_and_get_state(old, 1, &c)) {
             jl_unlock_stackwalk(lockret);
             if (jl_atomic_load_relaxed(&t->tid) != old)
                 continue;
-            return 0;
+            return result;
         }
         jl_unlock_stackwalk(lockret);
         if (jl_atomic_load_relaxed(&t->tid) == old) {
@@ -1277,13 +1297,21 @@ JL_DLLEXPORT size_t jl_record_backtrace(jl_task_t *t, jl_bt_element_t *bt_data,
 #endif
     }
     size_t bt_size = 0;
-    if (context)
-        bt_size = rec_backtrace_ctx(bt_data, max_bt_size, context, t->gcstack);
+    if (context) {
+        if (all_tasks_profiler) {
+            bt_size = rec_backtrace_ctx(bt_data, max_bt_size, context, NULL);
+        }
+        else {
+            bt_size = rec_backtrace_ctx(bt_data, max_bt_size, context, t->gcstack);
+        }
+    }
     if (old == -1)
         jl_atomic_store_relaxed(&t->tid, old);
-    else if (old != ptls->tid)
+    else if (old != tid)
         jl_thread_resume(old);
-    return bt_size;
+    result.bt_size = bt_size;
+    result.tid = old;
+    return result;
 }
 
 //--------------------------------------------------
@@ -1317,7 +1345,8 @@ JL_DLLEXPORT void jlbacktracet(jl_task_t *t) JL_NOTSAFEPOINT
     jl_ptls_t ptls = ct->ptls;
     ptls->bt_size = 0;
     jl_bt_element_t *bt_data = ptls->bt_data;
-    size_t bt_size = jl_record_backtrace(t, bt_data, JL_MAX_BT_SIZE);
+    jl_record_backtrace_result_t r = jl_record_backtrace(t, bt_data, JL_MAX_BT_SIZE, 0);
+    size_t bt_size = r.bt_size;
     size_t i;
     for (i = 0; i < bt_size; i += jl_bt_entry_size(bt_data + i)) {
         jl_print_bt_entry_codeloc(bt_data + i);
@@ -1331,8 +1360,6 @@ JL_DLLEXPORT void jl_print_backtrace(void) JL_NOTSAFEPOINT
     jlbacktrace();
 }
 
-extern int gc_first_tid;
-
 // Print backtraces for all live tasks, for all threads, to jl_safe_printf stderr
 JL_DLLEXPORT void jl_print_task_backtraces(int show_done) JL_NOTSAFEPOINT
 {
diff --git a/src/task.c b/src/task.c
index 5e1172a96a409..7ba2247445a04 100644
--- a/src/task.c
+++ b/src/task.c
@@ -428,7 +428,7 @@ JL_DLLEXPORT void jl_active_task_stack(jl_task_t *task,
 NOINLINE static void record_backtrace(jl_ptls_t ptls, int skip) JL_NOTSAFEPOINT
 {
     // storing bt_size in ptls ensures roots in bt_data will be found
-    ptls->bt_size = rec_backtrace(ptls->bt_data, JL_MAX_BT_SIZE, skip + 1);
+    ptls->bt_size = rec_backtrace(ptls->bt_data, JL_MAX_BT_SIZE, skip + 1, 0);
 }
 
 JL_DLLEXPORT void jl_set_next_task(jl_task_t *task) JL_NOTSAFEPOINT
@@ -1168,6 +1168,40 @@ JL_DLLEXPORT jl_task_t *jl_get_current_task(void)
     return pgcstack == NULL ? NULL : container_of(pgcstack, jl_task_t, gcstack);
 }
 
+extern int gc_first_tid;
+// Builds a list of the live tasks. Racy: `live_tasks` can expand at any time.
+arraylist_t *jl_get_all_tasks_arraylist(void) JL_NOTSAFEPOINT
+{
+    arraylist_t *tasks = (arraylist_t*)malloc_s(sizeof(arraylist_t));
+    arraylist_new(tasks, 0);
+    size_t nthreads = jl_atomic_load_acquire(&jl_n_threads);
+    jl_ptls_t *allstates = jl_atomic_load_relaxed(&jl_all_tls_states);
+    for (size_t i = 0; i < nthreads; i++) {
+        // skip GC threads...
+        if (gc_first_tid <= i && i < gc_first_tid + jl_n_gcthreads) {
+            continue;
+        }
+        jl_ptls_t ptls2 = allstates[i];
+        if (ptls2 == NULL) {
+            continue;
+        }
+        jl_task_t *t = ptls2->root_task;
+        if (t->ctx.stkbuf != NULL) {
+            arraylist_push(tasks, t);
+        }
+        small_arraylist_t *live_tasks = &ptls2->gc_tls_common.heap.live_tasks;
+        size_t n = mtarraylist_length(live_tasks);
+        for (size_t i = 0; i < n; i++) {
+            jl_task_t *t = (jl_task_t*)mtarraylist_get(live_tasks, i);
+            assert(t != NULL);
+            if (t->ctx.stkbuf != NULL) {
+                arraylist_push(tasks, t);
+            }
+        }
+    }
+    return tasks;
+}
+
 // Do one-time initializations for task system
 void jl_init_tasks(void) JL_GC_DISABLED
 {
diff --git a/stdlib/Profile/src/Profile.jl b/stdlib/Profile/src/Profile.jl
index b753c9ca88f24..adbdf1c53da53 100644
--- a/stdlib/Profile/src/Profile.jl
+++ b/stdlib/Profile/src/Profile.jl
@@ -63,6 +63,30 @@ macro profile(ex)
     end
 end
 
+export @profile_walltime
+
+"""
+    @profile_walltime
+
+`@profile_walltime <expression>` runs your expression while taking periodic backtraces of a sample
+of all live tasks (both running and not running). These are appended to an internal buffer of backtraces.
+
+It can be configured via `Profile.init`, the same as `Profile.@profile`. Note that you cannot use
+`@profile` simultaneously with `@profile_walltime`.
+
+Since this tool samples not only running tasks, but also sleeping tasks and tasks performing IO,
+it can be used to diagnose performance issues such as lock contention, IO bottlenecks, and other
+issues that are not visible in the CPU profile.
+"""
+macro profile_walltime(ex)
+    return quote
+        try
+            start_timer(true)
+            $(esc(ex))
+        finally
+            stop_timer()
+        end
+    end
+end
+
 # An internal function called to show the report after an information request (SIGINFO or SIGUSR1).
 function _peek_report()
     iob = Base.AnnotatedIOBuffer()
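A hedged usage sketch of the new macro (the numbers are illustrative; `Profile.init` takes the same keywords as for `@profile`):

    using Profile

    Profile.init(n = 10^7, delay = 0.01)  # optional: bigger buffer, 10 ms sampling period
    @profile_walltime begin
        sum(rand(10^6))  # running work...
        sleep(0.5)       # ...and off-CPU time both get sampled
    end
    Profile.print()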
@@ -403,9 +427,10 @@ end
 
 function has_meta(data)
     for i in 6:length(data)
-        data[i] == 0 || continue            # first block end null
-        data[i - 1] == 0 || continue        # second block end null
-        data[i - META_OFFSET_SLEEPSTATE] in 1:2 || continue
+        data[i] == 0 || continue            # first block end null
+        data[i - 1] == 0 || continue        # second block end null
+        data[i - META_OFFSET_SLEEPSTATE] in 1:3 || continue      # 1 for not sleeping, 2 for sleeping, 3 for task profiler fake state
+        # See definition in `src/julia_internal.h`
         data[i - META_OFFSET_CPUCYCLECLOCK] != 0 || continue
         data[i - META_OFFSET_TASKID] != 0 || continue
         data[i - META_OFFSET_THREADID] != 0 || continue
@@ -608,9 +633,9 @@ Julia, and examine the resulting `*.mem` files.
 """
 clear_malloc_data() = ccall(:jl_clear_malloc_data, Cvoid, ())
 
 # C wrappers
-function start_timer()
+function start_timer(all_tasks::Bool=false)
     check_init() # if the profile buffer hasn't been initialized, initialize with default size
-    status = ccall(:jl_profile_start_timer, Cint, ())
+    status = ccall(:jl_profile_start_timer, Cint, (Bool,), all_tasks)
     if status < 0
         error(error_codes[status])
     end
@@ -722,12 +747,16 @@ function parse_flat(::Type{T}, data::Vector{UInt64}, lidict::Union{LineInfoDict,
     startframe = length(data)
     skip = false
     nsleeping = 0
+    is_task_profile = false
     for i in startframe:-1:1
         (startframe - 1) >= i >= (startframe - (nmeta + 1)) && continue # skip metadata (its read ahead below) and extra block end NULL IP
         ip = data[i]
         if is_block_end(data, i)
             # read metadata
-            thread_sleeping = data[i - META_OFFSET_SLEEPSTATE] - 1 # subtract 1 as state is incremented to avoid being equal to 0
+            thread_sleeping_state = data[i - META_OFFSET_SLEEPSTATE] - 1 # subtract 1 as state is incremented to avoid being equal to 0
+            if thread_sleeping_state == 2
+                is_task_profile = true
+            end
             # cpu_cycle_clock = data[i - META_OFFSET_CPUCYCLECLOCK]
             taskid = data[i - META_OFFSET_TASKID]
             threadid = data[i - META_OFFSET_THREADID]
@@ -735,7 +764,7 @@ function parse_flat(::Type{T}, data::Vector{UInt64}, lidict::Union{LineInfoDict,
                 skip = true
                 continue
             end
-            if thread_sleeping == 1
+            if thread_sleeping_state == 1
                 nsleeping += 1
             end
             skip = false
@@ -769,14 +798,14 @@ function parse_flat(::Type{T}, data::Vector{UInt64}, lidict::Union{LineInfoDict,
         end
     end
     @assert length(lilist) == length(n) == length(m) == length(lilist_idx)
-    return (lilist, n, m, totalshots, nsleeping)
+    return (lilist, n, m, totalshots, nsleeping, is_task_profile)
 end
 
 const FileNameMap = Dict{Symbol,Tuple{String,String,String}}
 
 function flat(io::IO, data::Vector{UInt64}, lidict::Union{LineInfoDict, LineInfoFlatDict}, cols::Int, fmt::ProfileFormat,
                 threads::Union{Int,AbstractVector{Int}}, tasks::Union{UInt,AbstractVector{UInt}}, is_subsection::Bool)
-    lilist, n, m, totalshots, nsleeping = parse_flat(fmt.combine ? StackFrame : UInt64, data, lidict, fmt.C, threads, tasks)
+    lilist, n, m, totalshots, nsleeping, is_task_profile = parse_flat(fmt.combine ? StackFrame : UInt64, data, lidict, fmt.C, threads, tasks)
     if false # optional: drop the "non-interpretable" ones
         keep = map(frame -> frame != UNKNOWN && frame.line != 0, lilist)
         lilist = lilist[keep]
@@ -796,11 +825,15 @@ function flat(io::IO, data::Vector{UInt64}, lidict::Union{LineInfoDict, LineInfo
         return true
     end
     is_subsection || print_flat(io, lilist, n, m, cols, filenamemap, fmt)
-    Base.print(io, "Total snapshots: ", totalshots, ". Utilization: ", round(Int, util_perc), "%")
+    if is_task_profile
+        Base.print(io, "Total snapshots: ", totalshots, "\n")
+    else
+        Base.print(io, "Total snapshots: ", totalshots, ". Utilization: ", round(Int, util_perc), "%")
+    end
     if is_subsection
         println(io)
         print_flat(io, lilist, n, m, cols, filenamemap, fmt)
-    else
+    elseif !is_task_profile
         Base.print(io, " across all threads and tasks. Use the `groupby` kwarg to break down by thread and/or task.\n")
     end
     return false
@@ -1034,12 +1067,16 @@ function tree!(root::StackFrameTree{T}, all::Vector{UInt64}, lidict::Union{LineI
     startframe = length(all)
     skip = false
     nsleeping = 0
+    is_task_profile = false
    for i in startframe:-1:1
         (startframe - 1) >= i >= (startframe - (nmeta + 1)) && continue # skip metadata (it's read ahead below) and extra block end NULL IP
         ip = all[i]
         if is_block_end(all, i)
             # read metadata
-            thread_sleeping = all[i - META_OFFSET_SLEEPSTATE] - 1 # subtract 1 as state is incremented to avoid being equal to 0
+            thread_sleeping_state = all[i - META_OFFSET_SLEEPSTATE] - 1 # subtract 1 as state is incremented to avoid being equal to 0
+            if thread_sleeping_state == 2
+                is_task_profile = true
+            end
             # cpu_cycle_clock = all[i - META_OFFSET_CPUCYCLECLOCK]
             taskid = all[i - META_OFFSET_TASKID]
             threadid = all[i - META_OFFSET_THREADID]
@@ -1048,7 +1085,7 @@ function tree!(root::StackFrameTree{T}, all::Vector{UInt64}, lidict::Union{LineI
                 skip = true
                 continue
             end
-            if thread_sleeping == 1
+            if thread_sleeping_state == 1
                 nsleeping += 1
             end
             skip = false
@@ -1154,7 +1191,7 @@ function tree!(root::StackFrameTree{T}, all::Vector{UInt64}, lidict::Union{LineI
         nothing
     end
     cleanup!(root)
-    return root, nsleeping
+    return root, nsleeping, is_task_profile
 end
 
 function maxstats(root::StackFrameTree)
@@ -1223,9 +1260,9 @@ end
 function tree(io::IO, data::Vector{UInt64}, lidict::Union{LineInfoFlatDict, LineInfoDict}, cols::Int, fmt::ProfileFormat,
                 threads::Union{Int,AbstractVector{Int}}, tasks::Union{UInt,AbstractVector{UInt}}, is_subsection::Bool)
     if fmt.combine
-        root, nsleeping = tree!(StackFrameTree{StackFrame}(), data, lidict, fmt.C, fmt.recur, threads, tasks)
+        root, nsleeping, is_task_profile = tree!(StackFrameTree{StackFrame}(), data, lidict, fmt.C, fmt.recur, threads, tasks)
     else
-        root, nsleeping = tree!(StackFrameTree{UInt64}(), data, lidict, fmt.C, fmt.recur, threads, tasks)
+        root, nsleeping, is_task_profile = tree!(StackFrameTree{UInt64}(), data, lidict, fmt.C, fmt.recur, threads, tasks)
     end
     util_perc = (1 - (nsleeping / root.count)) * 100
     is_subsection || print_tree(io, root, cols, fmt, is_subsection)
@@ -1239,11 +1276,15 @@ function tree(io::IO, data::Vector{UInt64}, lidict::Union{LineInfoFlatDict, Line
         end
         return true
     end
-    Base.print(io, "Total snapshots: ", root.count, ". Utilization: ", round(Int, util_perc), "%")
+    if is_task_profile
+        Base.print(io, "Total snapshots: ", root.count, "\n")
+    else
+        Base.print(io, "Total snapshots: ", root.count, ". Utilization: ", round(Int, util_perc), "%")
+    end
     if is_subsection
         Base.println(io)
         print_tree(io, root, cols, fmt, is_subsection)
-    else
+    elseif !is_task_profile
         Base.print(io, " across all threads and tasks. Use the `groupby` kwarg to break down by thread and/or task.\n")
     end
     return false
 end
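Since task-profile reports skip the utilization line, the per-thread/per-task breakdowns remain most useful for thread profiles. A hedged sketch of the `groupby` kwarg referenced above:

    Profile.print(groupby = :thread)            # per-thread breakdown
    Profile.print(groupby = [:task, :thread])   # nested breakdown, tasks then threads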
diff --git a/stdlib/Profile/test/runtests.jl b/stdlib/Profile/test/runtests.jl
index 1769cbd12da3e..352d07086f25b 100644
--- a/stdlib/Profile/test/runtests.jl
+++ b/stdlib/Profile/test/runtests.jl
@@ -25,19 +25,64 @@ end
     end
 end
 
-busywait(0, 0) # compile
-@profile busywait(1, 20)
+@noinline function sleeping_tasks(ch::Channel)
+    for _ in 1:100
+        Threads.@spawn take!(ch)
+    end
+    sleep(10)
+end
 
-let r = Profile.retrieve()
-    mktemp() do path, io
-        serialize(io, r)
-        close(io)
-        open(path) do io
-            @test isa(deserialize(io), Tuple{Vector{UInt},Dict{UInt64,Vector{Base.StackTraces.StackFrame}}})
+function test_profile()
+    let r = Profile.retrieve()
+        mktemp() do path, io
+            serialize(io, r)
+            close(io)
+            open(path) do io
+                @test isa(deserialize(io), Tuple{Vector{UInt},Dict{UInt64,Vector{Base.StackTraces.StackFrame}}})
+            end
+        end
+    end
+end
+
+function test_has_task_profiler_sample_in_buffer()
+    let r = Profile.retrieve()
+        mktemp() do path, io
+            serialize(io, r)
+            close(io)
+            open(path) do io
+                all = deserialize(io)
+                data = all[1]
+                startframe = length(data)
+                for i in startframe:-1:1
+                    (startframe - 1) >= i >= (startframe - (Profile.nmeta + 1)) && continue # skip metadata (it's read ahead below) and extra block end NULL IP
+                    if Profile.is_block_end(data, i)
+                        thread_sleeping_state = data[i - Profile.META_OFFSET_SLEEPSTATE]
+                        @test thread_sleeping_state == 0x3
+                    end
+                end
+            end
         end
     end
 end
 
+busywait(0, 0) # compile
+
+@profile_walltime busywait(1, 20)
+test_profile()
+
+Profile.clear()
+
+ch = Channel(1)
+@profile_walltime sleeping_tasks(ch)
+test_profile()
+close(ch)
+test_has_task_profiler_sample_in_buffer()
+
+Profile.clear()
+
+@profile busywait(1, 20)
+test_profile()
+
 # test printing options
 for options in ((format=:tree, C=true),
                 (format=:tree, maxdepth=2),