diff --git a/runtime/scheduler.c b/runtime/scheduler.c
index 2ed42bab..0d721cb1 100644
--- a/runtime/scheduler.c
+++ b/runtime/scheduler.c
@@ -1435,167 +1435,171 @@ void worker_scheduler(__cilkrts_worker *w) {
     __cilkrts_worker **workers = rts->workers;
     ReadyDeque *deques = rts->deques;
 
-    while (!atomic_load_explicit(&rts->done, memory_order_acquire)) {
-        /* A worker entering the steal loop must have saved its reducer map into
-           the frame to which it belongs. */
-        CILK_ASSERT(!w->hyper_table ||
-                    (is_boss && atomic_load_explicit(
-                                    &rts->done, memory_order_acquire)));
-
-        CILK_STOP_TIMING(w, INTERVAL_SCHED);
-
-        while (!t && !atomic_load_explicit(&rts->done, memory_order_acquire)) {
-            CILK_START_TIMING(w, INTERVAL_SCHED);
-            CILK_START_TIMING(w, INTERVAL_IDLE);
+    while (!is_boss && !rts->terminate) {
+        while (!atomic_load_explicit(&rts->done, memory_order_acquire)) {
+            /* A worker entering the steal loop must have saved its reducer map into
+               the frame to which it belongs. */
+            CILK_ASSERT(!w->hyper_table ||
+                        (is_boss && atomic_load_explicit(
+                                        &rts->done, memory_order_acquire)));
+
+            CILK_STOP_TIMING(w, INTERVAL_SCHED);
+
+            while (!t && !atomic_load_explicit(&rts->done, memory_order_acquire)) {
+                CILK_START_TIMING(w, INTERVAL_SCHED);
+                CILK_START_TIMING(w, INTERVAL_IDLE);
 #if ENABLE_THIEF_SLEEP
-            // Get the set of workers we can steal from and a local copy of the
-            // index-to-worker map. We'll attempt a few steals using these
-            // local copies to minimize memory traffic.
-            uint64_t disengaged_sentinel = atomic_load_explicit(
-                &rts->disengaged_sentinel, memory_order_relaxed);
-            uint32_t disengaged = GET_DISENGAGED(disengaged_sentinel);
-            uint32_t stealable = nworkers - disengaged;
-            __attribute__((unused))
-            uint32_t sentinel = recent_sentinel_count / SENTINEL_COUNT_HISTORY;
-
-            if (__builtin_expect(stealable == 1, false))
-                // If this worker detects only 1 stealable worker, then its the
-                // only worker in the work-stealing loop.
-                continue;
+                // Get the set of workers we can steal from and a local copy of the
+                // index-to-worker map. We'll attempt a few steals using these
+                // local copies to minimize memory traffic.
+                uint64_t disengaged_sentinel = atomic_load_explicit(
+                    &rts->disengaged_sentinel, memory_order_relaxed);
+                uint32_t disengaged = GET_DISENGAGED(disengaged_sentinel);
+                uint32_t stealable = nworkers - disengaged;
+                __attribute__((unused))
+                uint32_t sentinel = recent_sentinel_count / SENTINEL_COUNT_HISTORY;
+
+                if (__builtin_expect(stealable == 1, false))
+                    // If this worker detects only 1 stealable worker, then its the
+                    // only worker in the work-stealing loop.
+                    continue;
 #else // ENABLE_THIEF_SLEEP
-            uint32_t stealable = nworkers;
-            __attribute__((unused))
-            uint32_t sentinel = nworkers / 2;
+                uint32_t stealable = nworkers;
+                __attribute__((unused))
+                uint32_t sentinel = nworkers / 2;
 #endif // ENABLE_THIEF_SLEEP
 #ifndef __APPLE__
-            uint32_t lg_sentinel = sentinel == 0 ? 1
-                                                 : (8 * sizeof(sentinel)) -
-                                                       __builtin_clz(sentinel);
-            uint32_t sentinel_div_lg_sentinel =
-                sentinel == 0 ? 1
-                              : (sentinel >> (8 * sizeof(lg_sentinel) -
-                                              __builtin_clz(lg_sentinel)));
+                uint32_t lg_sentinel = sentinel == 0 ? 1
+                                                     : (8 * sizeof(sentinel)) -
+                                                           __builtin_clz(sentinel);
+                uint32_t sentinel_div_lg_sentinel =
+                    sentinel == 0 ? 1
+                                  : (sentinel >> (8 * sizeof(lg_sentinel) -
+                                                  __builtin_clz(lg_sentinel)));
 #endif
-            const unsigned int NAP_THRESHOLD = SENTINEL_THRESHOLD * 64;
+                const unsigned int NAP_THRESHOLD = SENTINEL_THRESHOLD * 64;
 #if !defined(__aarch64__) && !defined(__APPLE__)
-            uint64_t start = __builtin_readcyclecounter();
+                uint64_t start = __builtin_readcyclecounter();
 #endif // !defined(__aarch64__) && !defined(__APPLE__)
-            int attempt = ATTEMPTS;
-            do {
-                // Choose a random victim not equal to self.
-                worker_id victim =
-                    index_to_worker[get_rand(rand_state) % stealable];
-                rand_state = update_rand_state(rand_state);
-                while (victim == self) {
-                    victim = index_to_worker[get_rand(rand_state) % stealable];
+                int attempt = ATTEMPTS;
+                do {
+                    // Choose a random victim not equal to self.
+                    worker_id victim =
+                        index_to_worker[get_rand(rand_state) % stealable];
                     rand_state = update_rand_state(rand_state);
-                }
-                // Attempt to steal from that victim.
-                t = Closure_steal(workers, deques, w, self, victim);
-                if (!t) {
-                    // Pause inside this busy loop.
-                    busy_loop_pause();
-                }
-            } while (!t && --attempt > 0);
+                    while (victim == self) {
+                        victim = index_to_worker[get_rand(rand_state) % stealable];
+                        rand_state = update_rand_state(rand_state);
+                    }
+                    // Attempt to steal from that victim.
+                    t = Closure_steal(workers, deques, w, self, victim);
+                    if (!t) {
+                        // Pause inside this busy loop.
+                        busy_loop_pause();
+                    }
+                } while (!t && --attempt > 0);
 #if SCHED_STATS
-            if (t) { // steal successful
-                WHEN_SCHED_STATS(w->l->stats.steals++);
-                CILK_STOP_TIMING(w, INTERVAL_SCHED);
-                CILK_DROP_TIMING(w, INTERVAL_IDLE);
-            } else { // steal unsuccessful
-                CILK_STOP_TIMING(w, INTERVAL_IDLE);
-                CILK_DROP_TIMING(w, INTERVAL_SCHED);
-            }
+                if (t) { // steal successful
+                    WHEN_SCHED_STATS(w->l->stats.steals++);
+                    CILK_STOP_TIMING(w, INTERVAL_SCHED);
+                    CILK_DROP_TIMING(w, INTERVAL_IDLE);
+                } else { // steal unsuccessful
+                    CILK_STOP_TIMING(w, INTERVAL_IDLE);
+                    CILK_DROP_TIMING(w, INTERVAL_SCHED);
+                }
 #endif
-            fails = go_to_sleep_maybe(
-                rts, self, nworkers, NAP_THRESHOLD, w, t, fails,
-                &sample_threshold, &inefficient_history, &efficient_history,
-                sentinel_count_history, &sentinel_count_history_tail,
-                &recent_sentinel_count);
-
-            if (!t) {
-                // Add some delay to the time a worker takes between steal
-                // attempts. On a variety of systems, this delay seems to
-                // improve parallel performance of Cilk computations where
-                // workers spend a signficant amount of time stealing.
-                //
-                // The computation for the delay is heuristic, based on the
-                // following:
-                // - Incorporate some delay for each steal attempt.
-                // - Increase the delay for workers who fail a lot of steal
-                //   attempts, and allow successful thieves to steal more
-                //   frequently.
-                // - Increase the delay based on the number of thieves failing
-                //   lots of steal attempts. In this case, we use the number S
-                //   of sentinels and increase the delay by approximately S/lg
-                //   S, which seems to work better than a linear increase in
-                //   practice.
+                fails = go_to_sleep_maybe(
+                    rts, self, nworkers, NAP_THRESHOLD, w, t, fails,
+                    &sample_threshold, &inefficient_history, &efficient_history,
+                    sentinel_count_history, &sentinel_count_history_tail,
+                    &recent_sentinel_count);
+
+                if (!t) {
+                    // Add some delay to the time a worker takes between steal
+                    // attempts. On a variety of systems, this delay seems to
+                    // improve parallel performance of Cilk computations where
+                    // workers spend a signficant amount of time stealing.
+                    //
+                    // The computation for the delay is heuristic, based on the
+                    // following:
+                    // - Incorporate some delay for each steal attempt.
+                    // - Increase the delay for workers who fail a lot of steal
+                    //   attempts, and allow successful thieves to steal more
+                    //   frequently.
+                    // - Increase the delay based on the number of thieves failing
+                    //   lots of steal attempts. In this case, we use the number S
+                    //   of sentinels and increase the delay by approximately S/lg
+                    //   S, which seems to work better than a linear increase in
+                    //   practice.
 #ifndef __APPLE__
 #ifndef __aarch64__
-                uint64_t stop = 450 * ATTEMPTS;
-                if (fails > stealable)
-                    stop += 650 * ATTEMPTS;
-                stop *= sentinel_div_lg_sentinel;
-                // On x86-64, the latency of a pause instruction varies between
-                // microarchitectures. We use the cycle counter to delay by a
-                // certain amount of time, regardless of the latency of pause.
-                while ((__builtin_readcyclecounter() - start) < stop) {
-                    busy_pause();
-                }
+                    uint64_t stop = 450 * ATTEMPTS;
+                    if (fails > stealable)
+                        stop += 650 * ATTEMPTS;
+                    stop *= sentinel_div_lg_sentinel;
+                    // On x86-64, the latency of a pause instruction varies between
+                    // microarchitectures. We use the cycle counter to delay by a
+                    // certain amount of time, regardless of the latency of pause.
+                    while ((__builtin_readcyclecounter() - start) < stop) {
+                        busy_pause();
+                    }
 #else
-                int pause_count = 200 * ATTEMPTS;
-                if (fails > stealable)
-                    pause_count += 50 * ATTEMPTS;
-                pause_count *= sentinel_div_lg_sentinel;
-                // On arm64, we can't necessarily read the cycle counter without
-                // a kernel patch. Instead, we just perform some number of
-                // pause instructions.
-                for (int i = 0; i < pause_count; ++i)
-                    busy_pause();
+                    int pause_count = 200 * ATTEMPTS;
+                    if (fails > stealable)
+                        pause_count += 50 * ATTEMPTS;
+                    pause_count *= sentinel_div_lg_sentinel;
+                    // On arm64, we can't necessarily read the cycle counter without
+                    // a kernel patch. Instead, we just perform some number of
+                    // pause instructions.
+                    for (int i = 0; i < pause_count; ++i)
+                        busy_pause();
 #endif // __aarch64__
 #endif // __APPLE__
+                }
             }
-        }
-        CILK_START_TIMING(w, INTERVAL_SCHED);
-        // If one Cilkified region stops and another one starts, then a worker
-        // can reach this point with t == NULL and w->g->done == false. Check
-        // that t is not NULL before calling do_what_it_says.
-        if (t) {
+            CILK_START_TIMING(w, INTERVAL_SCHED);
+            // If one Cilkified region stops and another one starts, then a worker
+            // can reach this point with t == NULL and w->g->done == false. Check
+            // that t is not NULL before calling do_what_it_says.
+            if (t) {
 #if ENABLE_THIEF_SLEEP
-            const unsigned int MIN_FAILS = 2 * ATTEMPTS;
-            uint64_t start, end;
-            // Executing do_what_it_says involves some minimum amount of work,
-            // which can be used to amortize the cost of some failed steal
-            // attempts. Therefore, avoid measuring the elapsed cycles if we
-            // haven't failed many steal attempts.
-            if (fails > MIN_FAILS) {
-                start = gettime_fast();
-            }
+                const unsigned int MIN_FAILS = 2 * ATTEMPTS;
+                uint64_t start, end;
+                // Executing do_what_it_says involves some minimum amount of work,
+                // which can be used to amortize the cost of some failed steal
+                // attempts. Therefore, avoid measuring the elapsed cycles if we
+                // haven't failed many steal attempts.
+                if (fails > MIN_FAILS) {
+                    start = gettime_fast();
+                }
 #endif // ENABLE_THIEF_SLEEP
-            do_what_it_says(deques, w, self, t);
+                do_what_it_says(deques, w, self, t);
 #if ENABLE_THIEF_SLEEP
-            if (fails > MIN_FAILS) {
-                end = gettime_fast();
-                uint64_t elapsed = end - start;
-                // Decrement the count of failed steal attempts based on the
-                // amount of work done.
-                fails = decrease_fails_by_work(rts, fails, elapsed,
-                                               &sample_threshold);
-                if (fails < SENTINEL_THRESHOLD) {
-                    inefficient_history = 0;
-                    efficient_history = 0;
+                if (fails > MIN_FAILS) {
+                    end = gettime_fast();
+                    uint64_t elapsed = end - start;
+                    // Decrement the count of failed steal attempts based on the
+                    // amount of work done.
+                    fails = decrease_fails_by_work(rts, fails, elapsed,
+                                                   &sample_threshold);
+                    if (fails < SENTINEL_THRESHOLD) {
+                        inefficient_history = 0;
+                        efficient_history = 0;
+                    }
+                } else {
+                    fails = 0;
+                    sample_threshold = SENTINEL_THRESHOLD;
                 }
-            } else {
-                fails = 0;
-                sample_threshold = SENTINEL_THRESHOLD;
-            }
 #endif // ENABLE_THIEF_SLEEP
-            t = NULL;
-        } else if (!is_boss &&
+                t = NULL;
+            }
+        }
+
+        if (!is_boss &&
                    atomic_load_explicit(&rts->done, memory_order_relaxed)) {
             // If it appears the computation is done, busy-wait for a while
             // before exiting the work-stealing loop, in case another cilkified