Commit

ugly version
kyle-singer committed Jan 22, 2025
1 parent 07518a4 commit d30139c
Showing 1 changed file with 140 additions and 136 deletions.
runtime/scheduler.c
@@ -1435,167 +1435,171 @@ void worker_scheduler(__cilkrts_worker *w) {
    __cilkrts_worker **workers = rts->workers;
    ReadyDeque *deques = rts->deques;

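    // Workers other than the boss stay in this scheduling loop across
    // cilkified regions: the inner loop runs until the current region is
    // done, while the outer loop exits only once rts->terminate is set.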
    while (!is_boss && !rts->terminate) {
        while (!atomic_load_explicit(&rts->done, memory_order_acquire)) {
            /* A worker entering the steal loop must have saved its reducer
               map into the frame to which it belongs. */
            CILK_ASSERT(!w->hyper_table ||
                        (is_boss && atomic_load_explicit(
                                        &rts->done, memory_order_acquire)));

            CILK_STOP_TIMING(w, INTERVAL_SCHED);

            while (!t &&
                   !atomic_load_explicit(&rts->done, memory_order_acquire)) {
                CILK_START_TIMING(w, INTERVAL_SCHED);
                CILK_START_TIMING(w, INTERVAL_IDLE);
#if ENABLE_THIEF_SLEEP
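                // rts->disengaged_sentinel packs the count of disengaged
                // (sleeping) workers together with the sentinel count in one
                // 64-bit word, so both can be read in a single atomic load.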
                // Get the set of workers we can steal from and a local copy
                // of the index-to-worker map. We'll attempt a few steals
                // using these local copies to minimize memory traffic.
                uint64_t disengaged_sentinel = atomic_load_explicit(
                    &rts->disengaged_sentinel, memory_order_relaxed);
                uint32_t disengaged = GET_DISENGAGED(disengaged_sentinel);
                uint32_t stealable = nworkers - disengaged;
                __attribute__((unused))
                uint32_t sentinel = recent_sentinel_count / SENTINEL_COUNT_HISTORY;
                if (__builtin_expect(stealable == 1, false))
                    // If this worker detects only 1 stealable worker, then
                    // it's the only worker in the work-stealing loop.
                    continue;

#else // ENABLE_THIEF_SLEEP
                uint32_t stealable = nworkers;
                __attribute__((unused))
                uint32_t sentinel = nworkers / 2;
#endif // ENABLE_THIEF_SLEEP
#ifndef __APPLE__
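                // Approximate S / lg S, where S is the recent sentinel count:
                // (8 * sizeof(x)) - __builtin_clz(x) is the bit length of x,
                // so the shift below divides sentinel by the smallest power
                // of two greater than lg_sentinel.  For example, sentinel ==
                // 16 gives lg_sentinel == 5 and sentinel_div_lg_sentinel ==
                // 16 >> 3 == 2.  This factor scales the delay between steal
                // attempts computed further below.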
                uint32_t lg_sentinel =
                    sentinel == 0
                        ? 1
                        : (8 * sizeof(sentinel)) - __builtin_clz(sentinel);
                uint32_t sentinel_div_lg_sentinel =
                    sentinel == 0
                        ? 1
                        : (sentinel >> (8 * sizeof(lg_sentinel) -
                                        __builtin_clz(lg_sentinel)));
#endif
                const unsigned int NAP_THRESHOLD = SENTINEL_THRESHOLD * 64;

#if !defined(__aarch64__) && !defined(__APPLE__)
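                // Record the current cycle count; the x86-64 delay loop below
                // spins until enough cycles have elapsed since this point.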
                uint64_t start = __builtin_readcyclecounter();
#endif // !defined(__aarch64__) && !defined(__APPLE__)
                int attempt = ATTEMPTS;
                do {
                    // Choose a random victim not equal to self.
                    worker_id victim =
                        index_to_worker[get_rand(rand_state) % stealable];
                    rand_state = update_rand_state(rand_state);
                    while (victim == self) {
                        victim =
                            index_to_worker[get_rand(rand_state) % stealable];
                        rand_state = update_rand_state(rand_state);
                    }
                    // Attempt to steal from that victim.
                    t = Closure_steal(workers, deques, w, self, victim);
                    if (!t) {
                        // Pause inside this busy loop.
                        busy_loop_pause();
                    }
                } while (!t && --attempt > 0);

#if SCHED_STATS
                if (t) { // steal successful
                    WHEN_SCHED_STATS(w->l->stats.steals++);
                    CILK_STOP_TIMING(w, INTERVAL_SCHED);
                    CILK_DROP_TIMING(w, INTERVAL_IDLE);
                } else { // steal unsuccessful
                    CILK_STOP_TIMING(w, INTERVAL_IDLE);
                    CILK_DROP_TIMING(w, INTERVAL_SCHED);
                }
#endif

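                // Based on the recent history of failed and successful
                // steals, go_to_sleep_maybe may put this worker to sleep for
                // a while; it returns the updated failed-steal count.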
                fails = go_to_sleep_maybe(
                    rts, self, nworkers, NAP_THRESHOLD, w, t, fails,
                    &sample_threshold, &inefficient_history, &efficient_history,
                    sentinel_count_history, &sentinel_count_history_tail,
                    &recent_sentinel_count);

                if (!t) {
                    // Add some delay to the time a worker takes between steal
                    // attempts. On a variety of systems, this delay seems to
                    // improve parallel performance of Cilk computations where
                    // workers spend a significant amount of time stealing.
                    //
                    // The computation for the delay is heuristic, based on
                    // the following:
                    // - Incorporate some delay for each steal attempt.
                    // - Increase the delay for workers who fail a lot of
                    //   steal attempts, and allow successful thieves to steal
                    //   more frequently.
                    // - Increase the delay based on the number of thieves
                    //   failing lots of steal attempts. In this case, we use
                    //   the number S of sentinels and increase the delay by
                    //   approximately S/lg S, which seems to work better than
                    //   a linear increase in practice.
#ifndef __APPLE__
#ifndef __aarch64__
                    uint64_t stop = 450 * ATTEMPTS;
                    if (fails > stealable)
                        stop += 650 * ATTEMPTS;
                    stop *= sentinel_div_lg_sentinel;
                    // On x86-64, the latency of a pause instruction varies
                    // between microarchitectures. We use the cycle counter to
                    // delay by a certain amount of time, regardless of the
                    // latency of pause.
                    while ((__builtin_readcyclecounter() - start) < stop) {
                        busy_pause();
                    }
#else
                    int pause_count = 200 * ATTEMPTS;
                    if (fails > stealable)
                        pause_count += 50 * ATTEMPTS;
                    pause_count *= sentinel_div_lg_sentinel;
                    // On arm64, we can't necessarily read the cycle counter
                    // without a kernel patch. Instead, we just perform some
                    // number of pause instructions.
                    for (int i = 0; i < pause_count; ++i)
                        busy_pause();
#endif // __aarch64__
#endif // __APPLE__
                }
            }
            CILK_START_TIMING(w, INTERVAL_SCHED);
            // If one Cilkified region stops and another one starts, then a
            // worker can reach this point with t == NULL and w->g->done ==
            // false. Check that t is not NULL before calling do_what_it_says.
            if (t) {
#if ENABLE_THIEF_SLEEP
                const unsigned int MIN_FAILS = 2 * ATTEMPTS;
                uint64_t start, end;
                // Executing do_what_it_says involves some minimum amount of
                // work, which can be used to amortize the cost of some failed
                // steal attempts. Therefore, avoid measuring the elapsed
                // cycles if we haven't failed many steal attempts.
                if (fails > MIN_FAILS) {
                    start = gettime_fast();
                }
#endif // ENABLE_THIEF_SLEEP
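                // Run the stolen closure t.  When thief sleep is enabled, the
                // time spent here is measured (start/end above and below) and
                // used to pay down the count of failed steal attempts.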
                do_what_it_says(deques, w, self, t);
#if ENABLE_THIEF_SLEEP
                if (fails > MIN_FAILS) {
                    end = gettime_fast();
                    uint64_t elapsed = end - start;
                    // Decrement the count of failed steal attempts based on
                    // the amount of work done.
                    fails = decrease_fails_by_work(rts, fails, elapsed,
                                                   &sample_threshold);
                    if (fails < SENTINEL_THRESHOLD) {
                        inefficient_history = 0;
                        efficient_history = 0;
                    }
                } else {
                    fails = 0;
                    sample_threshold = SENTINEL_THRESHOLD;
                }
#endif // ENABLE_THIEF_SLEEP
                t = NULL;
            }
        }

        if (!is_boss &&
            atomic_load_explicit(&rts->done, memory_order_relaxed)) {
            // If it appears the computation is done, busy-wait for a while
            // before exiting the work-stealing loop, in case another cilkified