#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
#include <stdatomic.h>
#include <sched.h>
#include <stdint.h>
#include <stdio.h>
#include <pthread.h>
#include <stdlib.h>
#include <string.h> /* strerror */
#ifdef __linux__
#include <sys/sysinfo.h>
#endif
#ifdef __FreeBSD__
#include <pthread_np.h>
#endif
#include <unistd.h>
#include "cilk-internal.h"
#include "debug.h"
#include "fiber.h"
#include "global.h"
#include "init.h"
#include "local.h"
#include "readydeque.h"
#include "sched_stats.h"
#include "scheduler.h"
#include "worker_coord.h"
#if defined __FreeBSD__ && __FreeBSD__ < 13
typedef cpuset_t cpu_set_t;
#endif
extern local_state default_worker_local_state;
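/* Initialize a worker's local_state: allocate the shadow stack that backs the
worker's deque, zero the runtime jump buffer, and reset the scheduling flags
and statistics. Returns the same local_state pointer it was given. */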
static local_state *worker_local_init(local_state *l, global_state *g) {
l->shadow_stack = (__cilkrts_stack_frame **)calloc(
g->options.deqdepth, sizeof(struct __cilkrts_stack_frame *));
for (int i = 0; i < JMPBUF_SIZE; i++) {
l->rts_ctx[i] = NULL;
}
l->state = WORKER_IDLE;
l->provably_good_steal = false;
l->exiting = false;
l->returning = false;
l->rand_next = 0; /* will be reset in scheduler loop */
l->wake_val = 0;
cilk_sched_stats_init(&(l->stats));
return l;
}
static void worker_local_destroy(local_state *l, global_state *g) {
(void)l; // not currently used
(void)g; // not currently used
/* currently nothing to do here */
}
static void deques_init(global_state *g) {
cilkrts_alert(BOOT, "(deques_init) Initializing deques");
for (unsigned int i = 0; i < g->options.nproc; i++) {
g->deques[i].top = NULL;
g->deques[i].bottom = NULL;
g->deques[i].mutex_owner = NO_WORKER;
}
}
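/* Set up the global worker table. Only worker 0 is fully initialized here;
the remaining slots point at the shared dummy worker until the corresponding
workers are created. The index-to-worker and worker-to-index maps start out
as identity mappings. */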
static void workers_init(global_state *g) {
cilkrts_alert(BOOT, "(workers_init) Initializing workers");
for (unsigned int i = 0; i < g->options.nproc; i++) {
if (i == 0) {
// Initialize worker 0, so we always have a worker structure to fall
// back on.
__cilkrts_init_tls_worker(0, g);
atomic_store_explicit(&g->dummy_worker.tail, NULL,
memory_order_relaxed);
atomic_store_explicit(&g->dummy_worker.head, NULL,
memory_order_relaxed);
} else {
g->workers[i] = &g->dummy_worker;
}
// Initialize index-to-worker map entry for this worker.
g->worker_args[i].id = i;
g->worker_args[i].g = g;
g->index_to_worker[i] = i;
g->worker_to_index[i] = i;
}
}
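/* Create and initialize worker i. Worker 0 reuses the statically allocated
default_worker and default_worker_local_state and becomes the calling thread's
TLS worker; every other worker gets a single aligned allocation holding both
its __cilkrts_worker and its local_state. The deque pointers (tail, head, exc)
start one slot past the base of the shadow stack. */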
__cilkrts_worker *__cilkrts_init_tls_worker(worker_id i, global_state *g) {
cilkrts_alert(BOOT, "(workers_init) Initializing worker %u", i);
__cilkrts_worker *w;
if (i == 0) {
// Use default_worker structure for worker 0.
w = &default_worker;
*(struct local_state **)(&w->l) =
worker_local_init(&default_worker_local_state, g);
__cilkrts_set_tls_worker(w);
} else {
size_t alignment = 2 * __alignof__(__cilkrts_worker);
void *mem = cilk_aligned_alloc(
alignment,
round_size_to_alignment(alignment, sizeof(__cilkrts_worker) +
sizeof(local_state)));
w = (__cilkrts_worker *)mem;
*(struct local_state **)(&w->l) =
worker_local_init(mem + sizeof(__cilkrts_worker), g);
}
*(worker_id *)(&w->self) = i;
w->extension = NULL;
w->ext_stack = NULL;
*(struct global_state **)(&w->g) = g;
*(struct __cilkrts_stack_frame ***)(&w->ltq_limit) =
w->l->shadow_stack + g->options.deqdepth;
g->workers[i] = w;
__cilkrts_stack_frame **init = w->l->shadow_stack + 1;
atomic_store_explicit(&w->tail, init, memory_order_relaxed);
atomic_store_explicit(&w->head, init, memory_order_relaxed);
atomic_store_explicit(&w->exc, init, memory_order_relaxed);
if (i != 0) {
w->hyper_table = NULL;
}
// initialize internal malloc first
cilk_internal_malloc_per_worker_init(w);
// zero-initialize the worker's fiber pool.
cilk_fiber_pool_per_worker_zero_init(w);
return w;
}
#if ENABLE_WORKER_PINNING
#ifdef CPU_SETSIZE
/**
* Move the <code>cpu</code> bit from the <code>cpu_set_t</code>
* <code>from</code> to the <code>cpu_set_t</code> <code>to</code>, iff the bit
* is set in <code>from</code>.
*
* @param cpu the id of the cpu to move
* @param to the set to move the cpu into
* @param from the set to move the cpu out of
*/
static void move_bit(int cpu, cpu_set_t *to, cpu_set_t *from) {
if (CPU_ISSET(cpu, from)) {
CPU_CLR(cpu, from);
CPU_SET(cpu, to);
}
}
/**
* Fill in the passed-in <code>worker_mask</code> to contain all the cpus in
* the next group of cpus, as defined by <code>group_size</code>,
* <code>step_in</code>, and <code>step_out</code>. The
* <code>unassigned_mask</code> is cleared of these bits to avoid reusing cpus
* for different workers.
*
* @param worker_mask (output) the processor mask that will store the set
* of all cpu ids to assign to worker <code>w_id</code>
* @param w_id the id of the worker that will be pinned using the
* <code>worker_mask</code> (used for debug messages)
* @param cpu_start the cpu id from which to start searching in the
* <code>unassigned_mask</code> for an available cpu
* @param unassigned_mask the set of cores in the process that are unassigned
* to any workers
* @param group_size the number of cpus to allow <code>w_id</code> to use
* @param step_in the offset between cpus in the same group
* @param step_out the offset between the start of one group and the
* next
* @param available_cores the total number of cores available to the process
* (used for debug messages)
*
* @return the first possible cpu id for the next group (not
* guaranteed to be available in the
* <code>unassigned_mask</code>)
*/
static inline int fill_worker_mask_and_get_next_cpu(
cpu_set_t *const worker_mask, int const w_id, int const cpu_start,
cpu_set_t *const unassigned_mask, int const group_size, int const step_in,
int const step_out, int const available_cores) {
int cpu = cpu_start;
while (!CPU_ISSET(cpu, unassigned_mask)) {
++cpu;
}
CPU_CLR(cpu, unassigned_mask);
CPU_ZERO(worker_mask);
CPU_SET(cpu, worker_mask);
for (int off = 1; off < group_size; ++off) {
move_bit(cpu + off * step_in, worker_mask, unassigned_mask);
cilkrts_alert(BOOT, "Bind worker %u to core %d of %d", w_id,
cpu + off * step_in, available_cores);
}
cpu += step_out;
return cpu;
}
/**
* Pins the passed-in thread to the set of cpus in the <code>worker_mask</code>.
*
* @param thread_id the id of the thread that should be pinned
* @param worker_mask the set of cpus to which the thread should be pinned
*/
static inline void pin_thread(pthread_t const thread_id,
cpu_set_t *const worker_mask) {
int const err =
pthread_setaffinity_np(thread_id, sizeof(*worker_mask), worker_mask);
CILK_ASSERT_G(err == 0);
}
#endif
#endif // ENABLE_WORKER_PINNING
/**
* Initializes all other threads in the runtime, and then enters the
* scheduling loop.
*
* @param args the arguments to be used by this worker in
* <code>scheduler_thread_proc</code>
*
* @return the result of <code>scheduler_thread_proc</code>
*/
void *init_threads_and_enter_scheduler(void *args) {
struct worker_args *w_arg = (struct worker_args *)args;
struct global_state *g = w_arg->g;
int const worker_start = 2;
/* TODO: Mac OS has a better interface allowing the application
to request that two threads run as far apart as possible by
giving them distinct "affinity tags". */
#if ENABLE_WORKER_PINNING
#ifdef CPU_SETSIZE
int const my_id = worker_start - 1;
// Affinity setting, from cilkplus-rts
cpu_set_t process_mask;
int available_cores = 0;
// Get the mask from the parent thread (master thread)
if (0 == pthread_getaffinity_np(pthread_self(), sizeof(process_mask),
&process_mask)) {
available_cores = CPU_COUNT(&process_mask);
}
/* pin_strategy controls how threads are spread over cpu numbers.
Based on very limited testing FreeBSD groups hyperthreads of a
core together (consecutive IDs) and Linux separates them.
This is not guaranteed and may not even be consistent.
The order is influenced by board firmware.
When sysfs is enabled, Linux offers
/sys/devices/system/cpu/cpu0/topology/core_siblings_list
which is in a format compatible with cpulist_parse().
FreeBSD exports sysctl kern.sched.topology_spec, an XML representation
of the processor topology. */
/* TODO: Fix pinning strategy to better utilize cpu architecture. For
example, we probably do not want to pin a worker to cpus on different
NUMA nodes. */
#ifdef __FreeBSD__
int pin_strategy = 1; /* (0, 1), (2, 3), ... */
#else
int pin_strategy = 0; /* (0, N/2), (1, N/2 + 1), ... */
#endif
switch (env_get_int("CILK_PIN")) {
case 1:
pin_strategy = 0;
break;
case 2:
pin_strategy = 1;
break;
case 3:
available_cores = 0;
break;
}
#endif
#endif // ENABLE_WORKER_PINNING
int n_threads = g->nworkers;
CILK_ASSERT(n_threads > 0);
/* TODO: Apple supports thread affinity using a different interface. */
cilkrts_alert(BOOT, "(threads_init) Setting up threads");
#if ENABLE_WORKER_PINNING
#ifdef CPU_SETSIZE
cpu_set_t my_worker_mask;
/* Three cases: if the core count is at least twice the worker count,
give each worker a group of floor(core count / worker count) CPUs.
If the worker count exceeds the core count, do not bind workers to CPUs.
Otherwise, bind each worker to a single CPU. */
int cpu = 0;
int group_size = 1;
int step_in = 1, step_out = 1;
/* If cores are overallocated it doesn't make sense to pin threads. */
if (n_threads > available_cores) {
available_cores = 0;
} else {
group_size = available_cores / n_threads;
if (pin_strategy != 0) {
step_in = 1;
step_out = group_size;
} else {
step_out = 1;
step_in = n_threads;
}
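/* For example, with 8 available cores and 4 workers, group_size = 2:
strategy 0 yields the worker masks {0,4}, {1,5}, {2,6}, {3,7}, while
strategy 1 yields {0,1}, {2,3}, {4,5}, {6,7} (assuming cpu ids 0-7 are
all present in process_mask). */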
// Get my CPU first, but don't pin yet; special OS permissions are
// required to pin a thread to a cpu not in the current thread's
// cpu affinity set
cpu = fill_worker_mask_and_get_next_cpu(
&my_worker_mask, my_id, cpu, &process_mask, group_size, step_in,
step_out, available_cores);
}
#endif
#endif // ENABLE_WORKER_PINNING
for (int w = worker_start; w < n_threads; w++) {
int status = pthread_create(&g->threads[w], NULL, scheduler_thread_proc,
&g->worker_args[w]);
if (status != 0) {
cilkrts_bug(NULL, "Cilk: thread creation (%u) failed: %s", w,
strerror(status));
}
#if ENABLE_WORKER_PINNING
#ifdef CPU_SETSIZE
if (available_cores > 0) {
cpu_set_t worker_mask;
/* Skip to the next active CPU ID. */
cpu = fill_worker_mask_and_get_next_cpu(
&worker_mask, w, cpu, &process_mask, group_size, step_in,
step_out, available_cores);
pin_thread(g->threads[w], &worker_mask);
}
#endif
#endif // ENABLE_WORKER_PINNING
}
#if ENABLE_WORKER_PINNING
#ifdef CPU_SETSIZE
if (available_cores > 0) {
pin_thread(g->threads[my_id], &my_worker_mask);
}
#endif
#endif
return scheduler_thread_proc(args);
}
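/* Create the first worker thread (worker 1). That thread creates the rest of
the worker threads in init_threads_and_enter_scheduler before entering the
scheduler itself, so the boss thread only ever spawns one pthread here. */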
static void threads_init(global_state *g) {
int const worker_start = 1;
// Make sure we are supposed to create worker threads
if (worker_start < (int)g->nworkers) {
int status = pthread_create(&g->threads[worker_start], NULL,
init_threads_and_enter_scheduler,
&g->worker_args[worker_start]);
if (status != 0) {
cilkrts_bug(NULL, "Cilk: thread creation (%u) failed: %s",
worker_start, strerror(status));
}
}
}
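/* Initialize the global state, the worker table, and the deques, and allocate
the root closure along with its fiber. Worker threads are not created here;
they are started lazily when the first Cilkified region begins. */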
global_state *__cilkrts_startup(int argc, char *argv[]) {
cilkrts_alert(BOOT, "(__cilkrts_startup) argc %d", argc);
global_state *g = global_state_init(argc, argv);
workers_init(g);
deques_init(g);
// Create the root closure and a fiber to go with it. Use worker 0 to
// allocate the closure and fiber.
__cilkrts_worker *w0 = g->workers[0];
Closure *t = Closure_create(w0, NULL);
struct cilk_fiber *fiber = cilk_fiber_allocate(g->options.stacksize);
t->fiber = fiber;
g->root_closure = t;
return g;
}
// Global constructor for starting up the default cilkrts.
__attribute__((constructor)) void __default_cilkrts_startup() {
default_cilkrts = __cilkrts_startup(0, NULL);
for (unsigned i = 0; i < cilkrts_callbacks.last_init; ++i)
cilkrts_callbacks.init[i]();
/* Any attempt to register more initializers should fail. */
cilkrts_callbacks.after_init = true;
}
void __cilkrts_internal_set_nworkers(unsigned int nworkers) {
set_nworkers(default_cilkrts, nworkers);
}
// Start the Cilk workers in g, for example, by creating their underlying
// Pthreads.
static void __cilkrts_start_workers(global_state *g) {
threads_init(g);
g->workers_started = true;
}
// Stop the Cilk workers in g, for example, by joining their underlying
// Pthreads.
static void __cilkrts_stop_workers(global_state *g) {
/* CILK_ASSERT( */
/* !atomic_load_explicit(&g->start_thieves, memory_order_acquire)); */
// Set g->terminate, to allow the workers to exit their
// outermost scheduling loop.
g->terminate = true;
// Wake up all the workers.
// We call wake_all_disengaged, rather than wake_thieves, to properly
// terminate all thieves, whether they're disengaged inside or outside the
// work-stealing loop.
wake_all_disengaged(g);
// Join the worker pthreads
unsigned int worker_start = 1;
for (unsigned int i = worker_start; i < g->nworkers; i++) {
int status = pthread_join(g->threads[i], NULL);
if (status != 0)
cilkrts_bug(NULL, "Cilk runtime error: thread join (%u) failed: %s",
i, strerror(status));
}
cilkrts_alert(BOOT, "(threads_join) All workers joined!");
g->workers_started = false;
}
// Block until signaled that the Cilkified region is done. Executed by the
// Cilkifying thread.
static inline void wait_until_cilk_done(global_state *g) {
wait_while_cilkified(g);
}
// Helper method to make the boss thread wait for the cilkified region
// to complete.
static inline __attribute__((noinline)) void boss_wait_helper(void) {
// The setjmp/longjmp to and from user code can invalidate the
// function arguments and local variables in this function. Get
// fresh copies of these arguments from the runtime's global
// state.
global_state *g = __cilkrts_tls_worker->g;
__cilkrts_stack_frame *sf = g->root_closure->frame;
CILK_BOSS_START_TIMING(g);
// Wait until the cilkified region is done executing.
wait_until_cilk_done(g);
__cilkrts_need_to_cilkify = true;
// At this point, some Cilk worker must have completed the
// Cilkified region and executed uncilkify at the end of the Cilk
// function. The longjmp will therefore jump to the end of the
// Cilk function. We need only restore the stack pointer to its
// original value on the Cilkifying thread's stack.
CILK_BOSS_STOP_TIMING(g);
// Restore the boss's original rsp, so the boss completes the Cilk
// function on its original stack.
SP(sf) = g->orig_rsp;
sysdep_restore_fp_state(sf);
sanitizer_start_switch_fiber(NULL);
__builtin_longjmp(sf->ctx, 1);
}
// Set up runtime structures to start a new Cilkified region. Executed by the
// Cilkifying thread in cilkify().
void __cilkrts_internal_invoke_cilkified_root(__cilkrts_stack_frame *sf) {
global_state *g = default_cilkrts;
// Initialize the boss thread's runtime structures, if necessary.
static bool boss_initialized = false;
if (!boss_initialized) {
__cilkrts_worker *w0 = g->workers[0];
cilk_fiber_pool_per_worker_init(w0);
w0->l->rand_next = 162347;
if (USE_EXTENSION) {
g->root_closure->ext_fiber =
cilk_fiber_allocate(g->options.stacksize);
}
boss_initialized = true;
}
__cilkrts_need_to_cilkify = false;
// The boss thread will impersonate the last exiting worker until it tries
// to become a thief.
__cilkrts_worker *w;
w = g->workers[0];
Closure *root_closure = g->root_closure;
if (USE_EXTENSION) {
// Initialize sf->extension, to appease the later call to
// setup_for_execution.
sf->extension = w->extension;
// Initialize worker->ext_stack.
w->ext_stack = sysdep_get_stack_start(root_closure->ext_fiber);
}
CILK_START_TIMING(w, INTERVAL_CILKIFY_ENTER);
// Mark the root closure as not initialized
g->root_closure_initialized = false;
// Mark the root closure as ready
Closure_make_ready(g->root_closure);
// Setup the stack pointer to point at the root closure's fiber.
g->orig_rsp = SP(sf);
void *new_rsp =
(void *)sysdep_reset_stack_for_resume(root_closure->fiber, sf);
USE_UNUSED(new_rsp);
CILK_ASSERT_POINTER_EQUAL(SP(sf), new_rsp);
// Mark that this root frame is last (meaning, at the top of the stack)
sf->flags |= CILK_FRAME_LAST;
// Mark this frame as stolen, to maintain invariants in the scheduler
__cilkrts_set_stolen(sf);
// Associate sf with this root closure
Closure_clear_frame(root_closure);
Closure_set_frame(root_closure, sf);
// Now kick off execution of the Cilkified region by setting appropriate
// flags.
/* reset_disengaged_var(g); */
if (__builtin_expect(
atomic_load_explicit(&g->cilkified, memory_order_relaxed), false)) {
cilkrts_bug(
NULL,
"ERROR: OpenCilk runtime already executing a Cilk computation.\n");
}
set_cilkified(g);
// Set g->done = 0, so Cilk workers will continue trying to steal.
atomic_store_explicit(&g->done, 0, memory_order_release);
// Wake up the thieves, to allow them to begin work stealing.
//
// NOTE: We might want to wake thieves gradually, as successful steals
// occur, rather than all at once. Initial testing of this approach did not
// seem to perform well, however. One possible reason why could be because
// of the extra kernel interactions involved in waking workers gradually.
wake_thieves(g);
/* request_more_thieves(g, g->nworkers); */
// Start the workers if necessary
if (__builtin_expect(!g->workers_started, false)) {
__cilkrts_start_workers(g);
}
if (__builtin_setjmp(g->boss_ctx) == 0) {
CILK_SWITCH_TIMING(w, INTERVAL_CILKIFY_ENTER, INTERVAL_SCHED);
do_what_it_says_boss(w, root_closure);
} else {
// The stack on which
// __cilkrts_internal_invoke_cilkified_root() was called may
// be corrupted at this point, so we call this helper method,
// marked noinline, to ensure the compiler does not try to use
// any data from the stack.
boss_wait_helper();
}
}
// Finish the execution of a Cilkified region. Executed by a worker in g.
void __cilkrts_internal_exit_cilkified_root(global_state *g,
__cilkrts_stack_frame *sf) {
__cilkrts_worker *w = __cilkrts_get_tls_worker();
CILK_ASSERT(w->l->state == WORKER_RUN);
CILK_SWITCH_TIMING(w, INTERVAL_WORK, INTERVAL_CILKIFY_EXIT);
worker_id self = w->self;
const bool is_boss = (0 == self);
ReadyDeque *deques = g->deques;
// Mark the computation as done. Also "sleep" the workers: update global
// flags so workers who exit the work-stealing loop will return to waiting
// for the start of the next Cilkified region.
sleep_thieves(g);
atomic_store_explicit(&g->done, 1, memory_order_release);
/* wake_all_disengaged(g); */
if (!is_boss) {
w->l->exiting = true;
__cilkrts_worker **workers = g->workers;
__cilkrts_worker *w0 = workers[0];
w0->hyper_table = w->hyper_table;
w->hyper_table = NULL;
w0->extension = w->extension;
w->extension = NULL;
}
// Clear this worker's deque. Nobody can successfully steal from this deque
// at this point, because head == tail, but we still want any subsequent
// Cilkified region to start with an empty deque. We go ahead and grab the
// deque lock to make sure no other worker has a lingering pointer to the
// closure.
deque_lock_self(deques, self);
deques[self].bottom = (Closure *)NULL;
deques[self].top = (Closure *)NULL;
WHEN_CILK_DEBUG(g->root_closure->owner_ready_deque = NO_WORKER);
deque_unlock_self(deques, self);
// Clear the flags in sf. This routine runs before leave_frame in a Cilk
// function, but leave_frame is executed conditionally in Cilk functions
// based on whether sf->flags == 0. Clearing sf->flags ensures that the
// Cilkifying thread does not try to execute leave_frame.
CILK_ASSERT(__cilkrts_synced(sf));
sf->flags = 0;
CILK_STOP_TIMING(w, INTERVAL_CILKIFY_EXIT);
if (is_boss) {
// We finished the computation on the boss thread. No need to jump to
// the runtime in this case; just return normally.
local_state *l = w->l;
atomic_store_explicit(&g->cilkified, 0, memory_order_relaxed);
l->state = WORKER_IDLE;
__cilkrts_need_to_cilkify = true;
// Restore the boss's original rsp, so the boss completes the Cilk
// function on its original stack.
SP(sf) = g->orig_rsp;
sysdep_restore_fp_state(sf);
sanitizer_start_switch_fiber(NULL);
__builtin_longjmp(sf->ctx, 1);
} else {
// done; go back to runtime
CILK_START_TIMING(w, INTERVAL_WORK);
longjmp_to_runtime(w);
}
}
static void global_state_terminate(global_state *g) {
cilk_fiber_pool_global_terminate(g); /* before malloc terminate */
cilk_internal_malloc_global_terminate(g);
cilk_sched_stats_print(g);
}
static void global_state_deinit(global_state *g) {
cilkrts_alert(BOOT, "(global_state_deinit) Clean up global state");
cilk_fiber_pool_global_destroy(g);
cilk_internal_malloc_global_destroy(g); // internal malloc last
cilk_mutex_destroy(&(g->print_lock));
cilk_mutex_destroy(&(g->index_lock));
// TODO: Convert to cilk_* equivalents
pthread_mutex_destroy(&g->cilkified_lock);
pthread_cond_destroy(&g->cilkified_cond_var);
/* pthread_mutex_destroy(&g->start_thieves_lock); */
/* pthread_cond_destroy(&g->start_thieves_cond_var); */
pthread_mutex_destroy(&g->disengaged_lock);
pthread_cond_destroy(&g->disengaged_cond_var);
free(g->worker_args);
g->worker_args = NULL;
free(g->workers);
g->workers = NULL;
g->nworkers = 0;
free(g->deques);
g->deques = NULL;
free(g->threads);
g->threads = NULL;
free(g->index_to_worker);
g->index_to_worker = NULL;
free(g->worker_to_index);
g->worker_to_index = NULL;
free(g);
}
static void deques_deinit(global_state *g) {
cilkrts_alert(BOOT, "(deques_deinit) Clean up deques");
for (unsigned int i = 0; i < g->options.nproc; i++) {
CILK_ASSERT(g->deques[i].mutex_owner == NO_WORKER);
}
}
static void worker_terminate(__cilkrts_worker *w, void *data) {
(void)data; // not currently used
cilk_fiber_pool_per_worker_terminate(w);
hyper_table *ht = w->hyper_table;
if (ht) {
local_hyper_table_free(ht);
w->hyper_table = NULL;
}
worker_local_destroy(w->l, w->g);
cilk_internal_malloc_per_worker_terminate(w); // internal malloc last
}
static void workers_terminate(global_state *g) {
for_each_worker_rev(g, worker_terminate, NULL);
}
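/* Accumulate each worker's internal-malloc allocation counts into the
NUM_BUCKETS-long array passed via data; workers_deinit uses the totals to
check for leaks when MEMORY debugging is enabled. */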
static void sum_allocations(__cilkrts_worker *w, void *data) {
long *counts = (long *)data;
local_state *l = w->l;
for (int i = 0; i < NUM_BUCKETS; ++i) {
counts[i] += l->im_desc.buckets[i].allocated;
}
}
static void wrap_fiber_pool_destroy(__cilkrts_worker *w, void *data) {
(void)data; // not currently used
cilk_fiber_pool_per_worker_destroy(w);
}
static void workers_deinit(global_state *g) {
cilkrts_alert(BOOT, "(workers_deinit) Clean up workers");
long allocations[NUM_BUCKETS] = {0, 0, 0, 0};
for_each_worker_rev(g, sum_allocations, allocations);
if (DEBUG_ENABLED(MEMORY)) {
for (int i = 0; i < NUM_BUCKETS; ++i)
CILK_ASSERT_INDEX_ZERO(allocations, i, , "%ld");
}
unsigned i = g->options.nproc;
while (i-- > 0) {
__cilkrts_worker *w = g->workers[i];
g->workers[i] = NULL;
if (!worker_is_valid(w, g))
continue;
cilk_internal_malloc_per_worker_destroy(w); // internal malloc last
free(w->l->shadow_stack);
w->l->shadow_stack = NULL;
*(struct local_state **)(&w->l) = NULL;
if (i != 0)
free(w);
}
/* TODO: Export initial reducer map */
}
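/* Tear down the runtime: stop the workers if they are still running, run the
registered exit callbacks, free the root closure and its fiber(s), then
terminate and deinitialize the per-worker and global structures in an order
that keeps the internal-malloc statistics meaningful. */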
CHEETAH_INTERNAL void __cilkrts_shutdown(global_state *g) {
CILK_ASSERT_NULL(exception_reducer.exn);
// If the workers are still running, stop them now.
if (g->workers_started)
__cilkrts_stop_workers(g);
for (unsigned i = cilkrts_callbacks.last_exit; i > 0;)
cilkrts_callbacks.exit[--i]();
// Deallocate the root closure and its fiber
cilk_fiber_deallocate_global(g, g->root_closure->fiber);
if (USE_EXTENSION)
cilk_fiber_deallocate_global(g, g->root_closure->ext_fiber);
Closure_destroy_global(g, g->root_closure);
// Cleanup the global state
workers_terminate(g);
flush_alert_log();
/* This needs to be before global_state_terminate for good stats. */
for_each_worker(g, wrap_fiber_pool_destroy, NULL);
// global_state_terminate collects and prints out stats, and thus
// should occur *BEFORE* workers_deinit, because workers_deinit
// deinitializes worker-related data structures which may
// include stats that we care about.
// Note: the fiber pools use the internal malloc, and fibers in the fiber
// pools are not freed until workers_deinit, so the internal-malloc stats
// do not account for the fibers still held in the pools.
global_state_terminate(g);
workers_deinit(g);
deques_deinit(g);
global_state_deinit(g);
}
// Global destructor for shutting down the default cilkrts
__attribute__((destructor)) void __default_cilkrts_shutdown() {
__cilkrts_shutdown(default_cilkrts);
}