\documentclass[8pt]{extarticle}
%% \usepackage[fleqn]{amsmath}
\usepackage[margin=1in]{geometry}
\usepackage{amsmath,amsfonts,amsthm,bm}
\usepackage{breqn}
\usepackage{amsmath}
\usepackage{mathtools}
\usepackage{amssymb}
\usepackage{bbm}
\usepackage{tikz}
\usepackage[ruled,vlined,linesnumbered,lined,boxed,commentsnumbered]{algorithm2e}
\usepackage{siunitx}
\usepackage{graphicx}
\usepackage{subcaption}
%% \usepackage{datetime}
\usepackage{multirow}
\usepackage{multicol}
\usepackage{mathrsfs}
\usepackage{fancyhdr}
\usepackage{fancyvrb}
\usepackage{parskip} %turns off paragraph indent
\pagestyle{fancy}
\usepackage{xcolor}
\usepackage{mdframed}
\usepackage[small]{titlesec}
\usepackage{hanging}
\usetikzlibrary{arrows}
\DeclareMathOperator*{\argmin}{argmin}
\newcommand*{\argminl}{\argmin\limits}
\newcommand{\mathleft}{\@fleqntrue\@mathmargin0pt}
\newcommand{\R}{\mathbb{R}}
\newcommand{\Z}{\mathbb{Z}}
\newcommand{\N}{\mathbb{N}}
\newcommand{\ppartial}[2]{\frac{\partial #1}{\partial #2}}
\newcommand{\p}{\partial}
\newcommand{\te}[1]{\text{#1 }}
\newcommand{\norm}[1]{\|#1\|}
\setcounter{MaxMatrixCols}{20}
% remove excess vertical space for align equations
\setlength{\abovedisplayskip}{0pt}
\setlength{\belowdisplayskip}{0pt}
\setlength{\abovedisplayshortskip}{0pt}
\setlength{\belowdisplayshortskip}{0pt}
\newtheorem{theorem}{Theorem}[section]
\newtheorem{corollary}{Corollary}[theorem]
\newtheorem{lemma}[theorem]{Lemma}
% \newtheorem{mdtheorem}{Theorem}
% \newenvironment{theorem}
% {\begin{mdframed}[
% backgroundcolor=green!10,
% topline=false,
% rightline=false,
% bottomline=false,
% leftline=false
% ]\begin{mdtheorem}}
% {\end{mdtheorem}\end{mdframed}}
\begin{document}
\lhead{Notes - Atomics \& Multithreaded Programming, Yuan Liu}
% \begin{align}
% \nabla f_{k+1}^T p_k & \geq c_2 \nabla f_k^T p_k\\
% \frac{\partial \phi(\alpha_k)}{\partial \alpha_k} & \geq \frac{\partial \phi(0)}{\partial \alpha_k}\\
% &c_1,\alpha_k \in (0,1)\\
% &0<c_1<c_2<1
% \end{align}
\begin{multicols*}{2}
Gathered notes from:
\begin{itemize}
\item Rust Atomics and Locks by Mara Bos \cite{rustatomicsbook}
\end{itemize}
\section{Basic Concurrency Primitives Overview}
topics: interior mutability, threadsafety, runtime borrow check
\subsection{Single Thread Interior Mutability}
RefCell: borrow at runtime\\
Cell: value replacement (get requires Copy), not borrow
\subsection{Threadsafe Interior Mutability}
Mutex: exclusive borrow at runtime\\
RwLock: differentiates borrow type: exclusive vs. shared read only\\
Atomics: value replacement, not borrow; limited to word size\\
UnsafeCell: express raw pointer to wrapped data via unsafe block; in practice wrapped by a safer interface to user\\
Traits for threadsafety: Send, Sync\\
T: Send $\iff$ T can be transferred to another thread\\
T: Sync $\iff$ T can be shared with $>1$ threads; \&T: Send\\
all primitive types are Send + Sync\\
auto traits:
\begin{itemize}
\item automatically opt-in
\item manually opt-out
\item recursively deduced on fields of structs
\end{itemize}
un-implemented trait (\verb%!Trait%) in some types:
\begin{verbatim}
Cell<T>: Send + !Sync where T: Send
*const T / *mut T: !Send + !Sync
Rc<T>: !Send + !Sync
std::marker::PhantomData<T>: !Send / !Sync where T: !Send / !Sync
\end{verbatim}
force opt-in for un-implemented type:
\begin{verbatim}
unsafe impl Send/Sync for T {}
\end{verbatim}
\vfill\null
\columnbreak
\subsection{Mutex}
T is usually Send (not required), in which case the Mutex is Sync:\\
T: Send $\implies$ \verb%Mutex<T>: Sync%
logical states: unlocked, locked
owning wrapper over T
interface makes access to T safer
MutexGuard as proof of exclusive access; drop automatically triggers unlock at the end of its lifetime
efficient usage: make locked interval as short as possible
lock poisoning: when thread panics while holding the lock
\begin{itemize}
\item the lock is released and the mutex is marked poisoned
\item subsequent \verb%lock()% calls return an error
\item the error returned by a poisoned mutex still carries the locked MutexGuard so the user can restore the data to a consistent state
\end{itemize}
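a minimal usage sketch (illustrative, not from the book) of the guard pattern, keeping the locked interval short:
\begin{verbatim}
use std::sync::Mutex;

fn main() {
    let counter = Mutex::new(0);
    {
        // MutexGuard = proof of exclusive access;
        // unlock happens automatically when it is dropped
        let mut guard = counter.lock().unwrap();
        *guard += 1;
    } // locked interval ends here
    println!("{}", counter.lock().unwrap());
}
\end{verbatim}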
unnamed guard may not be immediately dropped in certain statements:
\begin{verbatim}
if ... /*dropped here; only boolean value needed*/ {
...
}
\end{verbatim}
\begin{verbatim}
if let ... = ... {
... /*dropped here; may borrow from let expression*/
}
\end{verbatim}
\subsection{ReaderWriterLock}
requires T: Send + Sync:\\
T: Send + Sync $\implies$ \verb%RwLock<T>: Send + Sync%
logical states: unlocked, locked by 1 exclusive accessor, locked by any number of shared readers
differentiating lock guards:\\
read() $\implies$ RwLockReadGuard: Deref\\
write() $\implies$ RwLockWriteGuard: DerefMut
writer starvation issue to consider for fairness of access
\subsection{Park/Unpark}
park: current thread put itself to sleep\\
unpark: another thread wakes the sleeping thread; needs the handle of the sleeping thread, obtained from \verb%spawn()% or \verb%thread::current()%
spurious wakeups are possible $\implies$ user provides a check upon wakeup
request to unpark recorded if unpark happens before park in order to avoid lost notification, but does not stack up (max of 1 unpark recorded)
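a minimal illustrative sketch: the waiting thread re-checks a flag because park may return spuriously and at most 1 unpark is recorded:
\begin{verbatim}
use std::sync::atomic::{AtomicBool, Ordering};
use std::thread;

static READY: AtomicBool = AtomicBool::new(false);

fn main() {
    let waiter = thread::spawn(|| {
        // re-check on every wakeup (spurious wakeups possible)
        while !READY.load(Ordering::Acquire) {
            thread::park();
        }
    });
    READY.store(true, Ordering::Release);
    waiter.thread().unpark(); // handle from spawn()
    waiter.join().unwrap();
}
\end{verbatim}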
\vfill\null
\columnbreak
\subsection{Condition Variable}
signaling events related to protected data of mutex
methods: \verb%wait%, \verb%notify_*%
atomically unlock mutex and start waiting (to avoid lost notification)
\subsubsection{Communication}
waiting thread:
\begin{enumerate}
\item takes MutexGuard as input
\item unlocks mutex
\item thread put to sleep
\item thread wakes (via a notify of CondVar by another thread or spurious wakeup)
\item relocks mutex and returns MutexGuard
\end{enumerate}
notifying thread:
\begin{enumerate}
\item invoke notify on CondVar
\end{enumerate}
\subsubsection{Spurious Wakeup}
need additional memory to check actual event: can add this along with the original value wrapped by mutex\\
use a loop with wait to put thread back to sleep if condition not met
usage: 1 CondVar for 1 mutex
optionally can wait with timeout parameter to unconditionally wakeup thread after timeout
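a minimal illustrative sketch of the wait loop that guards against spurious wakeups, with 1 CondVar paired with 1 mutex:
\begin{verbatim}
use std::sync::{Condvar, Mutex};
use std::thread;

fn main() {
    let pair = (Mutex::new(false), Condvar::new());
    thread::scope(|s| {
        s.spawn(|| {
            *pair.0.lock().unwrap() = true; // event data under the mutex
            pair.1.notify_one();
        });
        let mut ready = pair.0.lock().unwrap();
        while !*ready { // loop: a spurious wakeup puts us back to sleep
            ready = pair.1.wait(ready).unwrap();
        }
    });
}
\end{verbatim}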
\subsection{Comparison of Interior Mutability Primitives}
\begin{tabular}{| c | c | c |}
\hline
& value replacement & reference / borrow \\
\hline
1 thread & Cell & RefCell \\
\hline
threadsafe & Atomic & Mutex/RwLock \\
\hline
\end{tabular}
\subsection{Comparison of Shared Ownership Primitives}
Rc/Arc: act similar to Box / smart pointer but with dropping logic to take care of deallocation for shared data\\
\begin{tabular}{| c | c |}
\hline
1 thread & Rc \\
\hline
threadsafe & Arc \\
\hline
\end{tabular}
\subsection{Traits for Interior Mutability Primitives}
\verb%T: Send% $\implies$ \verb%Cell<T>: Send + !Sync% (usual practical case)\\
\verb%T: !Send% $\implies$ \verb%Cell<T>: !Send + !Sync%\\
\verb%T: Send% $\implies$ \verb%RefCell<T>: Send + !Sync% (usual)\\
\verb%T: !Send% $\implies$ \verb%RefCell<T>: !Send + !Sync%\\
\verb%T: Send% $\implies$ \verb%Mutex<T>: Send + Sync% (usual)\\
\verb%T: !Send% $\implies$ \verb%Mutex<T>: !Send + !Sync%\\
\verb%T: Send + Sync% $\implies$ \verb%RwLock<T>: Send + Sync% (usual)\\
\verb%T: !Send / !Sync% $\implies$ \verb%RwLock<T>: !Send + !Sync%\\
\subsection{Traits for Shared Ownership Primitives}
\verb%Rc<T>: !Send + !Sync%\\
\verb%T: Send + Sync% $\implies$ \verb%Arc<T>: Send + Sync%\\
\vfill\null
\columnbreak
\subsection{Typical Usage Pattern}
\verb%Arc<Mutex<T>>%\\
where:\\
Arc allows threadsafe immutable sharing\\
Mutex allows interior mutability using references across $\geq1$ threads
\verb%Rc<RefCell<T>>%\\
where:\\
Rc allows single thread immutable sharing\\
RefCell allows interior mutability using references in single thread
\verb%Rc<Cell<T>>%\\
where:\\
Rc allows single thread immutable sharing\\
Cell allows interior mutability using value in single thread
\verb%Arc<Atomic<T>>%\\
where:\\
Arc allows threadsafe immutable sharing\\
Atomic allows interior mutability using value across $\geq1$ threads
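a minimal illustrative sketch of the first pattern, \verb%Arc<Mutex<T>>%:
\begin{verbatim}
use std::sync::{Arc, Mutex};
use std::thread;

fn main() {
    let shared = Arc::new(Mutex::new(Vec::new()));
    let handles: Vec<_> = (0..4)
        .map(|i| {
            let shared = Arc::clone(&shared); // threadsafe sharing
            thread::spawn(move || {
                shared.lock().unwrap().push(i); // interior mutability
            })
        })
        .collect();
    for h in handles {
        h.join().unwrap();
    }
    println!("{:?}", shared.lock().unwrap());
}
\end{verbatim}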
\vfill\null
\columnbreak
\section{Atomics}
operations:
\begin{itemize}
\item \verb%fetch_and_modify%
\item \verb%swap%
\item \verb%compare_exchange%:
\begin{itemize}
\item ABA problem for pointer algorithms
\item weak version exists for more efficient impl. on some hardware, at the expense of spurious failure (may fail even when the comparison would have succeeded)
\end{itemize}
\item \verb%fetch_update% $\iff$ load followed by a loop with \verb%compare_exchange_weak% and a user-provided computation (see the sketch after this list)
\end{itemize}
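a minimal illustrative sketch of the \verb%fetch_update% equivalence, with the user-provided computation being a plain increment:
\begin{verbatim}
use std::sync::atomic::{AtomicU32, Ordering::Relaxed};

// roughly what fetch_update does: load, then retry
// compare_exchange_weak until the update succeeds
fn fetch_increment(a: &AtomicU32) -> u32 {
    let mut current = a.load(Relaxed);
    loop {
        let new = current + 1; // user-provided computation
        match a.compare_exchange_weak(current, new, Relaxed, Relaxed) {
            Ok(prev) => return prev,
            Err(actual) => current = actual, // lost race or spurious failure
        }
    }
}
\end{verbatim}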
\subsection{Scoped Thread}
regular \verb%std::thread::spawn% requires the closure to be \verb%'static% and Send $\implies$ all captures of the closure must be owned (or \verb%'static%) and Send
\verb%std::thread::scope%:
\begin{itemize}
\item can borrow objects with non-\verb%'static% lifetimes, since the borrowed data is guaranteed to outlive the scoped threads
\item mutability rules apply
\item threads are automatically joined at the end of the scope
\end{itemize}
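a minimal illustrative sketch of scoped threads borrowing non-\verb%'static% data:
\begin{verbatim}
use std::thread;

fn main() {
    let mut data = vec![1, 2, 3];
    thread::scope(|s| {
        // shared borrows of local data are fine here
        s.spawn(|| println!("len = {}", data.len()));
        s.spawn(|| println!("first = {}", data[0]));
    }); // threads joined automatically here
    data.push(4); // exclusive access available again
}
\end{verbatim}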
\subsection{Lazy Initialization}
\begin{itemize}
\item execute once by 1 thread, sharable afterwards
\item race possible from threads, but this is different from data race which causes undefined behaviour (UB)
\item can use \verb%CondVar% / thread parking / \verb%std::sync::Once% / \verb%std::sync::OnceLock% to avoid wasted compute from multiple threads
\end{itemize}
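a minimal illustrative sketch using \verb%std::sync::OnceLock% for lazy initialization:
\begin{verbatim}
use std::sync::OnceLock;

static CONFIG: OnceLock<String> = OnceLock::new();

fn config() -> &'static str {
    // the closure runs at most once; concurrent callers block
    // until initialization completes, then share the value
    CONFIG.get_or_init(|| "expensive-to-compute value".to_string())
        .as_str()
}
\end{verbatim}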
\subsection{Move Closure}
\begin{itemize}
\item transfer ownership of value
\item capture variable via copying/moving instead of borrowing
\item to still borrow from a variable, copy a reference to it into the move closure
\item note: Atomic does not implement Copy trait
\end{itemize}
\subsection{Data Sharing Between Threads in General}
data shared need to outlive all involved threads:
\begin{itemize}
\item make data owned by the entire program via \verb%'static% lifetime (a static item exists even before the start of the main program)
\item leak an allocation and promise never to drop it from that point onward in the duration of the entire program:
eg: \verb%Box::leak(Box::new(..))%\\
note: \verb%'static% means the object will exist until the end of the program but may not exist at the start of the program\\
note: Copy $\implies$ when moved, the original value still exists
\item reference counting: track ownership and invoke \verb%drop% when no owners left\\
eg: \verb%std::rc::Rc%: clone increments counter only and gives reference to allocation\\
eg: \verb%std::sync::Arc%: version of \verb%Rc% that is threadsafe\\
\end{itemize}
use of scope and variable shadowing to reuse identifiers when cloning:
\begin{itemize}
\item shadowing: original name is not obtainable anymore in current scope
\item original name still obtainable in the outer scope; it can be cloned again in another inner scope
\end{itemize}
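a minimal illustrative sketch of the scope-and-shadowing idiom for cloning before a move closure:
\begin{verbatim}
use std::sync::Arc;
use std::thread;

fn main() {
    let a = Arc::new([1, 2, 3]);
    let t = thread::spawn({
        let a = a.clone(); // shadowed clone, visible only in this block
        move || println!("{a:?}")
    });
    println!("{a:?}"); // original name still usable out here
    t.join().unwrap();
}
\end{verbatim}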
reference-counted pointers (\verb%Rc% and \verb%Arc%) have the same restrictions as an immutable reference (\verb%&T%)
exclusivity of mutable borrows is guaranteed at compile time $\implies$ mutable aliasing between 2 variables does not occur; this lets the optimizer remove impossible code paths
assumptions held by the compiler:
\begin{itemize}
\item an immutable reference exists $\implies$ no other mutable references to the associated data exist
\item there is at maximum 1 mutable reference to an object at anytime
\end{itemize}
if such assumptions are broken, then UB exists: more wrong conclusions may be propagated through optimizations
\verb%unsafe% blocks are also assumed to be sound by the compiler which means compiler may apply optimizations and elide code when feasible
\subsection{Interior Mutability}
shared reference \verb%& T%: copied and sharable (not mutable)\\
exclusive reference \verb%& mut T%: exclusive borrow of T
interior mutability provides more flexibility for shared data that needs mutation
\verb%Cell / Atomic%: replace value, no borrow\\
\verb%RefCell / Mutex%: runtime borrowing; book-keeping cost for existing borrows; fallible at runtime
\vfill\null
\columnbreak
\section{Memory Ordering}
defining happens-before relations across threads
concurrent non-atomic stores to the same variable cause a data race $\implies$ UB
lack of globally consistent order
thread spawn/join: automatically enforces happens-before relation
note: the current theoretical model for formalizing memory ordering has known problems: cyclic reasoning / out-of-thin-air values
\subsubsection{Relaxed Ordering}
\begin{itemize}
\item per atomic variable: a total modification order in every run of the program $\implies$ all modifications of the said atomic variable happen in 1 order that is consistent/same from views of every thread
\item multiple possible orderings may exist when the program is run multiple times, but each run satisfies a total modification order
\item no happens-before relation
\end{itemize}
\subsubsection{Release-Acquire Ordering Pair}
pairing:\\
store operation specified with release semantics\\
load operation specified with acquire semantics
happens-before relation formed at runtime when the acquire load reads the released value: all memory operations before the release store are observable at and after the acquire load
release store of an atomic variable may be modified by any number of fetch-modify / compare-exchange operations and still have a happens-before relation with an acquire load afterwards on the said atomic variable
a plain (non-read-modify-write) store to the atomic variable breaks the chain of the release sequence (the chain that starts with a release store and is possibly followed by fetch-modifies / compare-exchanges)
use of non-atomic variable in different threads and borrow checker $\implies$ may need unsafe blocks
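a minimal illustrative sketch of the release-acquire pairing described above; the relaxed \verb%DATA% store is made visible by the pairing on \verb%READY%:
\begin{verbatim}
use std::sync::atomic::{AtomicBool, AtomicU64};
use std::sync::atomic::Ordering::{Acquire, Relaxed, Release};
use std::thread;

static DATA: AtomicU64 = AtomicU64::new(0);
static READY: AtomicBool = AtomicBool::new(false);

fn main() {
    thread::spawn(|| {
        DATA.store(123, Relaxed);
        READY.store(true, Release); // everything before this store ...
    });
    while !READY.load(Acquire) {}   // ... is visible once this sees true
    assert_eq!(DATA.load(Relaxed), 123);
}
\end{verbatim}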
\subsubsection{Release-Consume Ordering Pair}
pairing:\\
store operation specified with release semantics\\
load operation specified with consume semantics
happens-before relation for associated atomic variable in the release store and the dependent expressions in the consumer thread
practically, dependency chains are hard to define, and implementations tend to fall back to acquire semantics instead
\subsubsection{Sequentially Consistent Ordering}
pairing:\\
store operation specified with SeqCst semantics\\
load operation specified with SeqCst semantics
guarantees of:
\begin{itemize}
\item acquire ordering
\item release ordering
\item globally consistent ordering of all SeqCst operations (every SeqCst operation in a program is a part of a single total order that all threads agree on)
\end{itemize}
can replace acquire and release ordering and maintain happens-before relation
\subsection{Memory Fence}
separate memory ordering semantics from atomic operations
it can take the place of acquire / release / other memory ordering operations
types of fences:
\begin{itemize}
\item release fence
\item acquire fence
\item acquire-release fence
\item sequentially consistent fence
\end{itemize}
\subsubsection{Practical Replacement}
\begin{tabular}{| c | p{50mm} |}
\hline
\textbf{without fences} & \textbf{with fences} \\
\hline
release store & fence with release ordering \newline ... \newline atomic store (any memory ordering) \\
\hline
acquire load & atomic load (any memory ordering) \newline ... \newline fence with acquire ordering \\
\hline
\end{tabular}
if an atomic store placed after the release fence is observed by an atomic load placed before the acquire fence $\implies$ a happens-before relation is established between the release-acquire fence pairing
\subsubsection{Practical Usages}
\begin{itemize}
\item can be used for multiple variables at once
\item conditional fence (apply happens-before relation only after certain condition is met)\\
eg: place the acquire fence only in the conditional branch that is taken when the load of the atomic variable returns the relevant value
\begin{verbatim}
let p = var.load(Relaxed);
if p == ... {
    fence(Acquire);
    do_something(...);
}
\end{verbatim}
\item may be more efficient if the comparison on the atomic variable is expected to fail often (load the atomic variable with relaxed memory ordering, and only pay for the acquire fence on success)
\end{itemize}
\subsubsection{SeqCst Fence}
\begin{itemize}
\item is both a release fence and an acquire fence
\item is part of a single total order of sequentially consistent operations
\end{itemize}
\subsection{Compiler Fence}
does not prevent processor from reordering instructions
Rust compiler fence: \verb%std::sync::atomic::compiler_fence%
uses:
\begin{itemize}
\item process-wide memory barriers
\item special cases of signal handler/interrupt
\end{itemize}
\subsection{FAQs}
memory model is not related to timing
memory model defines order of operations and affects instruction reordering
SeqCst implies the operation depends on the single total order of every SeqCst operation in the program\\
$\implies$ usually an overly strong claim\\
$\implies$ more relaxed constraints may be easier to review (eg: release-acquire pairs)
a release store does not form a happens-before relation with a SeqCst store; to be part of the globally consistent order, both operations need to be SeqCst
\subsection{Summary}
each atomic variable has its own total modification order that all threads agree on
within a single thread: happens-before relations exist between every pair of operations (program order)
unlocking a mutex happens-before locking that mutex
SeqCst results in 1 globally consistent order over all operations that participate in SeqCst, but it is usually overly constraining
fences allow combining memory ordering of multiple operations for efficiency or applying conditional memory ordering for efficiency
happens-before relation exist when:
\begin{itemize}
\item threads spawn / join
\item acquire load from a release store on an atomic variable
\item fetch-modifies / compare-exchanges in between a release-acquire pair on an atomic variable are still valid for that happens-before relation
\end{itemize}
\vfill\null
\columnbreak
\section{Processor}
atomic operation compiles to machine instructions
memory ordering at lowest level of individual instructions
tool for assembly code: \verb%cargo-show-asm%
atomic operations that cannot be expressed as 1 processor instruction use an equivalent implementation composed of multiple instructions (eg: \verb%cmpxchg% in a loop)
\subsection{x86-64}
compare-exchange and compare-exchange-weak have no difference: compile to lock cmpxchg instruction
x86 lock prefix as a modifier for instructions:\\
\verb%add, sub, and, not, or, xor, xchg(implicit),%\\
\verb%xadd, bts, btr, btc%
\subsection{RISC}
\subsubsection{Load-Linked/Store-Condition (LL/SC)}
used in a pair on a memory address
LL: returns current value of a memory address, processor remembers the address internally
SC: conditionally store a new value to that memory address after the previous LL if no updates occurred since the LL
\subsubsection{ARM64 LL/SC Pair}
\begin{itemize}
\item \verb%ldxr% (load exclusive register)
\item \verb%stxr% (store exclusive register)
\item \verb%clrex% (clear exclusive): stop tracking writes to memory $\implies$ subsequent store conditional will fail
\end{itemize}
typically used in a loop with a branching compare to retry in order to ensure LL/SC succeeds
for efficient implementation:
\begin{itemize}
\item false negatives can happen during store-conditional (eg: \verb%stxr%): a whole chunk of memory is tracked, usually 64 bytes to 1 kB
\item 1 memory address per core can be tracked at a time
\end{itemize}
\subsubsection{ARMv8.1 Atomic Instructions}
\begin{itemize}
\item \verb%cas% (equivalent to compare-exchange)
\item \verb%fetch-*% instructions
\end{itemize}
\subsection{Cache Coherence Protocol for Consistency Between Processor Caches}
eg: MESI, MOESI, MESIF
keep states for individual cache levels
use \verb%std::hint::black_box(..)% to disable compiler optimization when doing performance measurement
measurable slowdown when writes require exclusive access to a cache line at the same time as other cores are trying to load from the same cache line
add padding to separate variables in cache lines (to reduce false sharing)\\
eg: \verb%#[repr(align(64))]% for 64 byte alignment (note: must be power of 2, cannot be mixed with packed repr)
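a minimal illustrative sketch of cache-line padding via alignment:
\begin{verbatim}
use std::sync::atomic::AtomicU64;

// each counter gets its own 64-byte-aligned cache line,
// so cores updating different counters avoid false sharing
#[repr(align(64))]
struct PaddedCounter(AtomicU64);

static COUNTERS: [PaddedCounter; 2] = [
    PaddedCounter(AtomicU64::new(0)),
    PaddedCounter(AtomicU64::new(0)),
];
\end{verbatim}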
pack variables close together if they are expected to be accessed close together in time
out of order instructions/effects:
\begin{itemize}
\item store buffers for writing back to cache: brief moment of inconsistency present when write ops are not yet visible to other cores
\item invalidation queues: invalidation requests (dropping cache line) queued for later processing\\
$\implies$ inconsistency (outdated before they are dropped)\\
$\implies$ visibility of write ops from other cores slightly delayed
\item pipelining: parallel instructions: computation on some memory finishes before preceding instructions $\implies$ interaction with other cores (may appear out of order)
\item special instructions exist to prevent these above
\end{itemize}
\subsection{Memory Ordering}
x86-64 and ARM64 are other-multi-copy atomic architectures\\
$\implies$ write op visible to any core $\implies$ visible to all cores at same time\\
$\implies$ memory ordering reduces to a matter of instruction reordering
x86-64 has fairly strong ordering restrictions:
\begin{itemize}
\item release and acquire semantics have same cost as relaxed memory ordering
\item store doesn't get reordered earlier than a preceding memory operation
\item load doesn't get reordered later than a following memory operation
\item $\implies$ (acquire $\iff$ relaxed, release $\iff$ relaxed)
\end{itemize}
ARM64 has relatively weak ordering: all memory ops can be reordered
\begin{itemize}
\item acquire and release versions of loads and stores:
\begin{itemize}
\item \verb%stlr% (store-release register)
\item \verb%ldar% (load-acquire register)
\item \verb%stlxr%(store-release exclusive register)
\item \verb%ldaxr% (load-acquire exclusive register)
\end{itemize}
\item none of the special acquire and release instructions is reordered with any other of these special instructions
$\iff$ acquire-release operations are same as sequentially consistent operations (Acquire / Release / AcqRel has same cost as SeqCst)
\end{itemize}
\subsection{Memory Fence / Barrier Instructions}
prevents certain instructions from being reordered past it
fences pose stronger constraint than release-acquire operations directly on atomic variables $\implies$ release-acquire atomic ops can be logically replaced with fences but the reverse is not true
fences are not themselves load or store operations
Release Fence:
\begin{itemize}
\item usually used with atomic store after the release fence
\item 1 way fence: memory operations before release fence cannot be reordered down past ALL subsequent memory writes after the fence (contrast with release store on atomic variable: memory operations before release op cannot be reordered down past the release op itself)
\end{itemize}
Acquire Fence:
\begin{itemize}
\item usually used with atomic load before the acquire fence
\item 1 way fence: memory operations after acquire fence cannot be reordered up before ALL previous memory reads before the fence
\item happens-before relation is established when load of the atomic variable reads any expected value that is caused by side-effects of the release sequence (eg: a release store on an atomic variable, or release fence followed by relaxed atomic store)
\end{itemize}
SeqCst Fence:
\begin{itemize}
\item both a release and an acquire fence
\end{itemize}
on x86-64:
\begin{itemize}
\item Acquire and Release are same as relaxed memory ordering (acquire/release fences are elided)
\item SeqCst: issues an \verb%mfence% instruction
\end{itemize}
on ARM64:
\begin{itemize}
\item Acquire: \verb%dmb ishld%
\item Release: \verb%dmb ish%
\item AcqRel: \verb%dmb ish%
\item SeqCst: \verb%dmb ish% (same cost as acquire and release)
\end{itemize}
\vfill\null
\columnbreak
\section{OS Primitives}
futex used as a basic primitive that is optimized to avoid frequent syscall
originally syscall in linux but available in supporting libraries on other OSes (can use syscall from libc crate)
use of an atomic variable for thread notifications and wakeups
fast path in userspace when non-blocking, resort to slower syscall when blocking is required
spurious thread wakeups possible, thus need a condition check
solution to missing wakeup:
\begin{itemize}
\item use of another value:\\
expected value match atomic variable $\implies$ blocking wait\\
otherwise $\implies$ non-blocking
\item wake is atomic with respect to wait:\\
change atomic variable's value before wake\\
$\implies$ a thread that is about to wait sees the changed value and skips the wait instead of blocking, so the wakeup having no effect is harmless
\item check of expected value and wait/block (of \verb%futex_wait%) happens as a single atomic operation wrt. other futex operations
\end{itemize}
priority inversion problem:
\begin{itemize}
\item high priority thread blocked by another lower priority thread with lock held
\item solution: allow priority inheritance temporarily when this situation occurs
\item see \verb%FUTEX_<OP>_PI% operations
\end{itemize}
\subsection{Implementation Variants on Other OS}
Windows:
\begin{itemize}
\item heavy weight objects
\item \verb%CRITICAL_SECTION% objects
\item slim reader-writer lock
\item address based waiting (\verb%Wait/WakeByAddress%)
\end{itemize}
MacOS:
\begin{itemize}
\item \verb%libc, libc++, obj-c, swift% interface: pthread impl
\item platform specific lightweight \verb%os_unfair_lock%, limitations: no cond var, no reader-writer variant
\end{itemize}
\subsection{Standards for Accessing Kernel Scheduler via Special Libs or Syscalls}
POSIX for Unix based systems
pthreads extensions: provide support for threading, data types, functions for concurrency
\subsubsection{Concerns wrt. Movable/Non-Movable Types}
pthread structures are generally non-movable types due to self references\\
possible workarounds:\\
\verb%std::pin%\\
\verb%Box<..>%: issue with leaking/forgetting an object: \verb%pthread_mutex_destroy% on locked mutex may result in undefined behaviour as per spec.\\
\subsection{Futex as an Efficient Mutex}
simple futex-like addition to \verb%C++% standard:\\
\verb%std::atomic_wait%\\
\verb%std::atomic_notify_one% / \verb%std::atomic_notify_all%
originally added to Linux systems: \verb%SYS_futex% syscall: use of 32 bit atomic variable address to notify threads when to wake up
solution for the missing-wakeup problem: the check of the expected value passed to wait and the act of going to sleep are one atomic op $\implies$ a wake arriving in between cannot be missed
manage state in userspace if possible, only rely on slower code path (via syscall) when absolutely necessary (need for a block)
usually wait used in a loop to check condition of possible spurious wakeups
futex related op arguments:
\begin{itemize}
\item 32 bit atomic pointer
\item op constant
\item optional flags, eg: \verb%FUTEX_PRIVATE, FUTEX_CLOCK_REALTIME%
\item remaining arguments dependent on the op
\end{itemize}
futex ops:
\begin{itemize}
\item \verb%FUTEX_WAIT%: check and block is atomic
\item \verb%FUTEX_WAKE%: provide max number of threads to wake up
\item \verb%FUTEX_WAIT_BITSET%: wake only for bits set in common from a corresponding \verb%FUTEX_WAKE_BITSET% op
\item \verb%FUTEX_WAKE_BITSET%
\item \verb%FUTEX_REQUEUE%
\item \verb%FUTEX_CMP_REQUEUE%
\item \verb%FUTEX_WAKE_OP%
\item \verb%FUTEX_PRIVATE_FLAG%
\end{itemize}
\vfill\null
\columnbreak
\section{Primitive Implementation Examples}
\subsection{Spin Lock}
release store (unlock) and acquire load (lock) pair for prevention of data races (UB)
\verb%std::hint::spin_loop()%: hint that lets the processor optimize the spin-wait loop
possible implementation (condensed sketch after this list):
\begin{itemize}
\item wraps actual data inside an \verb%UnsafeCell<T>% for interior mutability
\item requires \verb%T: Send%
\item locking provides exclusive access (and provides \verb%Sync% trait)
\item uses \verb%unsafe% blocks in function, user will not have to use \verb%unsafe%
\item use lock guard (representing safe access to locked data) pattern to manage lifetime of locked access to protected data:
\begin{itemize}
\item implement \verb%Deref%, \verb%DerefMut% to access data for user ergonomics (behave similar to reference)
\item implement \verb%Drop%: automatic release store (unlocking) when lifetime of the guard ends
\item manual drop also possible: this consumes and ends the valid lifetime of the guard and hence access to data at compile time $\implies$ any further reference and borrow to guard is invalid and will be flagged as error by the compiler
\end{itemize}
\end{itemize}
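a condensed illustrative sketch along the lines described above, assuming \verb%T: Send%:
\begin{verbatim}
use std::cell::UnsafeCell;
use std::ops::{Deref, DerefMut};
use std::sync::atomic::{AtomicBool, Ordering::{Acquire, Release}};

pub struct SpinLock<T> {
    locked: AtomicBool,
    value: UnsafeCell<T>,
}

// locking provides exclusive access, so sharing is safe for T: Send
unsafe impl<T: Send> Sync for SpinLock<T> {}

pub struct Guard<'a, T> {
    lock: &'a SpinLock<T>,
}

impl<T> SpinLock<T> {
    pub const fn new(value: T) -> Self {
        Self {
            locked: AtomicBool::new(false),
            value: UnsafeCell::new(value),
        }
    }
    pub fn lock(&self) -> Guard<T> {
        while self.locked.swap(true, Acquire) {
            std::hint::spin_loop();
        }
        Guard { lock: self }
    }
}

impl<T> Deref for Guard<'_, T> {
    type Target = T;
    fn deref(&self) -> &T {
        // the guard is proof of exclusive access
        unsafe { &*self.lock.value.get() }
    }
}
impl<T> DerefMut for Guard<'_, T> {
    fn deref_mut(&mut self) -> &mut T {
        unsafe { &mut *self.lock.value.get() }
    }
}
impl<T> Drop for Guard<'_, T> {
    fn drop(&mut self) {
        self.lock.locked.store(false, Release); // unlock
    }
}
\end{verbatim}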
\subsection{Channels}
\subsubsection{One Shot Channel}
\begin{itemize}
\item 1 message only from one thread to another thread
\item \verb%T: Send%
\item use of \verb%unsafe%:
\begin{itemize}
\item message may be uninitialized
\item non-copy data must not be duplicated
\item manual content drop may be necessary: leaking/forgetting is safe but sometimes undesirable
\item eg: \verb%std::mem::MaybeUninit<T>% (unsafe version of \verb%Option<T>%) for efficiency where user tracks its initialized status
\item use \verb%UnsafeCell%'s interior mutability for sharing
\item wrapping shared struct \verb%Channel% requires \verb%T: Send% and gives \verb%Sync% in return
\end{itemize}
\item use of atomic swaps for setting one time flags
\item encoding of multiple states in one word and atomic compare-exchanges
\item possible use of runtime check/panic instead of letting UB happen
\item use type checking from compiler to avoid errors: move/consume to avoid unwanted reuse of resources:
use of non-Copy type and pass by value / consume by called function $\implies$ prevent caller from using that object again at compile time (also elides some runtime checks)
\begin{itemize}
\item \verb%TX-RX% pair for message passing, \verb%Channel% shared inside their private implementations
\item use of \verb%Arc<T>% for sharing of allocation and resource dropping: \verb%RX% drop and \verb%TX% drop $\implies$ \verb%Arc<T>% drop $\implies$ \verb%T% drop
\item \verb%Arc<T>% incurs extra runtime overhead for allocation
\end{itemize}
\item allocation optimization
\begin{itemize}
\item borrowing instead of memory allocation (\verb%Arc%)
\item use of lifetimes and mutable borrow for compile time checks
\item \verb%Channel% explicitly created by user ahead of time and passed in to \verb%RX% and \verb%TX% as references upon construction in the \verb%split% method
\item \verb%TX, RX% take in additional lifetime parameter which is the same as the lifetime of the borrowed \verb%Channel%: when \verb%TX% or \verb%RX% is present, existing \verb%Channel% cannot be mutably borrowed again until \verb%TX% and \verb%RX% are both dropped
\item \verb%Channel% needs to outlive \verb%TX% and \verb%RX% for compiler check to pass
\item \verb%Channel% resets its contents on entry to \verb%split% method in case it is used multiple times
\end{itemize}
\item overwriting content for fresh initialization (when calling \verb%split%): after 1st borrow expires, subsequent borrows are made on these newly created resources
\item blocking interface:
\begin{itemize}
\item make \verb%RX% object not \verb%Send%, such as using a \verb%PhantomData<*const ()>% member field so that auto trait deduction for the wrapping struct is propagated to be \verb%!Send%:\\
\verb%*const (): !Send% $\implies$\\
\verb%PhantomData<*const ()>: !Send% $\implies$\\
wrapping struct is \verb%!Send%
\item \verb%RX% stays on the same thread; \verb%TX% is allowed to cross thread boundaries
\item use receiving thread's handle to invoke waking up a blocked thread (place this inside the sender struct)
\item call unpark on the receiving thread's handle after the release store operation is performed by the sender
\item receiver checks the \verb%Channel%'s flag to guard against spurious wakeups
\end{itemize}
\end{itemize}
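a condensed illustrative sketch of the non-blocking one-shot core described above: an atomic ready flag plus \verb%MaybeUninit% storage, with misuse caught by runtime panics instead of UB:
\begin{verbatim}
use std::cell::UnsafeCell;
use std::mem::MaybeUninit;
use std::sync::atomic::{AtomicBool, Ordering::{Acquire, Relaxed, Release}};

pub struct Channel<T> {
    message: UnsafeCell<MaybeUninit<T>>,
    ready: AtomicBool,
}

// requires T: Send, gives Sync in return
unsafe impl<T: Send> Sync for Channel<T> {}

impl<T> Channel<T> {
    pub const fn new() -> Self {
        Self {
            message: UnsafeCell::new(MaybeUninit::uninit()),
            ready: AtomicBool::new(false),
        }
    }
    /// Safety: call send at most once.
    pub unsafe fn send(&self, message: T) {
        (*self.message.get()).write(message);
        self.ready.store(true, Release);
    }
    pub fn is_ready(&self) -> bool {
        self.ready.load(Relaxed)
    }
    /// Panics if no message is ready; call at most once.
    pub fn receive(&self) -> T {
        if !self.ready.swap(false, Acquire) {
            panic!("no message available");
        }
        unsafe { (*self.message.get()).assume_init_read() }
    }
}
\end{verbatim}
note: this sketch leaks an unreceived message on drop; a \verb%Drop% impl that checks the flag fixes that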
\vfill\null
\columnbreak
\subsection{Arc}
basic implementation: use a pointer type to the shared underlying data via \verb%std::ptr::NonNull%, use counter info for the shared data; use \verb%NonNull::from(Box::leak(Box::new(..)))% to get a pointer from initial allocation
implement ergonomic methods \verb%deref% and \verb%deref_mut% from traits: \verb%Deref%, \verb%DerefMut%
let \verb%clone% increment the internal counter and return a new pointer to the shared data
requires \verb%T: Send + Sync% and gives wrapping \verb%Arc<T>% \verb%Send + Sync%
auto traits \verb%Send% / \verb%Sync% are not implemented for raw pointer types (including \verb%NonNull%), so they must be provided manually via \verb%unsafe impl%
cloning corresponds to incrementing counter and giving a shared reference to underlying data:
\begin{verbatim}
impl<T> Clone for Arc<T> {
fn clone(&self) -> Self {
self.data().ref_count.fetch_add(1, Relaxed);
Arc {
ptr: self.ptr,
}
}
}
\end{verbatim}
only the final decrement needs acquire in addition to release ordering, while all other decrements need only release:
\begin{verbatim}
impl<T> Drop for Arc<T> {
    fn drop(&mut self) {
        if self.data().ref_count.fetch_sub(1, Release) == 1 {
            fence(Acquire); // conditional acquire
            // drop logic
            unsafe {
                drop(Box::from_raw(self.ptr.as_ptr()));
            }
        }
    }
}
\end{verbatim}
exclusive access to shared data, conditionally in a runtime branch, eg:
\begin{verbatim}
pub fn get_mut(arc: &mut Self) -> Option<&mut T> {
    if arc.data().ref_count.load(Relaxed) == 1 {
        fence(Acquire); // conditional acquire
        // gained exclusive access now
        unsafe { Some(&mut arc.ptr.as_mut().data) }
    } else {
        None
    }
}
\end{verbatim}
Miri interpreter for simulation and verification of unsafe code
weak version of \verb%Arc<T>%: \verb%Weak<T>%
\begin{itemize}
\item \verb%T% can be shared between \verb%Arc<T>% and \verb%Weak<T>%
\item \verb%Weak<T>% does not prevent drop of \verb%T%, eg: all \verb%Arc<T>% dropped $\implies$ \verb%T% dropped
\item \verb%Weak<T>% can exist without \verb%T% still being alive, and can provide conditional access to a \verb%& T%-like object:\\
implement an upgrade function to get \verb%Option<Arc<T>>%, which is \verb%None% if \verb%T% has already been dropped
\end{itemize}
cycle breaking: use 2 counters:
\begin{itemize}
\item strong pointer count (\verb%data_ref_count%)
\item weak pointer + strong pointer count (\verb%alloc_ref_count%)
\end{itemize}
wrapping struct \verb%ArcData<T>%:
\begin{itemize}
\item use interior mutability of an optional
\item keep extra info of shared counters referencing its data
\end{itemize}
drop implementation of weak pointer and strong pointer:
\begin{itemize}
\item strong pointer count is 0 $\implies$ drop \verb%T%
\item weak pointer + strong pointer count dropping to 0 $\implies$ dropping \verb%ArcData<T>%
\end{itemize}
Rust drop order:
\begin{itemize}
\item run \verb%Drop::drop% on object
\item drop the object's fields 1 by 1 recursively
\end{itemize}
cloning pointers:
\begin{itemize}
\item \verb%Arc%: increment weak counter, increment strong counter
\item \verb%Weak%: increment weak counter
\end{itemize}
dereferencing:
\begin{itemize}
\item \verb%Arc%: unconditionally dereference since existence of \verb%Arc% implies underlying \verb%T% is valid
\item \verb%Weak%: upgrade to strong pointer
\begin{itemize}
\item atomic increment and compare swap on strong counter to give out access
\item upgrade and return an \verb%Arc% to caller
\item if strong pointer counter is 0 $\implies$ data doesn't exist and abort operation
\end{itemize}
\end{itemize}
strong pointer access to mutate data: runtime conditional check to allow exclusive access to \verb%T%
\begin{itemize}
\item check weak pointer counter is 0, strong pointer count is 1
\item cast the underlying data and return \verb%& mut T%; safe since the \verb%Arc% exists
\end{itemize}
convert from strong pointer to weak pointer (downgrade): call clone on weak pointer and return the weak pointer
possible optimization: touch only 1 atomic counter instead of 2 when cloning / dropping an \verb%Arc% $\implies$
\begin{itemize}
\item if user is not using weak pointers then they don't have to pay the cost when cloning /dropping
\item use \verb%ManuallyDrop<T>% instead of \verb%Option<T>% to save an extra state and use existence of \verb%Arc<T>% to know if data is gone or not
\item let 1 weak count represent all existing \verb%Arc<T>%s combined; when the last \verb%Arc<T>% is dropped, decrement that 1 weak count on behalf of all of them
\item downgrade and \verb%get_mut% require more changes:
\item \verb%get_mut%
\begin{itemize}
\item need to check 2 atomic counters
\item temporarily lock downgrade operation by use of a special value indicating locked state for weak counter; use compare-exchange on \verb%alloc_ref_count% (weak pointer + strong pointer) variable to replace with special value if the condition applies
\item check if strong pointer count == 1 $\implies$ we have exclusive access to data, replace special value earlier to unlock it (weak pointer + strong pointer count) and return \verb%& mut T%
\end{itemize}
\item downgrade
\begin{itemize}
\item check that the special value for \verb%alloc_ref_count% (weak pointer + strong pointer counter) is not present, otherwise loop
\item compare-exchange (acquire), paired with the \verb%get_mut% method, on the \verb%alloc_ref_count% atomic variable: increment on success; this makes future \verb%get_mut% calls fail until \verb%Weak::drop% brings \verb%alloc_ref_count% back down via release memory ordering
\end{itemize}
\end{itemize}
\subsection{Locks}
atomic-wait crate for providing cross platform interface for futex-like syscall:
\begin{itemize}
\item \verb%wait(&AtomicU32, u32)%
\item \verb%wake_one(&AtomicU32)%
\item \verb%wake_all(&AtomicU32)%
\end{itemize}
platform implementation:
\begin{itemize}
\item Linux: \verb%futex% syscall
\item Windows: \verb%WaitOnAddress%
\item FreeBSD: \verb%_umtx_op%
\item MacOS: \verb%libc++%
\end{itemize}
\subsubsection{Constructing Mutex}
use an atomic variable for the futex-like syscall (invoke wait when locking needs to block):
\begin{verbatim}
struct Mutex<T> {
    state: AtomicU32, // 0: unlocked, 1: locked
    value: UnsafeCell<T>,
}
struct MutexGuard<'a, T> {
    mutex: &'a Mutex<T>,
}
// Produce a guard as proof of exclusive access.
//
// Add Deref and DerefMut traits for data access,
// behaving like &T / &mut T.
fn lock(&self) -> MutexGuard<T> {
    while self.state.swap(1, Acquire) == 1 {
        wait(&self.state, 1); // blocks while state == 1
    }
    MutexGuard { mutex: self }
}
\end{verbatim}
use drop of guard to unlock mutex by invoking wake after setting state to unlocked value: