\documentclass[8pt]{extarticle}
%% \usepackage[fleqn]{amsmath}
\usepackage[margin=1in]{geometry}
\usepackage{amsmath,amsfonts,amsthm,bm}
\usepackage{breqn}
\usepackage{amsmath}
\usepackage{mathtools}
\usepackage{amssymb}
\usepackage{bbm}
\usepackage{tikz}
\usepackage[ruled,vlined,linesnumbered,lined,boxed,commentsnumbered]{algorithm2e}
\usepackage{siunitx}
\usepackage{graphicx}
\usepackage{subcaption}
%% \usepackage{datetime}
\usepackage{multirow}
\usepackage{multicol}
\usepackage{mathrsfs}
\usepackage{fancyhdr}
\usepackage{fancyvrb}
\usepackage{parskip} %turns off paragraph indent
\pagestyle{fancy}
\usepackage{xcolor}
\usepackage{mdframed}
\usepackage[small]{titlesec}
\usepackage{hanging}
\usetikzlibrary{arrows}
\DeclareMathOperator*{\argmin}{argmin}
\newcommand*{\argminl}{\argmin\limits}
\newcommand{\mathleft}{\@fleqntrue\@mathmargin0pt}
\newcommand{\R}{\mathbb{R}}
\newcommand{\Z}{\mathbb{Z}}
\newcommand{\N}{\mathbb{N}}
\newcommand{\ppartial}[2]{\frac{\partial #1}{\partial #2}}
\newcommand{\p}{\partial}
\newcommand{\te}[1]{\text{#1 }}
\newcommand{\norm}[1]{\|#1\|}
\setcounter{MaxMatrixCols}{20}
% remove excess vertical space for align equations
\setlength{\abovedisplayskip}{0pt}
\setlength{\belowdisplayskip}{0pt}
\setlength{\abovedisplayshortskip}{0pt}
\setlength{\belowdisplayshortskip}{0pt}
\newtheorem{theorem}{Theorem}[section]
\newtheorem{corollary}{Corollary}[theorem]
\newtheorem{lemma}[theorem]{Lemma}
% \newtheorem{mdtheorem}{Theorem}
% \newenvironment{theorem}
% {\begin{mdframed}[
% backgroundcolor=green!10,
% topline=false,
% rightline=false,
% bottomline=false,
% leftline=false
% ]\begin{mdtheorem}}
% {\end{mdtheorem}\end{mdframed}}
\begin{document}
\lhead{Notes - Atomics \& Multithreaded Programming, Yuan Liu}
% \begin{align}
% \nabla f_{k+1}^T p_k & \geq c_2 \nabla f_k^T p_k\\
% \frac{\partial \phi(\alpha_k)}{\partial \alpha_k} & \geq \frac{\partial \phi(0)}{\partial \alpha_k}\\
% &c_1,\alpha_k \in (0,1)\\
% &0<c_1<c_2<1
% \end{align}
\begin{multicols*}{2}
Gathered notes from:
\begin{itemize}
\item Rust Atomics and Locks by Mara Bos \cite{rustatomicsbook}
\end{itemize}
\section{Basic Concurrency Primitives Overview}
topics: interior mutability, threadsafety, runtime borrow check
\subsection{Single Thread Interior Mutability}
RefCell: borrow at runtime\\
Cell: value replacement (get requires Copy), not borrow
\subsection{Threadsafe Interior Mutability}
Mutex: exclusive borrow at runtime\\
RwLock: differentiates borrow type: exclusive vs. shared read only\\
Atomics: value replacement, not borrow; limited to word size\\
UnsafeCell: express raw pointer to wrapped data via unsafe block; in practice wrapped by a safer interface to user\\
Traits for threadsafety: Send, Sync\\
T: Send $\iff$ T can be transferred to another thread\\
T: Sync $\iff$ T can be shared with $>1$ threads; \&T: Send\\
all primitive types are Send + Sync\\
auto traits:
\begin{itemize}
\item automatically opt-in
\item manually opt-out
\item recursively deduced on fields of structs
\end{itemize}
un-implemented trait (\verb%!Trait%) in some types:
\begin{verbatim}
Cell<T>: Send + !Sync where T: Send
*const T / *mut T: !Send + !Sync
Rc<T>: !Send + !Sync
std::marker::PhantomData<T>: !Send / !Sync where T: !Send / !Sync
\end{verbatim}
force opt-in for un-implemented type:
\begin{verbatim}
unsafe impl Send/Sync for T {}
\end{verbatim}
\vfill\null
\columnbreak
\subsection{Mutex}
T is usually Send (not required), in which case the Mutex is Sync:\\
T: Send $\implies$ \verb%Mutex<T>: Sync%
logical states: unlocked, locked
owning wrapper over T
interface makes access to T safer
MutexGuard as proof of exclusive access; drop automatically triggers unlock at the end of its lifetime
efficient usage: make locked interval as short as possible
lock poisoning: when thread panics while holding the lock
\begin{itemize}
\item the lock is released and the mutex is marked poisoned
\item subsequent \verb%lock()% calls return an error
\item the error returned by a poisoned mutex still carries the locked MutexGuard so the user can restore the data to a consistent state
\end{itemize}
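a minimal usage sketch (illustrative, not from the book) of the guard pattern, keeping the locked interval short:
\begin{verbatim}
use std::sync::Mutex;

fn main() {
    let counter = Mutex::new(0);
    {
        // MutexGuard = proof of exclusive access;
        // unlock happens automatically when it is dropped
        let mut guard = counter.lock().unwrap();
        *guard += 1;
    } // locked interval ends here
    println!("{}", counter.lock().unwrap());
}
\end{verbatim}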
unnamed guard may not be immediately dropped in certain statements:
\begin{verbatim}
if ... /*dropped here; only boolean value needed*/ {
...
}
\end{verbatim}
\begin{verbatim}
if let ... = ... {
... /*dropped here; may borrow from let expression*/
}
\end{verbatim}
\subsection{ReaderWriterLock}
requires T: Send + Sync:\\
T: Send + Sync $\implies$ \verb%RwLock<T>: Send + Sync%
logical states: unlocked, locked by 1 exclusive accessor, locked by any number of shared readers
differentiating lock guards:\\
read() $\implies$ RwLockReadGuard: Deref\\
write() $\implies$ RwLockWriteGuard: DerefMut
writer starvation issue to consider for fairness of access
\subsection{Park/Unpark}
park: current thread put itself to sleep\\
unpark: another thread wakes the sleeping thread; needs the handle of the sleeping thread, obtained from \verb%spawn()% or \verb%thread::current()%
spurious wakeups are possible $\implies$ user provides a check upon wakeup
request to unpark recorded if unpark happens before park in order to avoid lost notification, but does not stack up (max of 1 unpark recorded)
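a minimal illustrative sketch: the waiting thread re-checks a flag because park may return spuriously and at most 1 unpark is recorded:
\begin{verbatim}
use std::sync::atomic::{AtomicBool, Ordering};
use std::thread;

static READY: AtomicBool = AtomicBool::new(false);

fn main() {
    let waiter = thread::spawn(|| {
        // re-check on every wakeup (spurious wakeups possible)
        while !READY.load(Ordering::Acquire) {
            thread::park();
        }
    });
    READY.store(true, Ordering::Release);
    waiter.thread().unpark(); // handle from spawn()
    waiter.join().unwrap();
}
\end{verbatim}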
\vfill\null
\columnbreak
\subsection{Condition Variable}
signaling events related to protected data of mutex
methods: \verb%wait%, \verb%notify_*%
atomically unlock mutex and start waiting (to avoid lost notification)
\subsubsection{Communication}
waiting thread:
\begin{enumerate}
\item takes MutexGuard as input
\item unlocks mutex
\item thread put to sleep
\item thread wakes (via a notify of CondVar by another thread or spurious wakeup)
\item relocks mutex and returns MutexGuard
\end{enumerate}
notifying thread:
\begin{enumerate}
\item invoke notify on CondVar
\end{enumerate}
\subsubsection{Spurious Wakeup}
need additional memory to check actual event: can add this along with the original value wrapped by mutex\\
use a loop with wait to put thread back to sleep if condition not met
usage: 1 CondVar for 1 mutex
optionally can wait with timeout parameter to unconditionally wakeup thread after timeout
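a minimal illustrative sketch of the wait loop that guards against spurious wakeups, with 1 CondVar paired with 1 mutex:
\begin{verbatim}
use std::sync::{Condvar, Mutex};
use std::thread;

fn main() {
    let pair = (Mutex::new(false), Condvar::new());
    thread::scope(|s| {
        s.spawn(|| {
            *pair.0.lock().unwrap() = true; // event data under the mutex
            pair.1.notify_one();
        });
        let mut ready = pair.0.lock().unwrap();
        while !*ready { // loop: a spurious wakeup puts us back to sleep
            ready = pair.1.wait(ready).unwrap();
        }
    });
}
\end{verbatim}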
\subsection{Comparison of Interior Mutability Primitives}
\begin{tabular}{| c | c | c |}
\hline
& value replacement & reference / borrow \\
\hline
1 thread & Cell & RefCell \\
\hline
threadsafe & Atomic & Mutex/RwLock \\
\hline
\end{tabular}
\subsection{Comparison of Shared Ownership Primitives}
Rc/Arc: act similar to Box / smart pointer but with dropping logic to take care of deallocation for shared data\\
\begin{tabular}{| c | c |}
\hline
1 thread & Rc \\
\hline
threadsafe & Arc \\
\hline
\end{tabular}
\subsection{Traits for Interior Mutability Primitives}
\verb%T: Send% $\implies$ \verb%Cell<T>: Send + !Sync% (usual practical case)\\
\verb%T: !Send% $\implies$ \verb%Cell<T>: !Send + !Sync%\\
\verb%T: Send% $\implies$ \verb%RefCell<T>: Send + !Sync% (usual)\\
\verb%T: !Send% $\implies$ \verb%RefCell<T>: !Send + !Sync%\\
\verb%T: Send% $\implies$ \verb%Mutex<T>: Send + Sync% (usual)\\
\verb%T: !Send% $\implies$ \verb%Mutex<T>: !Send + !Sync%\\
\verb%T: Send + Sync% $\implies$ \verb%RwLock<T>: Send + Sync% (usual)\\
\verb%T: !Send / !Sync% $\implies$ \verb%RwLock<T>: !Send + !Sync%\\
\subsection{Traits for Shared Ownership Primitives}
\verb%Rc<T>: !Send + !Sync%\\
\verb%T: Send + Sync% $\implies$ \verb%Arc<T>: Send + Sync%\\
\vfill\null
\columnbreak
\subsection{Typical Usage Pattern}
\verb%Arc<Mutex<T>>%\\
where:\\
Arc allows threadsafe immutable sharing\\
Mutex allows interior mutability using references across $\geq1$ threads
\verb%Rc<RefCell<T>>%\\
where:\\
Rc allows single thread immutable sharing\\
RefCell allows interior mutability using references in single thread
\verb%Rc<Cell<T>>%\\
where:\\
Rc allows single thread immutable sharing\\
Cell allows interior mutability using value in single thread
\verb%Arc<Atomic<T>>%\\
where:\\
Arc allows threadsafe immutable sharing\\
Atomic allows interior mutability using value across $\geq1$ threads
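a minimal illustrative sketch of the first pattern, \verb%Arc<Mutex<T>>%:
\begin{verbatim}
use std::sync::{Arc, Mutex};
use std::thread;

fn main() {
    let shared = Arc::new(Mutex::new(Vec::new()));
    let handles: Vec<_> = (0..4)
        .map(|i| {
            let shared = Arc::clone(&shared); // threadsafe sharing
            thread::spawn(move || {
                shared.lock().unwrap().push(i); // interior mutability
            })
        })
        .collect();
    for h in handles {
        h.join().unwrap();
    }
    println!("{:?}", shared.lock().unwrap());
}
\end{verbatim}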
\vfill\null
\columnbreak
\section{Atomics}
operations:
\begin{itemize}
\item \verb%fetch_and_modify%
\item \verb%swap%
\item \verb%compare_exchange%:
\begin{itemize}
\item ABA problem for pointer algorithms
\item weak version exists for more efficient impl. on some hardware, at the expense of spurious failure (may fail even when the comparison would have succeeded)
\end{itemize}
\item \verb%fetch_update% $\iff$ load followed by a loop with \verb%compare_exchange_weak% and a user-provided computation (see the sketch after this list)
\end{itemize}
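a minimal illustrative sketch of the \verb%fetch_update% equivalence, with the user-provided computation being a plain increment:
\begin{verbatim}
use std::sync::atomic::{AtomicU32, Ordering::Relaxed};

// roughly what fetch_update does: load, then retry
// compare_exchange_weak until the update succeeds
fn fetch_increment(a: &AtomicU32) -> u32 {
    let mut current = a.load(Relaxed);
    loop {
        let new = current + 1; // user-provided computation
        match a.compare_exchange_weak(current, new, Relaxed, Relaxed) {
            Ok(prev) => return prev,
            Err(actual) => current = actual, // lost race or spurious failure
        }
    }
}
\end{verbatim}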
\subsection{Scoped Thread}
regular \verb%std::thread::spawn% requires the closure to be \verb%'static% and Send $\implies$ all captures of the closure must be owned (or \verb%'static%) and Send
\verb%std::thread::scope%:
\begin{itemize}
\item can borrow objects with non-\verb%'static% lifetimes, since the borrowed data is guaranteed to outlive the scoped threads
\item mutability rules apply
\item threads are automatically joined at the end of the scope
\end{itemize}
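a minimal illustrative sketch of scoped threads borrowing non-\verb%'static% data:
\begin{verbatim}
use std::thread;

fn main() {
    let mut data = vec![1, 2, 3];
    thread::scope(|s| {
        // shared borrows of local data are fine here
        s.spawn(|| println!("len = {}", data.len()));
        s.spawn(|| println!("first = {}", data[0]));
    }); // threads joined automatically here
    data.push(4); // exclusive access available again
}
\end{verbatim}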
\subsection{Lazy Initialization}
\begin{itemize}
\item execute once by 1 thread, sharable afterwards
\item race possible from threads, but this is different from data race which causes undefined behaviour (UB)
\item can use \verb%CondVar% / thread parking / \verb%std::sync::Once% / \verb%std::sync::OnceLock% to avoid wasted compute from multiple threads
\end{itemize}
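a minimal illustrative sketch using \verb%std::sync::OnceLock% for lazy initialization:
\begin{verbatim}
use std::sync::OnceLock;

static CONFIG: OnceLock<String> = OnceLock::new();

fn config() -> &'static str {
    // the closure runs at most once; concurrent callers block
    // until initialization completes, then share the value
    CONFIG.get_or_init(|| "expensive-to-compute value".to_string())
        .as_str()
}
\end{verbatim}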
\subsection{Move Closure}
\begin{itemize}
\item transfer ownership of value
\item capture variable via copying/moving instead of borrowing
\item to still borrow from a variable, copy a reference to it into the move closure
\item note: Atomic does not implement Copy trait
\end{itemize}
\subsection{Data Sharing Between Threads in General}
data shared need to outlive all involved threads:
\begin{itemize}
\item make data owned by the entire program via \verb%'static% lifetime (a static item exists even before the start of the main program)
\item leak an allocation and promise never to drop it from that point onward in the duration of the entire program:
eg: \verb%Box::leak(Box::new(..))%\\
note: \verb%'static% means the object will exist until the end of the program but may not exist at the start of the program\\
note: Copy $\implies$ when moved, the original value still exists
\item reference counting: track ownership and invoke \verb%drop% when no owners left\\
eg: \verb%std::rc::Rc%: clone increments counter only and gives reference to allocation\\
eg: \verb%std::sync::Arc%: version of \verb%Rc% that is threadsafe\\
\end{itemize}
use of scope and variable shadowing to reuse identifiers when cloning:
\begin{itemize}
\item shadowing: original name is not obtainable anymore in current scope
\item original name still obtainable in the outer scope; it can be cloned again in another inner scope
\end{itemize}
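a minimal illustrative sketch of the scope-and-shadowing idiom for cloning before a move closure:
\begin{verbatim}
use std::sync::Arc;
use std::thread;

fn main() {
    let a = Arc::new([1, 2, 3]);
    let t = thread::spawn({
        let a = a.clone(); // shadowed clone, visible only in this block
        move || println!("{a:?}")
    });
    println!("{a:?}"); // original name still usable out here
    t.join().unwrap();
}
\end{verbatim}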
reference-counted pointers (\verb%Rc% and \verb%Arc%) have the same restrictions as an immutable reference (\verb%&T%)
exclusivity of mutable borrows is guaranteed at compile time $\implies$ mutable aliasing between 2 variables does not occur; this lets the optimizer remove impossible code paths
assumptions held by the compiler:
\begin{itemize}
\item an immutable reference exists $\implies$ no other mutable references to the associated data exist
\item there is at maximum 1 mutable reference to an object at anytime
\end{itemize}
if such assumptions are broken, then UB exists: more wrong conclusions may be propagated through optimizations
\verb%unsafe% blocks are also assumed to be sound by the compiler which means compiler may apply optimizations and elide code when feasible
\subsection{Interior Mutability}
shared reference \verb%& T%: copied and sharable (not mutable)\\
exclusive reference \verb%& mut T%: exclusive borrow of T
interior mutability provides more flexibility for shared data that needs mutation
\verb%Cell / Atomic%: replace value, no borrow\\
\verb%RefCell / Mutex%: runtime borrowing; book-keeping cost for existing borrows; fallible at runtime
\vfill\null
\columnbreak
\section{Memory Ordering}
defining happens-before relations across threads
concurrent non-atomic stores to the same variable cause a data race $\implies$ UB
lack of globally consistent order
thread spawn/join: automatically enforces happens-before relation
note: the current theoretical model for formalizing memory ordering has known problems: cyclic reasoning / out-of-thin-air values
\subsubsection{Relaxed Ordering}
\begin{itemize}
\item per atomic variable: a total modification order in every run of the program $\implies$ all modifications of the said atomic variable happen in 1 order that is consistent/same from views of every thread
\item multiple possible orderings may exist when the program is run multiple times, but each run satisfies a total modification order
\item no happens-before relation
\end{itemize}
\subsubsection{Release-Acquire Ordering Pair}
pairing:\\
store operation specified with release semantics\\
load operation specified with acquire semantics
happens-before relation formed at runtime when the acquire load reads the released value: all memory operations before the release store are observable at and after the acquire load
release store of an atomic variable may be modified by any number of fetch-modify / compare-exchange operations and still have a happens-before relation with an acquire load afterwards on the said atomic variable
a plain (non-read-modify-write) store to the atomic variable breaks the chain of the release sequence (the chain that starts with a release store and is possibly followed by fetch-modifies / compare-exchanges)
use of non-atomic variable in different threads and borrow checker $\implies$ may need unsafe blocks
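a minimal illustrative sketch of the release-acquire pairing described above; the relaxed \verb%DATA% store is made visible by the pairing on \verb%READY%:
\begin{verbatim}
use std::sync::atomic::{AtomicBool, AtomicU64};
use std::sync::atomic::Ordering::{Acquire, Relaxed, Release};
use std::thread;

static DATA: AtomicU64 = AtomicU64::new(0);
static READY: AtomicBool = AtomicBool::new(false);

fn main() {
    thread::spawn(|| {
        DATA.store(123, Relaxed);
        READY.store(true, Release); // everything before this store ...
    });
    while !READY.load(Acquire) {}   // ... is visible once this sees true
    assert_eq!(DATA.load(Relaxed), 123);
}
\end{verbatim}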
\subsubsection{Release-Consume Ordering Pair}
pairing:\\
store operation specified with release semantics\\
load operation specified with consume semantics
happens-before relation for associated atomic variable in the release store and the dependent expressions in the consumer thread
practically, dependency chains are hard to define, and implementations tend to fall back to acquire semantics instead
\subsubsection{Sequentially Consistent Ordering}
pairing:\\
store operation specified with SeqCst semantics\\
load operation specified with SeqCst semantics
guarantees of:
\begin{itemize}
\item acquire ordering
\item release ordering
\item globally consistent ordering of all SeqCst operations (every SeqCst operation in a program is a part of a single total order that all threads agree on)
\end{itemize}
can replace acquire and release ordering and maintain happens-before relation
\subsection{Memory Fence}
separate memory ordering semantics from atomic operations
it can take the place of acquire / release / other memory ordering operations
types of fences:
\begin{itemize}
\item release fence
\item acquire fence
\item acquire-release fence
\item sequentially consistent fence
\end{itemize}
\subsubsection{Practical Replacement}
\begin{tabular}{| c | p{50mm} |}
\hline
\textbf{without fences} & \textbf{with fences} \\
\hline
release store & fence with release ordering \newline ... \newline atomic store (any memory ordering) \\
\hline
acquire load & atomic load (any memory ordering) \newline ... \newline fence with acquire ordering \\
\hline
\end{tabular}
if an atomic store placed after the release fence is observed by an atomic load placed before the acquire fence $\implies$ a happens-before relation is established between the release-acquire fence pairing
\subsubsection{Practical Usages}
\begin{itemize}
\item can be used for multiple variables at once
\item conditional fence (apply happens-before relation only after certain condition is met)\\
eg: place the acquire fence only in the conditional branch that is taken when the load of the atomic variable returns the relevant value
\begin{verbatim}
let p = var.load(Relaxed);
if p == ... {
    fence(Acquire);
    do_something(...);
}
\end{verbatim}
\item may be more efficient if the comparison on the atomic variable is expected to fail often (load the atomic variable with relaxed memory ordering, and only pay for the acquire fence on success)
\end{itemize}
\subsubsection{SeqCst Fence}
\begin{itemize}
\item is both a release fence and an acquire fence
\item is part of a single total order of sequentially consistent operations
\end{itemize}
\subsection{Compiler Fence}
does not prevent processor from reordering instructions
Rust compiler fence: \verb%std::sync::atomic::compiler_fence%
uses:
\begin{itemize}
\item process-wide memory barriers
\item special cases of signal handler/interrupt
\end{itemize}
\subsection{FAQs}
memory model is not related to timing
memory model defines order of operations and affects instruction reordering
SeqCst implies the operation depends on the single total order of every SeqCst operation in the program\\
$\implies$ usually an overly strong claim\\
$\implies$ more relaxed constraints may be easier to review (eg: release-acquire pairs)
a release store does not form a happens-before relation with a SeqCst store; to be part of the globally consistent order, both operations need to be SeqCst
\subsection{Summary}
each atomic variable has its own total modification order that all threads agree on
within a single thread: happens-before relations exist between every pair of operations (program order)
unlocking a mutex happens-before locking that mutex
SeqCst results in 1 globally consistent order over all operations that participate in SeqCst, but it is usually overly constraining
fences allow combining memory ordering of multiple operations for efficiency or applying conditional memory ordering for efficiency
happens-before relation exist when:
\begin{itemize}
\item threads spawn / join
\item acquire load from a release store on an atomic variable
\item fetch-modifies / compare-exchanges in between a release-acquire pair on an atomic variable are still valid for that happens-before relation
\end{itemize}
\vfill\null
\columnbreak
\section{Processor}
atomic operation compiles to machine instructions
memory ordering at lowest level of individual instructions
tool for assembly code: \verb%cargo-show-asm%
atomic operations that cannot be expressed as 1 processor instruction use an equivalent implementation composed of multiple instructions (eg: \verb%cmpxchg% in a loop)
\subsection{x86-64}
compare-exchange and compare-exchange-weak have no difference: compile to lock cmpxchg instruction
x86 lock prefix as a modifier for instructions:\\
\verb%add, sub, and, not, or, xor, xchg(implicit),%\\
\verb%xadd, bts, btr, btc%
\subsection{RISC}
\subsubsection{Load-Linked/Store-Condition (LL/SC)}
used in a pair on a memory address
LL: returns current value of a memory address, processor remembers the address internally
SC: conditionally store a new value to that memory address after the previous LL if no updates occurred since the LL
\subsubsection{ARM64 LL/SC Pair}
\begin{itemize}
\item \verb%ldxr% (load exclusive register)
\item \verb%stxr% (store exclusive register)
\item \verb%clrex% (clear exclusive): stop tracking writes to memory $\implies$ subsequent store conditional will fail
\end{itemize}
typically used in a loop with a branching compare to retry in order to ensure LL/SC succeeds
for efficient implementation:
\begin{itemize}
\item false negatives can happen during store-conditional (eg: \verb%stxr%): a whole chunk of memory is tracked, usually 64 bytes to 1 kB
\item 1 memory address per core can be tracked at a time
\end{itemize}
\subsubsection{ARMv8.1 Atomic Instructions}
\begin{itemize}
\item \verb%cas% (equivalent to compare-exchange)
\item \verb%fetch-*% instructions
\end{itemize}
\subsection{Cache Coherence Protocol for Consistency Between Processor Caches}
eg: MESI, MOESI, MESIF
keep states for individual cache levels
use \verb%std::hint::black_box(..)% to disable compiler optimization when doing performance measurement
measurable slowdown when writes require exclusive access to a cache line at the same time as other cores are trying to load from the same cache line
add padding to separate variables in cache lines (to reduce false sharing)\\
eg: \verb%#[repr(align(64))]% for 64 byte alignment (note: must be power of 2, cannot be mixed with packed repr)
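a minimal illustrative sketch of cache-line padding via alignment:
\begin{verbatim}
use std::sync::atomic::AtomicU64;

// each counter gets its own 64-byte-aligned cache line,
// so cores updating different counters avoid false sharing
#[repr(align(64))]
struct PaddedCounter(AtomicU64);

static COUNTERS: [PaddedCounter; 2] = [
    PaddedCounter(AtomicU64::new(0)),
    PaddedCounter(AtomicU64::new(0)),
];
\end{verbatim}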
pack variables close together if they are expected to be accessed close together in time
out of order instructions/effects:
\begin{itemize}
\item store buffers for writing back to cache: brief moment of inconsistency present when write ops are not yet visible to other cores
\item invalidation queues: invalidation requests (dropping cache line) queued for later processing\\
$\implies$ inconsistency (outdated before they are dropped)\\
$\implies$ visibility of write ops from other cores slightly delayed
\item pipelining: parallel instructions: computation on some memory finishes before preceding instructions $\implies$ interaction with other cores (may appear out of order)
\item special instructions exist to prevent these above
\end{itemize}
\subsection{Memory Ordering}
x86-64 and ARM64 are other-multi-copy atomic architectures\\
$\implies$ write op visible to any core $\implies$ visible to all cores at same time\\
$\implies$ memory ordering reduces to a matter of instruction reordering
x86-64 has fairly strong ordering restrictions:
\begin{itemize}
\item release and acquire semantics have same cost as relaxed memory ordering
\item store doesn't get reordered earlier than a preceding memory operation
\item load doesn't get reordered later than a following memory operation
\item $\implies$ (acquire $\iff$ relaxed, release $\iff$ relaxed)
\end{itemize}
ARM64 has relatively weak ordering: all memory ops can be reordered
\begin{itemize}
\item acquire and release versions of loads and stores:
\begin{itemize}
\item \verb%stlr% (store-release register)
\item \verb%ldar% (load-acquire register)
\item \verb%stlxr%(store-release exclusive register)
\item \verb%ldaxr% (load-acquire exclusive register)
\end{itemize}
\item none of the special acquire and release instructions is reordered with any other of these special instructions
$\iff$ acquire-release operations are same as sequentially consistent operations (Acquire / Release / AcqRel has same cost as SeqCst)
\end{itemize}
\subsection{Memory Fence / Barrier Instructions}
prevents certain instructions from being reordered past it
fences pose stronger constraint than release-acquire operations directly on atomic variables $\implies$ release-acquire atomic ops can be logically replaced with fences but the reverse is not true
fences are not themselves load or store operations
Release Fence:
\begin{itemize}
\item usually used with atomic store after the release fence
\item 1 way fence: memory operations before release fence cannot be reordered down past ALL subsequent memory writes after the fence (contrast with release store on atomic variable: memory operations before release op cannot be reordered down past the release op itself)
\end{itemize}
Acquire Fence:
\begin{itemize}
\item usually used with atomic load before the acquire fence
\item 1 way fence: memory operations after acquire fence cannot be reordered up before ALL previous memory reads before the fence
\item happens-before relation is established when load of the atomic variable reads any expected value that is caused by side-effects of the release sequence (eg: a release store on an atomic variable, or release fence followed by relaxed atomic store)
\end{itemize}
SeqCst Fence:
\begin{itemize}
\item both a release and an acquire fence
\end{itemize}
on x86-64:
\begin{itemize}
\item Acquire and Release are same as relaxed memory ordering (acquire/release fences are elided)
\item SeqCst: issues an \verb%mfence% instruction
\end{itemize}
on ARM64:
\begin{itemize}
\item Acquire: \verb%dmb ishld%
\item Release: \verb%dmb ish%
\item AcqRel: \verb%dmb ish%
\item SeqCst: \verb%dmb ish% (same cost as acquire and release)
\end{itemize}
\vfill\null
\columnbreak
\section{OS Primitives}
futex used as a basic primitive that is optimized to avoid frequent syscall
originally syscall in linux but available in supporting libraries on other OSes (can use syscall from libc crate)
use of an atomic variable for thread notifications and wakeups
fast path in userspace when non-blocking, resort to slower syscall when blocking is required
spurious thread wakeups possible, thus need a condition check
solution to missing wakeup:
\begin{itemize}
\item use of another value:\\
expected value match atomic variable $\implies$ blocking wait\\
otherwise $\implies$ non-blocking
\item wake is atomic with respect to wait:\\
change atomic variable's value before wake\\
$\implies$ a thread that is about to wait sees the changed value and skips the wait instead of blocking, so the wakeup having no effect is harmless
\item check of expected value and wait/block (of \verb%futex_wait%) happens as a single atomic operation wrt. other futex operations
\end{itemize}
priority inversion problem:
\begin{itemize}
\item high priority thread blocked by another lower priority thread with lock held
\item solution: allow priority inheritance temporarily when this situation occurs
\item see \verb%FUTEX_<OP>_PI% operations
\end{itemize}
\subsection{Implementation Variants on Other OS}
Windows:
\begin{itemize}
\item heavy weight objects
\item \verb%CRITICAL_SECTION% objects
\item slim reader-writer lock
\item address based waiting (\verb%Wait/WakeByAddress%)
\end{itemize}
MacOS:
\begin{itemize}
\item \verb%libc, libc++, obj-c, swift% interface: pthread impl
\item platform specific lightweight \verb%os_unfair_lock%, limitations: no cond var, no reader-writer variant
\end{itemize}
\subsection{Standards for Accessing Kernel Scheduler via Special Libs or Syscalls}
POSIX for Unix based systems
pthreads extensions: provide support for threading, data types, functions for concurrency
\subsubsection{Concerns wrt. Movable/Non-Movable Types}
pthread structures are generally non-movable types due to self references\\
possible workarounds:\\
\verb%std::pin%\\
\verb%Box<..>%: issue with leaking/forgetting an object: \verb%pthread_mutex_destroy% on locked mutex may result in undefined behaviour as per spec.\\
\subsection{Futex as an Efficient Mutex}
simple futex-like addition to \verb%C++% standard:\\
\verb%std::atomic_wait%\\
\verb%std::atomic_notify_one% / \verb%std::atomic_notify_all%
originally added to Linux systems: \verb%SYS_futex% syscall: use of 32 bit atomic variable address to notify threads when to wake up
solution for the missing-wakeup problem: the check of the expected value passed to wait and the act of going to sleep are one atomic op $\implies$ a wake arriving in between cannot be missed
manage state in userspace if possible, only rely on slower code path (via syscall) when absolutely necessary (need for a block)
usually wait used in a loop to check condition of possible spurious wakeups
futex related op arguments:
\begin{itemize}
\item 32 bit atomic pointer
\item op constant
\item optional flags, eg: \verb%FUTEX_PRIVATE, FUTEX_CLOCK_REALTIME%
\item remaining arguments dependent on the op
\end{itemize}
futex ops:
\begin{itemize}
\item \verb%FUTEX_WAIT%: check and block is atomic
\item \verb%FUTEX_WAKE%: provide max number of threads to wake up
\item \verb%FUTEX_WAIT_BITSET%: wake only for bits set in common from a corresponding \verb%FUTEX_WAKE_BITSET% op
\item \verb%FUTEX_WAKE_BITSET%
\item \verb%FUTEX_REQUEUE%
\item \verb%FUTEX_CMP_REQUEUE%
\item \verb%FUTEX_WAKE_OP%
\item \verb%FUTEX_PRIVATE_FLAG%
\end{itemize}
\vfill\null
\columnbreak
\section{Primitive Implementation Examples}
\subsection{Spin Lock}
release store (unlock) and acquire load (lock) pair for prevention of data races (UB)
\verb%std::hint::spin_loop()%: hint that lets the processor optimize the spin-wait loop
possible implementation (condensed sketch after this list):
\begin{itemize}
\item wraps actual data inside an \verb%UnsafeCell<T>% for interior mutability
\item requires \verb%T: Send%
\item locking provides exclusive access (and provides \verb%Sync% trait)
\item uses \verb%unsafe% blocks in function, user will not have to use \verb%unsafe%
\item use lock guard (representing safe access to locked data) pattern to manage lifetime of locked access to protected data:
\begin{itemize}
\item implement \verb%Deref%, \verb%DerefMut% to access data for user ergonomics (behave similar to reference)
\item implement \verb%Drop%: automatic release store (unlocking) when lifetime of the guard ends
\item manual drop also possible: this consumes and ends the valid lifetime of the guard and hence access to data at compile time $\implies$ any further reference and borrow to guard is invalid and will be flagged as error by the compiler
\end{itemize}
\end{itemize}
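a condensed illustrative sketch along the lines described above, assuming \verb%T: Send%:
\begin{verbatim}
use std::cell::UnsafeCell;
use std::ops::{Deref, DerefMut};
use std::sync::atomic::{AtomicBool, Ordering::{Acquire, Release}};

pub struct SpinLock<T> {
    locked: AtomicBool,
    value: UnsafeCell<T>,
}

// locking provides exclusive access, so sharing is safe for T: Send
unsafe impl<T: Send> Sync for SpinLock<T> {}

pub struct Guard<'a, T> {
    lock: &'a SpinLock<T>,
}

impl<T> SpinLock<T> {
    pub const fn new(value: T) -> Self {
        Self {
            locked: AtomicBool::new(false),
            value: UnsafeCell::new(value),
        }
    }
    pub fn lock(&self) -> Guard<T> {
        while self.locked.swap(true, Acquire) {
            std::hint::spin_loop();
        }
        Guard { lock: self }
    }
}

impl<T> Deref for Guard<'_, T> {
    type Target = T;
    fn deref(&self) -> &T {
        // the guard is proof of exclusive access
        unsafe { &*self.lock.value.get() }
    }
}
impl<T> DerefMut for Guard<'_, T> {
    fn deref_mut(&mut self) -> &mut T {
        unsafe { &mut *self.lock.value.get() }
    }
}
impl<T> Drop for Guard<'_, T> {
    fn drop(&mut self) {
        self.lock.locked.store(false, Release); // unlock
    }
}
\end{verbatim}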
\subsection{Channels}
\subsubsection{One Shot Channel}
\begin{itemize}
\item 1 message only from one thread to another thread
\item \verb%T: Send%
\item use of \verb%unsafe%:
\begin{itemize}
\item message may be uninitialized
\item non-copy data must not be duplicated
\item manual content drop may be necessary: leaking/forgetting is safe but sometimes undesirable
\item eg: \verb%std::mem::MaybeUninit<T>% (unsafe version of \verb%Option<T>%) for efficiency where user tracks its initialized status
\item use \verb%UnsafeCell%'s interior mutability for sharing
\item wrapping shared struct \verb%Channel% requires \verb%T: Send% and gives \verb%Sync% in return
\end{itemize}
\item use of atomic swaps for setting one time flags
\item encoding of multiple states in one word and atomic compare-exchanges
\item possible use of runtime check/panic instead of letting UB happen
\item use type checking from compiler to avoid errors: move/consume to avoid unwanted reuse of resources:
use of non-Copy type and pass by value / consume by called function $\implies$ prevent caller from using that object again at compile time (also elides some runtime checks)
\begin{itemize}
\item \verb%TX-RX% pair for message passing, \verb%Channel% shared inside their private implementations
\item use of \verb%Arc<T>% for sharing of allocation and resource dropping: \verb%RX% drop and \verb%TX% drop $\implies$ \verb%Arc<T>% drop $\implies$ \verb%T% drop
\item \verb%Arc<T>% incurs extra runtime overhead for allocation
\end{itemize}
\item allocation optimization
\begin{itemize}
\item borrowing instead of memory allocation (\verb%Arc%)
\item use of lifetimes and mutable borrow for compile time checks
\item \verb%Channel% explicitly created by user ahead of time and passed in to \verb%RX% and \verb%TX% as references upon construction in the \verb%split% method
\item \verb%TX, RX% take in additional lifetime parameter which is the same as the lifetime of the borrowed \verb%Channel%: when \verb%TX% or \verb%RX% is present, existing \verb%Channel% cannot be mutably borrowed again until \verb%TX% and \verb%RX% are both dropped
\item \verb%Channel% needs to outlive \verb%TX% and \verb%RX% for compiler check to pass
\item \verb%Channel% resets its contents on entry to \verb%split% method in case it is used multiple times
\end{itemize}
\item overwriting content for fresh initialization (when calling \verb%split%): after 1st borrow expires, subsequent borrows are made on these newly created resources
\item blocking interface:
\begin{itemize}
\item make \verb%RX% object not \verb%Send%, such as using a \verb%PhantomData<*const ()>% member field so that auto trait deduction for the wrapping struct is propagated to be \verb%!Send%:\\
\verb%*const (): !Send% $\implies$\\
\verb%PhantomData<*const ()>: !Send% $\implies$\\
wrapping struct is \verb%!Send%
\item \verb%RX% stays on the same thread; \verb%TX% is allowed to cross thread boundaries
\item use receiving thread's handle to invoke waking up a blocked thread (place this inside the sender struct)
\item call unpark on the receiving thread's handle after the release store operation is performed by the sender
\item receiver checks the \verb%Channel%'s flag to guard against spurious wakeups
\end{itemize}
\end{itemize}
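a condensed illustrative sketch of the non-blocking one-shot core described above: an atomic ready flag plus \verb%MaybeUninit% storage, with misuse caught by runtime panics instead of UB:
\begin{verbatim}
use std::cell::UnsafeCell;
use std::mem::MaybeUninit;
use std::sync::atomic::{AtomicBool, Ordering::{Acquire, Relaxed, Release}};

pub struct Channel<T> {
    message: UnsafeCell<MaybeUninit<T>>,
    ready: AtomicBool,
}

// requires T: Send, gives Sync in return
unsafe impl<T: Send> Sync for Channel<T> {}

impl<T> Channel<T> {
    pub const fn new() -> Self {
        Self {
            message: UnsafeCell::new(MaybeUninit::uninit()),
            ready: AtomicBool::new(false),
        }
    }
    /// Safety: call send at most once.
    pub unsafe fn send(&self, message: T) {
        (*self.message.get()).write(message);
        self.ready.store(true, Release);
    }
    pub fn is_ready(&self) -> bool {
        self.ready.load(Relaxed)
    }
    /// Panics if no message is ready; call at most once.
    pub fn receive(&self) -> T {
        if !self.ready.swap(false, Acquire) {
            panic!("no message available");
        }
        unsafe { (*self.message.get()).assume_init_read() }
    }
}
\end{verbatim}
note: this sketch leaks an unreceived message on drop; a \verb%Drop% impl that checks the flag fixes that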
\vfill\null
\columnbreak
\subsection{Arc}
basic implementation: use a pointer type to the shared underlying data via \verb%std::ptr::NonNull%, use counter info for the shared data; use \verb%NonNull::from(Box::leak(Box::new(..)))% to get a pointer from initial allocation
implement ergonomic methods \verb%deref% and \verb%deref_mut% from traits: \verb%Deref%, \verb%DerefMut%
let \verb%clone% increment the internal counter and return a new pointer to the shared data
requires \verb%T: Send + Sync% and gives wrapping \verb%Arc<T>% \verb%Send + Sync%
auto traits \verb%Send% / \verb%Sync% are not implemented for raw pointer types (including \verb%NonNull%), so they must be provided manually via \verb%unsafe impl%
cloning corresponds to incrementing counter and giving a shared reference to underlying data:
\begin{verbatim}
impl<T> Clone for Arc<T> {
fn clone(&self) -> Self {
self.data().ref_count.fetch_add(1, Relaxed);
Arc {
ptr: self.ptr,
}
}
}
\end{verbatim}
only the final decrement needs acquire in addition to release ordering, while all other decrements need only release:
\begin{verbatim}
impl<T> Drop for Arc<T> {
    fn drop(&mut self) {
        if self.data().ref_count.fetch_sub(1, Release) == 1 {
            fence(Acquire); // conditional acquire
            // drop logic
            unsafe {
                drop(Box::from_raw(self.ptr.as_ptr()));
            }
        }
    }
}
\end{verbatim}
exclusive access to shared data, conditionally in a runtime branch, eg:
\begin{verbatim}
pub fn get_mut(arc: &mut Self) -> Option<&mut T> {
    if arc.data().ref_count.load(Relaxed) == 1 {
        fence(Acquire); // conditional acquire
        // gained exclusive access now
        unsafe { Some(&mut arc.ptr.as_mut().data) }
    } else {
        None
    }
}
\end{verbatim}
Miri interpreter for simulation and verification of unsafe code
weak version of \verb%Arc<T>%: \verb%Weak<T>%
\begin{itemize}
\item \verb%T% can be shared between \verb%Arc<T>% and \verb%Weak<T>%
\item \verb%Weak<T>% does not prevent drop of \verb%T%, eg: all \verb%Arc<T>% dropped $\implies$ \verb%T% dropped
\item \verb%Weak<T>% can exist without \verb%T% still being alive, and can provide conditional access to a \verb%& T%-like object:\\
implement an upgrade function to get \verb%Option<Arc<T>>%, which is \verb%None% if \verb%T% has already been dropped
\end{itemize}
cycle breaking: use 2 counters:
\begin{itemize}
\item strong pointer count (\verb%data_ref_count%)
\item weak pointer + strong pointer count (\verb%alloc_ref_count%)
\end{itemize}
wrapping struct \verb%ArcData<T>%:
\begin{itemize}
\item use interior mutability of an optional
\item keep extra info of shared counters referencing its data
\end{itemize}
drop implementation of weak pointer and strong pointer:
\begin{itemize}
\item strong pointer count is 0 $\implies$ drop \verb%T%
\item weak pointer + strong pointer count dropping to 0 $\implies$ dropping \verb%ArcData<T>%
\end{itemize}
Rust drop order:
\begin{itemize}
\item run \verb%Drop::drop% on object
\item drop the object's fields 1 by 1 recursively
\end{itemize}
cloning pointers:
\begin{itemize}
\item \verb%Arc%: increment weak counter, increment strong counter
\item \verb%Weak%: increment weak counter
\end{itemize}
dereferencing:
\begin{itemize}
\item \verb%Arc%: unconditionally dereference since existence of \verb%Arc% implies underlying \verb%T% is valid
\item \verb%Weak%: upgrade to strong pointer
\begin{itemize}
\item atomic increment and compare swap on strong counter to give out access
\item upgrade and return an \verb%Arc% to caller
\item if strong pointer counter is 0 $\implies$ data doesn't exist and abort operation
\end{itemize}
\end{itemize}
strong pointer access to mutate data: runtime conditional check to allow exclusive access to \verb%T%
\begin{itemize}
\item check weak pointer counter is 0, strong pointer count is 1
\item cast the underlying data and return \verb%& mut T%; safe since the \verb%Arc% exists
\end{itemize}
convert from strong pointer to weak pointer (downgrade): call clone on weak pointer and return the weak pointer
possible optimization: touch only 1 atomic counter instead of 2 when cloning / dropping an \verb%Arc% $\implies$
\begin{itemize}
\item if user is not using weak pointers then they don't have to pay the cost when cloning /dropping
\item use \verb%ManuallyDrop<T>% instead of \verb%Option<T>% to save an extra state and use existence of \verb%Arc<T>% to know if data is gone or not
\item let 1 weak count represent all existing \verb%Arc<T>%s combined; when the last \verb%Arc<T>% is dropped, decrement that 1 weak count on behalf of all of them
\item downgrade and \verb%get_mut% require more changes:
\item \verb%get_mut%
\begin{itemize}
\item need to check 2 atomic counters
\item temporarily lock downgrade operation by use of a special value indicating locked state for weak counter; use compare-exchange on \verb%alloc_ref_count% (weak pointer + strong pointer) variable to replace with special value if the condition applies
\item check if strong pointer count == 1 $\implies$ we have exclusive access to data, replace special value earlier to unlock it (weak pointer + strong pointer count) and return \verb%& mut T%
\end{itemize}
\item downgrade
\begin{itemize}
\item check that the special value for \verb%alloc_ref_count% (weak pointer + strong pointer counter) is not present, otherwise loop
\item compare-exchange (acquire), paired with the \verb%get_mut% method, on the \verb%alloc_ref_count% atomic variable: increment on success; this makes future \verb%get_mut% calls fail until \verb%Weak::drop% brings \verb%alloc_ref_count% back down via release memory ordering
\end{itemize}
\end{itemize}
\subsection{Locks}
atomic-wait crate for providing cross platform interface for futex-like syscall:
\begin{itemize}
\item \verb%wait(&AtomicU32, u32)%
\item \verb%wake_one(&AtomicU32)%
\item \verb%wake_all(&AtomicU32)%
\end{itemize}
platform implementation:
\begin{itemize}
\item Linux: \verb%futex% syscall
\item Windows: \verb%WaitOnAddress%
\item FreeBSD: \verb%_umtx_op%
\item MacOS: \verb%libc++%
\end{itemize}
\subsubsection{Constructing Mutex}
use an atomic variable for the futex-like syscall (invoke wait when locking needs to block):
\begin{verbatim}
struct Mutex<T> {
    state: AtomicU32, // 0: unlocked, 1: locked
    value: UnsafeCell<T>,
}
struct MutexGuard<'a, T> {
    mutex: &'a Mutex<T>,
}
// Produce a guard as proof of exclusive access.
//
// Add Deref and DerefMut traits for data access,
// behaving like &T / &mut T.
fn lock(&self) -> MutexGuard<T> {
    while self.state.swap(1, Acquire) == 1 {
        wait(&self.state, 1); // blocks while state == 1
    }
    MutexGuard { mutex: self }
}
\end{verbatim}
use drop of guard to unlock mutex by invoking wake after setting state to unlocked value: