From ce0781136681ef17250627abb50972e0d9f7bbf0 Mon Sep 17 00:00:00 2001
From: Huanyi Chen <huanyi.chen@uwaterloo.ca>
Date: Fri, 9 Feb 2024 11:24:32 -0500
Subject: [PATCH] Prepare L16 & L17 flipped note

---
 lectures/flipped/L16.md | 34 +++++++++++++++++++++-------------
 lectures/flipped/L17.md |  8 +++++++-
 2 files changed, 28 insertions(+), 14 deletions(-)

diff --git a/lectures/flipped/L16.md b/lectures/flipped/L16.md
index 5828656e..ceac4e41 100644
--- a/lectures/flipped/L16.md
+++ b/lectures/flipped/L16.md
@@ -1,5 +1,11 @@
 # Lecture 16 — Mostly Data Parallelism
 
+## Roadmap
+
+We will talk about SIMD and then work through a case study that uses it.
+
+## Two ideas
+
 Data parallelism: performing *the same* operations on different input
 
 Task parallelism: is performing *different* operations on different input.
@@ -23,22 +29,24 @@ pub fn foo(a: &[f64], b: &[f64], c: &mut [f64]) {
 Without optimization
 
 ```asm
-movsd xmm0, qword ptr [rcx]
-addsd xmm0, qword ptr [rdx]
-movsd qword ptr [rax], xmm0
+; rustc 1.75.0
+; lines 758-760 on godbolt
+movsd   xmm0, qword ptr [rdx]
+addsd   xmm0, qword ptr [rcx]
+movsd   qword ptr [rax], xmm0
 ```
 
 With optimization (opt-level=3)
 
 ```asm
-movupd xmm0, xmmword ptr [rdi + 8*rcx]
-movupd xmm1, xmmword ptr [rdi + 8*rcx + 16]
-movupd xmm2, xmmword ptr [rdx + 8*rcx]
-addpd xmm2, xmm0
-movupd xmm0, xmmword ptr [rdx + 8*rcx + 16]
-addpd xmm0, xmm1
-movupd xmmword ptr [r8 + 8*rcx], xmm2
-movupd xmmword ptr [r8 + 8*rcx + 16], xmm0
+movupd  xmm0, xmmword ptr [rdi + 8*rsi]
+movupd  xmm1, xmmword ptr [rdi + 8*rsi + 16] ; second vector of the 2x-unrolled loop (x86 has no delay slots)
+movupd  xmm2, xmmword ptr [rdx + 8*rsi]
+addpd   xmm2, xmm0
+movupd  xmm0, xmmword ptr [rdx + 8*rsi + 16]
+addpd   xmm0, xmm1
+movupd  xmmword ptr [r8 + 8*rsi], xmm2
+movupd  xmmword ptr [r8 + 8*rsi + 16], xmm0
 ```
 
 Exercise: try to use <https://godbolt.org/> to explore the `foo` function. You
@@ -60,7 +68,7 @@ are in the lecture note)
 ```rust
 /*
 
-// runnable on rust explorer
+// not runnable on rust explorer
 
 [dependencies]
 simdeez = "1.0.8"
@@ -131,4 +139,4 @@ then have a scalar result.
 
 Tried again with poor-man's SIMD on the board, then worked through
 Stream VByte example. Did not have students work through examples but
-was somewhat interactive. Did do the SSE example as live coding.
\ No newline at end of file
+was somewhat interactive. Did do the SSE example as live coding.
diff --git a/lectures/flipped/L17.md b/lectures/flipped/L17.md
index d4b879b8..47cbac38 100644
--- a/lectures/flipped/L17.md
+++ b/lectures/flipped/L17.md
@@ -1,6 +1,12 @@
 # Lecture 17 — Compiler Optimizations
 
-Question
+## Roadmap
+
+We will talk about compiler optimizations: specifically, scalar optimizations,
+loop optimizations, and link-time optimizations.
+
+## Question
+
 * How do you enable compiler optimization in `cargo`?
 * Which profile will it use when you call `cargo build` or `cargo build
   --release`?