From ce0781136681ef17250627abb50972e0d9f7bbf0 Mon Sep 17 00:00:00 2001 From: Huanyi Chen Date: Fri, 9 Feb 2024 11:24:32 -0500 Subject: [PATCH] Prepare L16 & L17 flipped note --- lectures/flipped/L16.md | 34 +++++++++++++++++++++------------- lectures/flipped/L17.md | 8 +++++++- 2 files changed, 28 insertions(+), 14 deletions(-) diff --git a/lectures/flipped/L16.md b/lectures/flipped/L16.md index 5828656e..ceac4e41 100644 --- a/lectures/flipped/L16.md +++ b/lectures/flipped/L16.md @@ -1,5 +1,11 @@ # Lecture 16 — Mostly Data Parallelism +## Roadmap + +We will talk about SIMD and a case study of it. + +## Two ideas + Data parallelism: performing *the same* operations on different input Task parallelism: is performing *different* operations on different input. @@ -23,22 +29,24 @@ pub fn foo(a: &[f64], b: &[f64], c: &mut [f64]) { Without optimization ```asm -movsd xmm0, qword ptr [rcx] -addsd xmm0, qword ptr [rdx] -movsd qword ptr [rax], xmm0 +; rustc 1.75.0 +; lines 758-760 on godbolt +movsd xmm0, qword ptr [rdx] +addsd xmm0, qword ptr [rcx] +movsd qword ptr [rax], xmm0 ``` With optimization (opt-level=3) ```asm -movupd xmm0, xmmword ptr [rdi + 8*rcx] -movupd xmm1, xmmword ptr [rdi + 8*rcx + 16] -movupd xmm2, xmmword ptr [rdx + 8*rcx] -addpd xmm2, xmm0 -movupd xmm0, xmmword ptr [rdx + 8*rcx + 16] -addpd xmm0, xmm1 -movupd xmmword ptr [r8 + 8*rcx], xmm2 -movupd xmmword ptr [r8 + 8*rcx + 16], xmm0 +movupd xmm0, xmmword ptr [rdi + 8*rsi] +movupd xmm1, xmmword ptr [rdi + 8*rsi + 16] ; second load issued early: the two unrolled halves are interleaved (x86 has no delay slots) +movupd xmm2, xmmword ptr [rdx + 8*rsi] +addpd xmm2, xmm0 +movupd xmm0, xmmword ptr [rdx + 8*rsi + 16] +addpd xmm0, xmm1 +movupd xmmword ptr [r8 + 8*rsi], xmm2 +movupd xmmword ptr [r8 + 8*rsi + 16], xmm0 ``` Exercise: try to use to explore the `foo` function. You @@ -60,7 +68,7 @@ are in the lecture note) ```rust /* -// runnable on rust explorer +// not runnable on rust explorer [dependencies] simdeez = "1.0.8" @@ -131,4 +139,4 @@ then have a scalar result. 
Tried again with poor-man's SIMD on the board, then worked through Stream VByte example. Did not have students work through examples but -was somewhat interactive. Did do the SSE example as live coding. \ No newline at end of file +was somewhat interactive. Did do the SSE example as live coding. diff --git a/lectures/flipped/L17.md b/lectures/flipped/L17.md index d4b879b8..47cbac38 100644 --- a/lectures/flipped/L17.md +++ b/lectures/flipped/L17.md @@ -1,6 +1,12 @@ # Lecture 17 — Compiler Optimizations -Question +## Roadmap + +We will talk about compiler optimizations, specifically, scalar optimizations, +loop optimizations, and link-time optimizations. + +## Question + * How do you enable compiler optimization in `cargo`? * Which profile will it use when you call `cargo build` or `cargo build --release`?