% hpc_summary.tex — forked from stv0g/rwth-misc
% NOTE(review): removed GitHub page scaffolding and line-number gutter that was
% accidentally captured before \documentclass; it is not part of the LaTeX source.
\documentclass[11pt]{article}
\usepackage{amsmath,amssymb,latexsym}
\usepackage{fancyhdr}
\usepackage[margin=2.50cm]{geometry}
\usepackage[compact]{titlesec}
\usepackage{hyperref}
\usepackage[T1]{fontenc}
\usepackage{color}
\usepackage{multirow}
\usepackage{array}
\usepackage{paralist}
\usepackage{listings}
\usepackage{enumitem}
\hypersetup{colorlinks=true, linkcolor=blue, urlcolor=cyan}
\lstset{
moredelim=[is][\color{green}]{§}{§},
moredelim=[is][\color{red}]{?}{?},
moredelim=[is][\color{cyan}]{|}{|}
}
% Title infos
\title{HPC Formulas}
\author{Steffen Vogel}
\date{\today}
% headline
\lhead{HPC Formulas}
\chead{Steffen Vogel}
\rhead{\today}
\pagestyle{fancy}
\setlength{\parindent}{0cm}
\begin{document}
\section{Amdahl \& Gustafson}
\begin{tabular}{ l | c c c }
 & \textbf{General} & \textbf{Amdahl's Law} & \textbf{Gustafson's Law} \\
 & & \textit{strong scaling} & \textit{weak scaling} \\
\hline
Speedup \( S_p(N) \) & \( \frac{T(1)}{T(N)} \) & \( \frac{1}{s + \frac{1 - s}{N}} \) & \( Np + s \) \\
Efficiency \( \varepsilon_p(N) \) & \( \frac{S_p(N)}{N} = \frac{T(1)}{N \cdot T(N)} \) & \( \frac{1}{s(N-1) + 1} \) & \( p + \frac{1 - p}{N} \) \\
\end{tabular}
\section{Moore's Law}
\begin{quote}
\# of transistors / cost-effective IC doubles every $N$ months, with $12 \leq N \leq 24$
\end{quote}
\section{Energy Efficiency}
\begin{tabular}{ p{7cm} l }
Power-voltage-frequency & \( P \sim V^2 \cdot f \) or \( P \sim f^3 \) \\
& \\
energy efficiency & = \( \frac{\text{energy}}{\text{Flop}} \) \\
& = \( \frac{\text{power consumption} \cdot \text{time}}{\text{Flop}} \) \\
& = \( \frac{\text{power consumption}}{\text{Flop}/s} \) \\
energy & = \( \text{power consumption} \cdot \text{time} \) \\
& \\
Physical limit for efficiency & 2.87e-21 Joule/Bit \( \approx \ln(2) \cdot k \cdot T \) \\
& 1.84e-19 Joule/Flop \\
Green TOP500 \#1 & 2.22e-10 Joule/Flop \\
& \\
Power wall & expected to be around 20 to 25 MWatt \\
\end{tabular}
\section{Pipelining}
\begin{tabular}{ p{7cm} l }
Stages & $m$ \\
Operations & $N$ \\
Without pipeline & \( T_{seq} = m \cdot N \) \\
With pipeline & \( T_{pipe} = N + m - 1 \) \\
Speedup & \(S_{pipe} = \frac{m}{1 + \frac{m-1}{N}} \xrightarrow{N \rightarrow \infty} m \) \\
Throughput (results/cycle) & \( \frac{N}{T_{pipe}} = \frac{1}{1 + \frac{m-1}{N}} \xrightarrow{N \rightarrow \infty} 1 \) \\
\end{tabular}
\section{Memory Access}
\begin{tabular}{ p{7cm} l }
Hit rate & $\beta$ \\
Miss rate & $(1 - \beta)$ \\
Access time& $T_m$ (Memory), $T_c$ (Cache) \\
Average access time & \( T_{avg} = \beta T_c + (1 - \beta) T_m \) \\
Performance gain & \( G( \theta, \beta ) = \frac{T_m}{T_{avg}} = \frac{\theta}{\beta + \theta(1 - \beta)}, \theta = \frac{T_m}{T_c} \) \\
\end{tabular}
\section{Networks}
\subsection{Transmission \& Bandwidth}
\begin{tabular}{ p{7cm} l }
Transmission time & \( T = T_L + \frac{N}{B} \) \\
Effective bandwidth & \( B_{eff} = \frac{N}{T} = \frac{N}{T_L + \frac{N}{B}} \) \\
\end{tabular}
\subsection{Topologies}
\begin{tabular}{l | c c c c }
\textbf{Topology} & \textbf{Max degree} & \textbf{Edge connectivity} & \textbf{Diameter} & \textbf{Bisection BW} \\
\hline
Bus & 1 & 1 & 1 & B \\
Ring & 2 & 2 & \( \lfloor \frac{N}{2} \rfloor \) & 2B \\
Fully connected & \( \frac{N(N-1)}{2} \) & N-1 & 1 & \( \frac{N^2}{4} \) \\
Sw. Fat Tree & 1 w/o redundancy & depends on design & 2 hierarchy height & depends on design \\
Mesh & 2d & d & \( \sum_{i=1}^d (N_i - 1) \) & \( B ( \prod_{i=1}^{d-1} N_i ) \) \\
Torus & 2d & 2d & \( \sum_{i=1}^d \lfloor \frac{N_i}{2} \rfloor \) & \( 2B ( \prod_{i=1}^{d-1} N_i ) \) \\
Hypercube & d & d & d & \( B2^{d-1} \) \\
\end{tabular}
\section{Balance, Lightspeed}
\begin{tabular}{ p{7cm} l }
Code balance & \( B_c = \frac{\text{data transfers}}{\text{arithmetic ops}} = \frac{[Words]}{[Flops]} \) \\
Machine balance & \( B_m = \frac{b_s}{P_{max}} \) \\
Relative lightspeed & \( l = \min(1, \frac{B_m}{B_c} ) \) \\
Absolute lightspeed performance & \( P = P_{max} \cdot l \) \\
\end{tabular}
\section{Programming API's}
Argument directions: \lstinline$§input§$, \lstinline$?output?$, \lstinline$|inout|$
Common arguments:
communicator = cm
\subsection{MPI}
All MPI routines return an integer error code!
\subsubsection{Initialization}
\begin{tabular}{ p{8cm} l }
\lstinline$MPI_Init(§argc§, §argv§)$ & Has to be called before every other MPI\_* function. \\
\lstinline$MPI_Finalize()$ & Like \lstinline$MPI_Init()$ this must only be called once. \\
\lstinline$MPI_Initialized(?flag?)$ & Check if MPI runtime was already initialized. \\
\lstinline$MPI_Finalized(?flag?)$ & Check if MPI runtime was already shut down. \\
\end{tabular}
\subsubsection{Point-to-point data transfer}
\begin{tabular}{ l | l l l }
\textbf{Mode / Op} & \textbf{Blocking} & \textbf{Non-blocking} & \textbf{Completes after...} \\
\hline
Standard & \lstinline$MPI_Send$($\otimes$) & \lstinline$MPI_Isend$($\otimes, \ddag$) & the message has been safely stored away. \\
Synchronous & \lstinline$MPI_Ssend$($\otimes$) & \lstinline$MPI_Issend$($\otimes, \ddag$) & the receive was called. \\
Buffered & \lstinline$MPI_Bsend$($\otimes$) & \lstinline$MPI_Ibsend$($\otimes, \ddag$) & the message was stored away in a buffer. \\
Ready-mode & \lstinline$MPI_Rsend$($\otimes$) & \lstinline$MPI_Irsend$($\otimes, \ddag$) & only if the receive was called before send. \\
\hline
Standard & \lstinline$MPI_Recv$($\diamond, \dag$) & \lstinline$MPI_Irecv$($\diamond, \ddag$) & a message was received. \\
Probing & \lstinline$MPI_Probe$() & \lstinline$MPI_Iprobe$ & immediately. \\
Waiting & \lstinline$MPI_Wait$($\ddag, \dag$) & \lstinline$MPI_Test$($\ddag, $\lstinline$?flag?$$, \dag$) & message available. \\
\end{tabular}
\textbf{Signatures}: \\
{ \footnotesize
\hspace{2cm} $\otimes$ \lstinline$(§buffer§, §count§, §type§, §dst§, §tag§, §cm§)$, \\
\hspace{2cm} $\diamond$ \lstinline$(?buffer?, §count§, §type§, §dst§, §tag§, §cm§)$, \\
\hspace{2cm} $\dag$ \lstinline$(?status?)$ $\ddag$ \lstinline$(?request?)$
}
\begin{tabular}{ p{8cm} l }
\lstinline$MPI_Get_count(§stat§, §type§, ?count?)$ & Get the number of received elements by \lstinline$MPI_Recv()$. \\
\lstinline$MPI_Sendrecv(§sbuf§, §scnt§, §stype§, §dst§, §stag§,$ & Simultaneous, deadlock-free send and receive. \\
\lstinline$ ?rbuf?, ?scnt?, ?stype?, ?dst?, ?stag?, §cm§, ?stat?)$ & \\
\lstinline$MPI_Sendrecv_replace(...)$ & Like \lstinline$MPI_Sendrecv()$ but using the same buffer. \\
\end{tabular}
\subsubsection{Collective data transfer}
\begin{tabular}{ p{8cm} l }
\lstinline$MPI_Bcast(|buf|, §cnt§, §type§, §root§, §cm§)$ & Broadcasts data from root to all other. \\ \lstinline$MPI_Scatter(§sbuf§, §scnt§, §stype§,$ & Scatters chunks of data from root to all \\
\lstinline$ ?rbuf?, §rcnt§, §rtype§, §root§, §cm§)$ & \\
\lstinline$MPI_Gather(§sbuf§, §scnt§, §stype§,$ & Gathers chunks of data from all processes to root \\
\lstinline$ ?rbuf?, §rcnt§, §rtype§, §root§, §cm§)$ & \\
\lstinline$MPI_Allgather(...)$ & \\
\lstinline$MPI_Alltoall()$ & \\
\lstinline$MPI_Reduce(...)$ & \\
\lstinline$MPI_Allreduce(...)$ & \\
\end{tabular}
\subsubsection{Synchronisation}
\begin{tabular}{ p{8cm} l }
\lstinline$MPI_Barrier(§cm§)$ & Waits for all processes. \\
\end{tabular}
\subsubsection{Communicator handling}
\begin{tabular}{ p{8cm} l }
\lstinline$MPI_Comm_dup(...)$ & \\
\lstinline$MPI_Comm_split(...)$ & \\
\lstinline$MPI_Comm_compare(...)$ & \\
\lstinline$MPI_Comm_free(...)$ & \\
\lstinline$MPI_Comm_rank(§cm§, ?rank?)$ & Get rank/id of current process in this communicator. \\
\lstinline$MPI_Comm_size(§cm§, ?size?)$ & Get total number of processes in this communicator. \\
\end{tabular}
\subsubsection{Virtual topologies}
\begin{tabular}{ p{8cm} l }
\lstinline$MPI_Cart_create(...)$ & \\
\lstinline$MPI_Cart_rank(...)$ & \\
\lstinline$MPI_Cart_shift(...)$ & \\
\lstinline$MPI_Cart_sub(...)$ & \\
\lstinline$MPI_Cart_coords(...)$ & \\
\lstinline$MPI_Dims_create(...)$ & \\
\end{tabular}
\subsubsection{Derived Datatypes}
\begin{tabular}{ p{8cm} l }
\lstinline$MPI_Type_contiguous(...)$ & \\
\lstinline$MPI_Type_vector(...)$ & \\
\lstinline$MPI_Type_commit(...)$ & \\
\lstinline$MPI_Type_free(...)$ & \\
\end{tabular}
\subsubsection{Timekeeping}
\begin{tabular}{ p{8cm} l }
\lstinline$double MPI_Wtime()$ & Measure local wall-clock time. \\
\lstinline$double MPI_Wtick()$ & Obtain the precision of the MPI timer. \\
\end{tabular}
\subsection{CUDA}
\subsubsection{General Memory Operations}
\begin{tabular}{ p{7cm} l }
\lstinline$cudaMalloc(&p, n*sizeof(float));$ & Allocate memory of n floats on GPU, save pointer in p\\
\lstinline$cudaFree(p);$& Free memory on GPU, which p is pointing to\\
\lstinline$cudaMallocHost(&p, n*sizeof(float));$ & Allocate pinned memory of n floats on Host, save pointer in p\\
\lstinline$cudaFreeHost(p);$& Free pinned memory on Host, which p is pointing to\\
\lstinline$cudaMemcpy(d, h, n * sizeof(float),DIR);$ & memcopy Host => GPU, if DIR = cudaMemcpyHostToDevice \\
\lstinline$cudaMemcpy(h, d, n * sizeof(float),DIR);$& GPU => Host if DIR = cudaMemcpyDeviceToHost\\
\lstinline$func<<<blockNum, threadNum>>>();$ & Call CUDA Kernel code $func$, with given \# of blocks / threads\\
\end{tabular}
\subsubsection{Shared Memory}
\begin{tabular}{ p{7cm} l }
\lstinline$__shared__ double[x][y];$ & Static shared memory between threads\\
\lstinline$extern __shared__ double[x][y];$ & Dynamic shared memory between threads\\
\lstinline$__syncthreads();$ & Synchronize threads after shared memory usage \\
\end{tabular}
\subsubsection{Asynchronous operations}
\begin{tabular}{ p{7cm} l }
\lstinline$cudaStream_t st1,st2;$ & Defines streams. Instructions of one stream are exec'd in order.\\
\lstinline$cudaStreamCreate(&st1)$& Initialize Stream\\
\lstinline$cudaMemcpyAsync(dst,src,size,dir,st1);$& Async memcpy in Stream st1. (non-blocking)\\
\lstinline$cudaStreamSynchronize(st1);$ & Host waits for stream\\
\lstinline$cudaDeviceSynchronize();$ & Host waits for everything \\
\end{tabular}
\subsection{OpenMP}
\begin{tabular}{ p{7cm} l }
\lstinline$#pragma omp parallel$ & \\
\lstinline$#pragma omp for [simd]$ & \\
\lstinline$#pragma omp sections$ & \\
\lstinline$#pragma omp section$ & \\
\lstinline$#pragma omp task$ & \\
\lstinline$#pragma omp target [data]$ & \\
\lstinline$#pragma omp distribute$ & \\
\lstinline$#pragma omp teams$ & \\
\lstinline$#pragma omp target$ & \\
\lstinline$#pragma omp target$ & \\
\end{tabular}
\subsubsection{Environment variables}
\begin{tabular}{ p{7cm} l }
\lstinline$OMP_NUM_THREADS$ & \\
\lstinline$OMP_PLACES$ & \\
\lstinline$KMP_AFFINITY$ & \\
\end{tabular}
\subsubsection{Runtime API}
\begin{tabular}{ p{7cm} l }
\lstinline$omp_set_num_threads(n)$ & \\
\end{tabular}
\subsection{OpenACC}
\begin{tabular}{ p{7cm} l }
\lstinline$#pragma acc parallel [loop]$ & \\
\lstinline$#pragma acc wait$ & \\
\lstinline$#pragma acc parallel$ & \\
\lstinline$#pragma acc parallel$ & \\
\lstinline$#pragma acc parallel$ & \\
\end{tabular}
\section{LSF}
TODO
\section{Common problems and operations}
\begin{tabular}{ p{7cm} l }
Single-precision real Alpha X Plus Y & \( \vec{y} = \alpha \cdot \vec{x} + \vec{y} \) \\
(S/DAXPY, FMA, Triad) & \\
Sparse matrix vector multiply & \( \vec{y} = \mathbf{A} \cdot \vec{x} \) \\
Dense matrix vector multiply & \\
Dense matrix multiplication & \( \mathbf{A} = \mathbf{B} \cdot \mathbf{C} \) \\
Dense matrix diagonalization & \( \mathbf{P}^{-1} \mathbf{A} \mathbf{P} = diag(\lambda_1, ..., \lambda_n) \) \\
Scalar product & \( s = \vec{x} \cdot \vec{y} \) \\
Vector addition & \( \vec{y} = \vec{x} + \vec{z} \) \\
Matrix addition & \( \mathbf{A} = \mathbf{B} + \mathbf{C} \) \\
Matrix transpose & \( \mathbf{A} = \mathbf{B}' \) \\
\end{tabular}
\newpage
\section{What you've learned?}
\subsection{Why supercomputers?}
\begin{description}[style=nextline]
\item[What is a supercomputer?] A supercomputer is a computer at the frontline of current processing capacity, particularly speed of calculation.
\item[What are supercomputers used for?]
\begin{itemize}
\item Scientific computing \& simulation
\item Weapon research
\item Economics (Oil, minerals)
\item Medicine
\item Meteorology
\end{itemize}
\item[What are recent supercomputers] See TOP500 list of world's fastest systems:
\begin{enumerate}
\item Tianhe-2 (33 PFlops/s)
\item Titan (17 PFlops/s)
\end{enumerate}
\item[What can we read from the performance development as measured in the TOP500?] The TOP500 ranks systems by measured peak performance $R_{max}$ in Flops/s (double precision), obtained with a standardized benchmark (LINPACK). \\
Future trends are predictable (exascale computing, architectures).
\item[What does Moore's Law tell you? Is it still valid?] "The number of transistors per chip doubles around every one to two years." \\
Slope is now declining slowly. But mostly still valid.
\item[Why do we have multi-core architectures today?] Clock frequencies can't be pushed further due to limited cooling capabilities. \\
Solution: Decrease frequency; increase die-size by putting more cores on a single chip and exploit parallelism.
\end{description}
\newpage
\subsection{Energy Efficiency}
\begin{description}[style=nextline]
\item[What is the power wall?] 20 to 25 MegaWatt of energy consumption is considered an important barrier for HPC centres.
\item[What is the problem?] To meet the exa-scale goal in 2019, the current trend of energy efficiency isn't sufficient. \\
Power consumption will become the dominating cost factor for operating clusters in the future.
\begin{description}[style=nextline]
\item[Aspects of energy efficiency] Energy efficiency is continuously improving. To meet the exa-scale goal this has to be focused on more by using new architectures (accelerators).
\item[Physical descriptions \& limits] Landauer's principle: "Changing one bit of information needs at least an amount of energy of $\ln(2) \cdot k \cdot T$". \\
The current top Green500 systems are about 10 magnitudes away from this limitation.
\end{description}
\item[How can we benchmark energy efficiency?] Measure only the consumption of a single rack and extrapolate. Average over different benchmarks.
\begin{description}[style=nextline]
\item[Advantages \& disadvantages of Green500, SPEC Power] High variations in idle and during benchmarks and between nodes. Power extrapolation may not be allowed.
\end{description}
\item[What are the architectural aspects of energy efficiency?] Current von Neumann architectures weren't designed with energy efficiency in mind: OoO execution, cache coherency and prefetchers are inefficient. \\
Desktop/mobile power saving methods are applied to HPC (C and P-states).
\begin{description}[style=nextline]
\item[What is dynamic frequency and voltage scaling?] Reduce voltage/frequency during idle times to save energy. Implemented by processor P-states.
\end{description}
\item[What are the future plans to climb the power wall?]
\begin{itemize}
\item Optimize current architectures
\item Power-efficient accelerators
\item Reduce idle power to $\approx 0 W$
\item Load-balancing by DVS/DFS.
\end{itemize}
\begin{description}[style=nextline]
\item[What about GPU's?] GPU's are a lot more efficient because more transistors are used for computation instead of controlling/management tasks. They already use faster DDR5-RAM. But PCIe will become a bottleneck when CPUs get stacked RAM.
\item[What about stacked memory?] Is a must-have to reach the exa-scale goal. Shorter paths allow higher bandwidths and lower voltages (maintainability? extend the caching hierarchy by SSDs?).
\end{description}
\end{description}
\newpage
\subsection{Modern Processors}
\begin{description}[style=nextline]
\item[What is an ISA?] The Instruction Set Architecture is the part of an architecture which is visible to the application programmer / compiler.
\begin{description}[style=nextline]
\item[What are the differences in ISA's?]
\begin{itemize}
\item Instructions, CISC or RISC
\item Memory addressing modes
\item Internal storage, registers
\item Endianness
\end{itemize}
\item[What are the main differences between RISC and CISC?]
\begin{description}
\item[CISC] Instructions may involve complex operations and are variable in length.
\item[RISC] Similar encoding for all instructions, fixed length. Strict distinction between data loading/storing and manipulation.
\end{description}
\item[What do we have today?] HPC dominant x86 architectures are CISC based.
\end{description}
\item[What is the basis of today's processors?] They mostly decode CISC instructions to microcode ops (hybrid CISC/RISC).
\begin{description}[style=nextline]
\item[What are the corresponding bottlenecks?] \hfill
\item[How can we overcome these?] \hfill
\end{description}
\item[What is ILP?] Instruction level parallelism is the potential overlap of instruction execution caused by superscalarity, OoO and pipelines.
\begin{description}[style=nextline]
\item[What is its influence on today's computers? over time?] Today's CPUs use hardware-implemented schedulers and dependency resolvers for microcode operations => very complex controlling.
\end{description}
\item[Pipelining]
\begin{description}[style=nextline]
\item[How does it work?] Pipelining splits the execution of an instruction in multiple preferably equally long 'stages' which are orthogonal to each other. \\
This allows reusing the execution unit of one stage for the next instruction before the first instruction has retired.
\item[How to compute possible speedup/throughput?] See formulas.
\item[Problems?]
\begin{itemize}
\item Pipeline stalls caused by phases of different lengths
\item Hazards caused by dependent data, instructions, branch-misses...
\item Unbalanced CISC instruction decoding/fetching
\item Aliasing...
\end{itemize}
\end{description}
\item[Superscalarity] "Execute multiple instructions per cycle."
\begin{description}[style=nextline]
\item[What is a superscalar processor?] A superscalar processor consists of multiple FP pipelines, instruction fetchers and decoders.
It can thereby execute multiple instructions per cycle and allows a higher pipeline utilization and SMT. \\
This is a kind of ILP.
\end{description}
\item[SIMD] Single-Instruction-Multiple-Data
\begin{description}[style=nextline]
\item[How does it work?] Executing a single instruction initiates multiple integer or floating point operations on wide ``registers''/``vectors''.
\item[What are the recent vector register widths?] SSE has 128 Bits, AVX has 256 Bits, the Xeon Phi even 512 Bits with AVX2.
\item[What to think about if code vectorization is desired?]
\begin{itemize}
\item Check for automatic loop-unrolling by compilers
\item Profiling is recommended to eliminate memory bottlenecks (alignment!)
\item Avoid high level C++ abstractions which could avoid optimization!
\end{itemize}
\end{description}
\item[Why were caches introduced?] To close the increasing gap between memory bandwidth and processor performance by adding several layers of caches with increasing bandwidth.
\begin{description}[style=nextline]
\item[What is a cache?] It's a high speed memory with low capacity which is usually organized in multiple layers. Commonly the caches are integrated into the CPU die for high bandwidth.
\item[What performance characteristics does it have?] High speed: usually allows multiple transfers per cycle
\item[When does it make sense to use a cache?] If computation performance is limited by memory bandwidth ("memory-bound problems").
\item[What are cache hits/misses?] Cache hits occur when requested items are currently held in the cache. Otherwise a cache miss happens.
\item[What is the principle of locality?] An item that was referenced will soon be referenced again (temporal). Items that are close to a recently referenced item are likely to be referenced in the near future (spatial).
\end{description}
\item[Which cache types do exist \& what are their differences?]
\begin{itemize}
\item Data cache => stores data only
\item Instruction cache => stores instructions only
\item Unified cache => stores both
\item Trace/Mircro-OP cache => stores decoded instructions
\end{itemize}
\item[Why is cache coherence important?] Memory contents have to be kept consistent across multiple cores.
\begin{description}[style=nextline]
\item[How is the coherence achieved?] By using cache invalidation protocols like MESI. \\
Write-back/write-through strategies.
\end{description}
\item[When is prefetching used?] Prefetching is used to continuously fetch data from the main memory. Cache misses usually trigger prefetches of the neighboring cache line.
\end{description}
\newpage
\subsection{Basic optimization techniques for serial code}
\begin{description}[style=nextline]
\item[What is a hot spot?] A part of the code where a dominant fraction of the execution time is spent.
\item[What kind of trigger mechanisms do exist?] Event-driven and sample-driven.
\begin{description}[style=nextline]
\item[What is the difference between event-driven and sample-driven triggers?] \textbf{Event}-driven triggers count accurately every occurrence by using instrumentation or PMCs => probably high overhead. \\
\textbf{Sample}-driven triggers save the state/PC at periodic times => constant overhead.
\item[What is instrumentation?] Instrumentation is used to trigger events during runtime by putting additional instructions in an existing program. This may be done by libraries, emulators or the compiler or the programmer itself.
\end{description}
\item[What kind of recording mechanisms do exist?] Profiling and Tracing.
\begin{description}[style=nextline]
\item[What is the difference between profiling and tracing?] Profiling summarizes information about the runtime spent per routine/line/block. Tracing collects call-paths with timing information and is therefore much more detailed.
\end{description}
\item[What can hardware performance counter measure?] PMCs are MSRs/hw-counters to measure processor events like cache/branch-misses, loads/stores and much more.
\item[What are basic single-core optimizations?]
\begin{itemize}
\item Do-less work
\item Avoid expensive operations
\item Shrink the working set
\item Eliminate common subexpressions
\item Avoid branches
\item Use SIMD instruction sets
\item Support the compiler use flags wisely
\item C++
\end{itemize}
\begin{description}[style=nextline]
\item[What are simple loop optimizations?] Break-out the loop as soon as possible. Jam neighbouring loops.
\item[How to avoid expensive operations?] Use precalculated tables for fast lookup of results.
\item[What to consider with data types?] Aligning is important. Take care when using SIMD.
\item[How can temporary variables increase performance?] They can be kept in registers and therefore reduce the memory transactions.
\item[Can vectorization help for memory-bound programs?] No.
\item[What is the role of the compiler?] Register usage, inlining, general optimizations
\item[What are problems with C++ codes?] Avoid frequent (de)allocation, use static variables whenever possible, avoid using too much abstraction.
\end{description}
\end{description}
\newpage
\subsection{Data access optimization}
\begin{description}[style=nextline]
\item[What can be modelled with the balance metric?] Ratio of memory bandwidth [Words/s] to peak performance [Flops/s].
\begin{description}[style=nextline]
\item[How are lightspeed, machine and code balance defined?] See formulas.
\item[How can we get values for lightspeed, machine and code balance?] Count \# of load/stores and arithmetic operations per loop iteration.
\item[What are the limitations of this model?] Expects peak performance: saturated pipelines, superscalarity. Latency is ignored. Data transfer and calculation overlaps perfectly.
\end{description}
\item[How can algorithms be classified depending on the number of arithmetic operations and data transfers?] By the scaling behaviour of memory transactions and arithmetic operations vs. problem size: $O(N), O(N^2), ...$.
\begin{description}[style=nextline]
\item[Which algorithm types have potential for optimizations?] Memory-bound usually not so much. Take care for compute-bound problems: \( \frac{O(N^3)}{O(N^2)} \).
\item[Which optimizations should be applied?] Loop unrolling, jamming and blocking.
\end{description}
\item[Sparse matrix-vector multiply] Number of non-zero entries grows linearly with number of matrix rows.
\begin{description}[style=nextline]
\item[How can sparse matrices be stored and why?] Compressed row storage (CRS): store only non-zero entries by using additional index- and row-pointer- arrays.
\item[How is the sparse matrix-vector multiply computed and optimized?] Workingset shrinking by using CRS, loop unrolling, etc..
\end{description}
\end{description}
\newpage
\subsection{Parallel computers}
\begin{description}[style=nextline]
\item[Which taxonomy for parallel computing does exist?] Flynn's: SIMD, MIMD, (SISD, MISD)
\item[What can limit scalability?] Serial parts: synchronization and communication
\item[What does Amdahl's Law (strong scaling) say?] For a fixed data set, speedup $S_p$ converges with increasing number of workers towards $\frac{1}{s}$, where $s$ denotes the fraction of serial code.
\item[What does Gustafson's Law (weak scaling) say?] For an arbitrarily large data set, speedup $S_p$ grows linearly with the number $N$ of parallel workers: $N p + s$ with $p = 1 - s$. \\
Does not model the reality!
\item[What is an advantage of multicore processors?] Higher performance is feasible with lower clock frequencies and therefore lower energy consumption.
\begin{description}[style=nextline]
\item[Why do we have multicore processors?] Increasing the clock rate would require enormous cooling efforts which aren't economic.
\end{description}
\item[Which advantages do SMT have?] Improves the exploitation of ILP. Increase in pipelined throughput.
\item[What is a shared-memory computer?] Multiple cores are using the same main memory and address space.
\begin{description}[style=nextline]
\item[What is the difference between UMA and ccNUMA?] The flat model of uniform memory access (UMA) has equal costs for memory accesses across all cores. Their memory is connected to a shared chipset. Non-uniform memory access (NUMA) costs are dependent on the path between the memory and the socket which initiates the transfer. Memory is connected directly to sockets and/or distributed across multiple boards which are connected by custom interconnects (mostly proprietary: QPI, HT).
\end{description}
\item[What is a distributed-memory computer?] Unlike (N)UMA systems, so called "No Remote Memory Access" (NORMA) systems don't have a common address space. Their nodes are coupled via networks and use message passing to communicate.
\begin{description}[style=nextline]
\item[How do hybrid systems look like?] They combine shared and distributed memory aspects in one cluster. Multiple shared memory nodes are coupled to a distributed system => multiple isles of cache coherence!
\end{description}
\end{description}
\subsubsection{Networks}
\begin{description}[style=nextline]
\item[Which network performance metrics do exist?]
\begin{itemize}
\item Bisection (band)width $b$: \# of edges which have to be removed to split the network into two (equal sized) halves.
\item Diameter $dm$: Max distance between two nodes/processors.
\item Edge connectivity $e$: Min \# of edges that must be removed to break the network into two disconnected parts.
\item Node connectivity $\delta$: Max \# of edges in the graph per node (max degree).
\end{itemize}
\begin{description}[style=nextline]
\item[What can the PingPong benchmark model (latency, effective bandwidth)?] \textbf{Small} messages have larger overhead and therefore lower effective bandwidth (latency dominates). \\
\textbf{Large} messages have less overhead and better effective bandwidth (latency plays no role).
\item[What does the LogP model do?] Generic model for scaling behaviour of connections between parallel systems. It includes:
\begin{description}
\item[L] Upper boundary on latency for one transfer
\item[o] Overhead: time that a processor is involved with the transfer
\item[g] Gap: minimum time interval between two consecutive transfers.
\item[P] The number of processors
\end{description}
\item[What are the definitions for bisection bandwidth, diameter \& edge connectivity?] See above.
\end{description}
\item[What are relevant network topologies in HPC?] See table above.
\begin{description}[style=nextline]
\item[What are the bisection bandwidth, diameter, node \& edge connectivity of networks such as Ring, Bus, Switched/Fat Tree, Meshes, Hypercubes, Tori, Hybrids?] See tables above.
\end{description}
\end{description}
\newpage
\subsection{Parallelization and optimization strategies}
\begin{description}[style=nextline]
\item[Which types of parallelism do exist?] \hfill
\begin{description}[style=nextline]
\item[What overhead do the different types have \& how can it be reduced?] \hfill
\item[What is a task dependency graph \& its critical path and average concurrency?] \hfill
\end{description}
\item[Which parallel design spaces \& patterns are defined by Mattson?] \hfill
\begin{description}[style=nextline]
\item[What is adaptive geometric decomposition \& adaptive mesh refinement?] \hfill
\item[What does load imbalance mean? Which types do exist? How to get a load balanced application?] \hfill
\item[What are differences between SPMD, Master/ Worker, Loop Parallelism and Fork/ Join? Name a typical example.] \hfill
\item[What does NUMA affinity mean? How can memory placement and binding
affect performance?] \hfill
\end{description}
\item[Which are typical approaches in a Molecular Dynamics application?] \hfill
\end{description}
\newpage
\subsection{Distributed-memory programming with MPI}
\begin{description}[style=nextline]
\item[Basic ideas of MPI] \hfill
\begin{description}[style=nextline]
\item[What is SPMD and how is it implemented by MPI?] \hfill
\item[How does MPI abstract the communication?] \hfill
\end{description}
\item[Point-to-point operations] \hfill
\begin{description}[style=nextline]
\item[How to transfer data between processes in the form of messages?] \hfill
\item[How to prevent deadlocks and overlap communication and computation?] \hfill
\end{description}
\item[Collective operations] \hfill
\begin{description}[style=nextline]
\item[How to scatter and gather data and perform operations on distributed data?] \hfill
\end{description}
\item[Virtual topologies] \hfill
\begin{description}[style=nextline]
\item[How to distribute processes over a regular grid?] \hfill
\end{description}
\item[Derived datatypes] \hfill
\begin{description}[style=nextline]
\item[How to combine MPI datatypes into more complex entities?] \hfill
\end{description}
\end{description}
\newpage
\subsection{Shared-memory programming with OpenMP}
\begin{description}[style=nextline]
\item[Basic principle of OpenMP] \hfill
\begin{description}[style=nextline]
\item[Execution model] \hfill
\item[Parallel region + worksharing constructs] \hfill
\end{description}
\item[Scoping] \hfill
\begin{description}[style=nextline]
\item[Data sharing clauses] \hfill
\end{description}
\item[Synchronization] \hfill
\begin{description}[style=nextline]
\item[Critical section] \hfill
\item[Reduction clause] \hfill
\item[Team and Task-Barriers] \hfill
\end{description}
\item[Runtime library] \hfill
\begin{description}[style=nextline]
\item[Important functions] \hfill
\end{description}
\end{description}
\newpage
\subsection{Hybrid programming (MPI + OpenMP)}
\begin{description}[style=nextline]
\item[Hybrid programming basics] \hfill
\begin{description}[style=nextline]
\item[Why we need hybrid programs] \hfill
\item[Hybrid programming models] \hfill
\end{description}
\item[Threading modes of MPI] \hfill
\begin{description}[style=nextline]
\item[What levels of thread support MPI provides] \hfill
\item[Potential troubles with multithreaded MPI programs] \hfill
\end{description}
\item[Addressing multiple threads within MPI processes] \hfill
\begin{description}[style=nextline]
\item[How to work around the flat addressing space of MPI] \hfill
\item[Using multiple communicators in a hybrid context] \hfill
\end{description}
\item[Running hybrid programs on the RWTH cluster] \hfill
\begin{description}[style=nextline]
\item[How to properly instruct LSF to run your hybrid job] \hfill
\end{description}
\end{description}
\newpage
\subsection{Heterogeneous architectures (GPUs, Xeon Phis)}
\subsubsection{GPGPU's}
\begin{description}[style=nextline]
\item[What does a GPU look like?] \hfill
\begin{description}[style=nextline]
\item[Why do GPUs deliver a good performance per Watt ratio?] \hfill
\item[What is the difference to CPUs?] \hfill
\item[What does the memory hierarchy look like?] \hfill
\item[How can the logical programming hierarchy be mapped to the execution model?] \hfill
\end{description}
\item[Which models can be used to program a GPU?] \hfill
\begin{description}[style=nextline]
\item[How to handle offloading of regions?] \hfill
\item[How to handle data management?] \hfill
\item[What are the main differences?] \hfill
\end{description}
\item[Which impact does the PCIe have?] \hfill
\item[What is branch divergence?] \hfill
\begin{description}[style=nextline]
\item[Which performance impact does it have?] \hfill
\end{description}
\item[What can be a good launch configuration and why?] \hfill
\begin{description}[style=nextline]
\item[How can the launch configuration be specified?] \hfill
\end{description}
\item[What can be done to saturate the bus?] \hfill
\begin{description}[style=nextline]
\item[What is coalescing?] \hfill
\item[How can it be achieved?] \hfill
\item[What impact does AoS and SoA have?] \hfill
\item[What is the difference between caching and non-caching loads?] \hfill
\end{description}
\item[What is shared memory?] \hfill
\item[How to avoid CPU/ GPU idling?] \hfill
\end{description}
\subsubsection{Xeon Phi}
\begin{description}[style=nextline]
\item[What does a Xeon Phi look like?] \hfill
\begin{description}[style=nextline]
\item[What does the memory hierarchy look like?] \hfill
\item[How many threads/ vector-widths are available?] \hfill
\end{description}
\item[Which programming concepts do exist?] \hfill
\begin{description}[style=nextline]
\item[How can OpenMP (4.0) be used?] \hfill
\end{description}
\item[Which optimization strategies should be applied?] \hfill
\begin{description}[style=nextline]
\item[Which impact can the PCIe have?] \hfill
\item[Which impact does vectorization have?] \hfill
\end{description}
\end{description}
\end{document}