-
Notifications
You must be signed in to change notification settings - Fork 55
/
the-r-in-spark.toc
113 lines (113 loc) · 8.17 KB
/
the-r-in-spark.toc
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
\contentsline {chapter}{Preface}{5}{chapter*.2}
\contentsline {chapter}{\numberline {1}Introduction}{7}{chapter.1}
\contentsline {section}{\numberline {1.1}Background}{7}{section.1.1}
\contentsline {section}{\numberline {1.2}Spark}{9}{section.1.2}
\contentsline {section}{\numberline {1.3}R}{10}{section.1.3}
\contentsline {section}{\numberline {1.4}sparklyr}{12}{section.1.4}
\contentsline {chapter}{\numberline {2}Installation}{15}{chapter.2}
\contentsline {section}{\numberline {2.1}Prerequisites}{15}{section.2.1}
\contentsline {subsection}{\numberline {2.1.1}Install R}{15}{subsection.2.1.1}
\contentsline {subsection}{\numberline {2.1.2}Install Java}{15}{subsection.2.1.2}
\contentsline {subsection}{\numberline {2.1.3}Install RStudio}{18}{subsection.2.1.3}
\contentsline {subsection}{\numberline {2.1.4}Install sparklyr}{18}{subsection.2.1.4}
\contentsline {section}{\numberline {2.2}Installing Spark}{18}{section.2.2}
\contentsline {section}{\numberline {2.3}Connecting to Spark}{20}{section.2.3}
\contentsline {section}{\numberline {2.4}Using Spark}{20}{section.2.4}
\contentsline {subsection}{\numberline {2.4.1}Web Interface}{20}{subsection.2.4.1}
\contentsline {subsection}{\numberline {2.4.2}Logs}{21}{subsection.2.4.2}
\contentsline {subsection}{\numberline {2.4.3}RStudio}{21}{subsection.2.4.3}
\contentsline {section}{\numberline {2.5}Disconnecting}{21}{section.2.5}
\contentsline {section}{\numberline {2.6}Recap}{21}{section.2.6}
\contentsline {chapter}{\numberline {3}Analysis}{23}{chapter.3}
\contentsline {section}{\numberline {3.1}dplyr}{23}{section.3.1}
\contentsline {section}{\numberline {3.2}DBI}{24}{section.3.2}
\contentsline {chapter}{\numberline {4}Modeling}{25}{chapter.4}
\contentsline {section}{\numberline {4.1}Overview}{25}{section.4.1}
\contentsline {section}{\numberline {4.2}Supervised}{26}{section.4.2}
\contentsline {section}{\numberline {4.3}Unsupervised}{26}{section.4.3}
\contentsline {subsection}{\numberline {4.3.1}K-Means Clustering}{26}{subsection.4.3.1}
\contentsline {subsection}{\numberline {4.3.2}Gaussian Mixture Clustering}{27}{subsection.4.3.2}
\contentsline {section}{\numberline {4.4}Broom}{27}{section.4.4}
\contentsline {section}{\numberline {4.5}Pipelines}{28}{section.4.5}
\contentsline {chapter}{\numberline {5}Clusters}{29}{chapter.5}
\contentsline {section}{\numberline {5.1}Overview}{29}{section.5.1}
\contentsline {section}{\numberline {5.2}Managers}{30}{section.5.2}
\contentsline {subsection}{\numberline {5.2.1}Standalone}{30}{subsection.5.2.1}
\contentsline {subsection}{\numberline {5.2.2}Yarn}{32}{subsection.5.2.2}
\contentsline {subsection}{\numberline {5.2.3}Mesos}{32}{subsection.5.2.3}
\contentsline {subsection}{\numberline {5.2.4}Kubernetes}{37}{subsection.5.2.4}
\contentsline {section}{\numberline {5.3}On-Premise}{37}{section.5.3}
\contentsline {subsection}{\numberline {5.3.1}Cloudera}{37}{subsection.5.3.1}
\contentsline {subsection}{\numberline {5.3.2}Hortonworks}{37}{subsection.5.3.2}
\contentsline {subsection}{\numberline {5.3.3}MapR}{37}{subsection.5.3.3}
\contentsline {section}{\numberline {5.4}Cloud}{37}{section.5.4}
\contentsline {subsection}{\numberline {5.4.1}Amazon}{42}{subsection.5.4.1}
\contentsline {subsection}{\numberline {5.4.2}Google}{42}{subsection.5.4.2}
\contentsline {subsection}{\numberline {5.4.3}Microsoft}{42}{subsection.5.4.3}
\contentsline {section}{\numberline {5.5}Tools}{42}{section.5.5}
\contentsline {subsection}{\numberline {5.5.1}RStudio}{42}{subsection.5.5.1}
\contentsline {subsection}{\numberline {5.5.2}Livy}{42}{subsection.5.5.2}
\contentsline {section}{\numberline {5.6}Recap}{48}{section.5.6}
\contentsline {chapter}{\numberline {6}Connections}{49}{chapter.6}
\contentsline {section}{\numberline {6.1}Overview}{49}{section.6.1}
\contentsline {subsection}{\numberline {6.1.1}Edge Nodes}{50}{subsection.6.1.1}
\contentsline {subsection}{\numberline {6.1.2}Spark Home}{50}{subsection.6.1.2}
\contentsline {section}{\numberline {6.2}Types}{51}{section.6.2}
\contentsline {subsection}{\numberline {6.2.1}Local}{51}{subsection.6.2.1}
\contentsline {subsection}{\numberline {6.2.2}Standalone}{52}{subsection.6.2.2}
\contentsline {subsection}{\numberline {6.2.3}Yarn}{52}{subsection.6.2.3}
\contentsline {subsubsection}{\numberline {6.2.3.1}Yarn Client}{52}{subsubsection.6.2.3.1}
\contentsline {subsubsection}{\numberline {6.2.3.2}Yarn Cluster}{53}{subsubsection.6.2.3.2}
\contentsline {subsection}{\numberline {6.2.4}Livy}{53}{subsection.6.2.4}
\contentsline {subsection}{\numberline {6.2.5}Mesos}{55}{subsection.6.2.5}
\contentsline {subsection}{\numberline {6.2.6}Kubernetes}{55}{subsection.6.2.6}
\contentsline {section}{\numberline {6.3}Troubleshooting}{56}{section.6.3}
\contentsline {subsection}{\numberline {6.3.1}Logging}{56}{subsection.6.3.1}
\contentsline {subsection}{\numberline {6.3.2}Spark Submit}{56}{subsection.6.3.2}
\contentsline {subsection}{\numberline {6.3.3}Multiple}{57}{subsection.6.3.3}
\contentsline {subsection}{\numberline {6.3.4}Windows}{57}{subsection.6.3.4}
\contentsline {subsection}{\numberline {6.3.5}Submit Manually}{57}{subsection.6.3.5}
\contentsline {section}{\numberline {6.4}Recap}{58}{section.6.4}
\contentsline {chapter}{\numberline {7}Tuning}{59}{chapter.7}
\contentsline {section}{\numberline {7.1}Overview}{59}{section.7.1}
\contentsline {section}{\numberline {7.2}Configuration}{59}{section.7.2}
\contentsline {section}{\numberline {7.3}Caching}{59}{section.7.3}
\contentsline {section}{\numberline {7.4}Partitions}{61}{section.7.4}
\contentsline {section}{\numberline {7.5}Shuffling}{61}{section.7.5}
\contentsline {section}{\numberline {7.6}Checkpointing}{61}{section.7.6}
\contentsline {section}{\numberline {7.7}Troubleshooting}{61}{section.7.7}
\contentsline {subsection}{\numberline {7.7.1}Graph Visualization}{61}{subsection.7.7.1}
\contentsline {subsection}{\numberline {7.7.2}Event Timeline}{61}{subsection.7.7.2}
\contentsline {section}{\numberline {7.8}Recap}{61}{section.7.8}
\contentsline {chapter}{\numberline {8}Extensions}{65}{chapter.8}
\contentsline {section}{\numberline {8.1}Using Extensions}{65}{section.8.1}
\contentsline {subsection}{\numberline {8.1.1}RSparkling}{65}{subsection.8.1.1}
\contentsline {subsection}{\numberline {8.1.2}GraphFrames}{66}{subsection.8.1.2}
\contentsline {subsection}{\numberline {8.1.3}Mleap}{66}{subsection.8.1.3}
\contentsline {section}{\numberline {8.2}Writing Extensions}{67}{section.8.2}
\contentsline {subsection}{\numberline {8.2.1}RStudio Projects}{67}{subsection.8.2.1}
\contentsline {chapter}{\numberline {9}Distributed R}{69}{chapter.9}
\contentsline {section}{\numberline {9.1}Use Cases}{69}{section.9.1}
\contentsline {section}{\numberline {9.2}Grouping}{69}{section.9.2}
\contentsline {section}{\numberline {9.3}Packages}{69}{section.9.3}
\contentsline {section}{\numberline {9.4}Restrictions}{69}{section.9.4}
\contentsline {section}{\numberline {9.5}Troubleshooting}{69}{section.9.5}
\contentsline {chapter}{\numberline {10}Streaming}{71}{chapter.10}
\contentsline {section}{\numberline {10.1}Overview}{71}{section.10.1}
\contentsline {section}{\numberline {10.2}Transformations}{72}{section.10.2}
\contentsline {subsection}{\numberline {10.2.1}dplyr}{73}{subsection.10.2.1}
\contentsline {subsection}{\numberline {10.2.2}Pipelines}{74}{subsection.10.2.2}
\contentsline {subsection}{\numberline {10.2.3}R Code}{75}{subsection.10.2.3}
\contentsline {section}{\numberline {10.3}Shiny}{75}{section.10.3}
\contentsline {section}{\numberline {10.4}Formats}{76}{section.10.4}
\contentsline {chapter}{\numberline {11}Contributing}{77}{chapter.11}
\contentsline {section}{\numberline {11.1}Overview}{77}{section.11.1}
\contentsline {section}{\numberline {11.2}Serialization}{77}{section.11.2}
\contentsline {section}{\numberline {11.3}Invocations}{77}{section.11.3}
\contentsline {section}{\numberline {11.4}R Packages}{77}{section.11.4}
\contentsline {section}{\numberline {11.5}Connections}{77}{section.11.5}
\contentsline {section}{\numberline {11.6}Distributed R}{77}{section.11.6}
\contentsline {chapter}{Appendix}{79}{chapter*.3}
\contentsline {section}{\numberline {11.7}World's Store Capacity}{79}{section.11.7}
\contentsline {section}{\numberline {11.8}Daily downloads of CRAN packages}{79}{section.11.8}
\contentsline {section}{\numberline {11.9}Google trends for mainframes, cloud computing and Kubernetes}{80}{section.11.9}