-
Notifications
You must be signed in to change notification settings - Fork 2
/
rstudio_sparklyr_emr5.sh
441 lines (383 loc) · 12.5 KB
/
rstudio_sparklyr_emr5.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
#!/bin/bash
# -x: trace every command to the bootstrap log; -e: abort on the first failing command
set -x -e
# AWS EMR bootstrap script
# for installing RStudio (and Shiny) with SparkR, SparklyR, etc on AWS EMR 4.x and 5.x
#
# 2014-09-24 - schmidbe@amazon.de initial version for RHadoop packages and RStudio
# 2015-07-14 - Tom Zeng tomzeng at amazon.com, modified on top of Christopher Bozeman's "--sparkr" change to add "--sparkr-pkg"
# 2015-07-29 - Tom Zeng tomzeng at amazon.com, converted to AMI 4.0.0 compatible
# 2016-01-15 - Tom Zeng tomzeng at amazon.com, converted to AMI 4.2.0 compatible and added shiny
# 2016-10-07 - Tom Zeng tomzeng at amazon.com, added Sparklyr and improved install speed by 2-3x
# 2016-11-04 - Tom Zeng tomzeng at amazon.com, added RStudio 1.0, and used function rather than separate script for child process, removed --sparkr-pkg
# 2017-05-26 - Tom Zeng tomzeng at amazon.com, fixed the Shiny install typo, thanks to David Howell for spotting it
# 2018-09-23 - Tom Zeng tomzeng at amazon.com, fixed issues with the R 3.4 upgrade, and added CloudyR
# 2019-09-10 - Javier Luraschi javier at rstudio.com, support for arrow and improvements
# Usage:
# --no-rstudio - don't install rstudio-server
# --rstudio-url - the url for the RStudio RPM file
# --sparklyr - install RStudio's sparklyr package
# --sparkr - install SparkR package
# --shiny - install Shiny server
# --shiny-url - the url for the Shiny RPM file
# --nopackages - Do not install basic packages
#
# --user - set user for rstudio, default "hadoop"
# --user-pw - set user-pw for user USER, default "hadoop"
# --rstudio-port - set rstudio port, default 8787
#
# --rexamples - add R examples to the user home dir, default false
# --rhdfs - install rhdfs package, default false
# --plyrmr - install plyrmr package, default false
# --no-updateR - don't update latest R version
# --latestR - install latest R version, default false (build from source - caution, may cause problem with RStudio)
# --cloudyr - install the CloudyR packages
#
# --arrow - install Apache Arrow
# --arrow-version - the version of Apache Arrow to install
# check for master node: the EMR instance metadata file contains an
# "isMaster" entry that is true only on the master.
IS_MASTER=false
# -q on the second grep: we only need the exit status, not the matched
# line echoed to stdout (the original printed the match into the log).
if grep isMaster /mnt/var/lib/info/instance.json | grep -q true
then
  IS_MASTER=true
fi
# Print an error message to stderr; does not abort the script.
error_msg ()
{
  printf 'Error: %s\n' "$1" >&2
}
# get input parameters
# Defaults below; each may be overridden by the command-line flags parsed next.
RSTUDIO=true
SHINY=false
REXAMPLES=false
# unix login (and password) created for RStudio access
USER="hadoop"
USERPW="hadoop"
PLYRMR=false
RHDFS=false
# UPDATER: update R from the distro repo; LATEST_R: build R from source instead
UPDATER=true
LATEST_R=false
RSTUDIOPORT=8787
SPARKR=false
SPARKLYR=false
RSTUDIO_URL="https://download2.rstudio.org/rstudio-server-rhel-1.1.463-x86_64.rpm"
MIN_USER_ID=400 # default is 500 starting from 1.0.44, EMR hadoop user id is 498
SHINY_URL="https://download3.rstudio.org/centos6.3/x86_64/shiny-server-1.5.9.923-x86_64.rpm"
CLOUDYR=false
ARROW=false
ARROW_VERSION="0.14.1"
# NOTE(review): PACKAGES appears unused below — INSTALL_PACKAGES is the flag
# actually consulted; confirm before removing.
PACKAGES=false
INSTALL_PACKAGES=true
# Parse command-line flags. Unknown options are reported via error_msg but do
# not abort; the first non-option argument stops parsing.
while [ $# -gt 0 ]; do
  case "$1" in
    --sparklyr)      SPARKLYR=true ;;
    --rstudio)       RSTUDIO=true ;;
    --rstudio-url)   shift; RSTUDIO_URL=$1 ;;
    --no-rstudio)    RSTUDIO=false ;;
    --shiny)         SHINY=true ;;
    --shiny-url)     shift; SHINY_URL=$1 ;;
    --rexamples)     REXAMPLES=true ;;
    --plyrmr)        PLYRMR=true ;;
    --rhdfs)         RHDFS=true ;;
    --updateR)       UPDATER=true ;;
    --no-updateR)    UPDATER=false ;;
    --latestR)       LATEST_R=true; UPDATER=false ;;
    --sparkr)        SPARKR=true ;;
    --rstudio-port)  shift; RSTUDIOPORT=$1 ;;
    --user)          shift; USER=$1 ;;
    --user-pw)       shift; USERPW=$1 ;;
    --cloudyr)       CLOUDYR=true ;;
    --arrow)         ARROW=true ;;
    --arrow-version) shift; ARROW_VERSION=$1 ;;
    --nopackages)    INSTALL_PACKAGES=false ;;
    -*)
      # do not exit out, just note failure
      error_msg "unrecognized option: $1"
      ;;
    *)
      break
      ;;
  esac
  shift
done
# On the master, drop a marker file so other bootstrap actions can detect that
# this BA is still running; child_process removes it once installation is done.
if [ "$IS_MASTER" = true ]; then
# signal to other BAs that this BA is running
date > /tmp/rstudio_sparklyr_emr5.tmp
fi
# compile R packages with parallel make wherever $MAKE is honored
export MAKE='make -j 8'
# system libraries needed to build common R packages (X11, curl, git, gmp/ffi compat)
sudo yum install -y xorg-x11-xauth.x86_64 xorg-x11-server-utils.x86_64 xterm libXt libX11-devel libXt-devel libcurl-devel git compat-gmp4 compat-libffi5
# base R from the distro repository
sudo yum install R-core R-base R-core-devel R-devel -y
# install latest R version from AWS Repo
if [ "$UPDATER" = true ]; then
sudo yum update R-core R-base R-core-devel R-devel -y
# yum saves updated config files as *.rpmnew next to the originals;
# promote them so the updated R actually uses the new Makeconf/ldpaths
if [ -f /usr/lib64/R/etc/Makeconf.rpmnew ]; then
sudo cp /usr/lib64/R/etc/Makeconf.rpmnew /usr/lib64/R/etc/Makeconf
fi
if [ -f /usr/lib64/R/etc/ldpaths.rpmnew ]; then
sudo cp /usr/lib64/R/etc/ldpaths.rpmnew /usr/lib64/R/etc/ldpaths
fi
fi
# create rstudio user on all machines
# we need a unix user with home directory and password and hadoop permission
if [ "$USER" != "hadoop" ]; then
  # quote: a user name containing spaces/globs must stay one argument
  sudo adduser "$USER"
fi
# set the user's password non-interactively
sudo sh -c "echo '$USERPW' | passwd $USER --stdin"
# -p: don't abort under `set -e` if the scratch dir already exists (e.g. re-run)
mkdir -p /mnt/r-stuff
cd /mnt/r-stuff
# update to latest R version, built from source. Triggered by --latestR, and
# also by --arrow (presumably arrow's R bindings need a newer R than the
# distro package — confirm against the arrow install docs).
if [ "$LATEST_R" = true ] || [ "$ARROW" = true ]; then
pushd .
mkdir R-latest
cd R-latest
wget http://cran.r-project.org/src/base/R-latest.tar.gz
tar -xzf R-latest.tar.gz
# compilers and image/readline headers required by R's configure
sudo yum install -y gcc gcc-c++ gcc-gfortran
sudo yum install -y readline-devel cairo-devel libpng-devel libjpeg-devel libtiff-devel
# assumes the tarball unpacks to an R-3.x directory — TODO confirm for R 4.x
cd R-3*
./configure --with-readline=yes --enable-R-profiling=no --enable-memory-profiling=no --enable-R-shlib --with-pic --prefix=/usr --with-x --with-libpng --with-jpeglib --with-cairo --enable-R-shlib --with-recommended-packages=yes
make -j 8
sudo make install
# NOTE(review): this here-doc is unquoted, so ${PWD} and $PATH expand in THIS
# shell before root runs it — the PATH entry appended to /etc/profile points at
# the build directory, while R was installed with --prefix=/usr. Looks
# questionable; confirm intent before relying on it.
sudo su << BASH_SCRIPT
echo '
export PATH=${PWD}/bin:$PATH
' >> /etc/profile
BASH_SCRIPT
popd
fi
# Build and install Apache Arrow C++ from source (needed by the arrow R package).
if [ "$ARROW" = true ]; then
pushd .
cd /mnt/r-stuff
# install dependencies
sudo yum install -y boost
sudo yum install -y boost-devel
sudo yum install -y autoconf
sudo yum install -y flex
sudo yum install -y bison
sudo yum install -y libssh2-devel
# install cmake 3.x (the distro cmake is too old for arrow's build)
wget https://cmake.org/files/v3.10/cmake-3.10.0.tar.gz
tar -xvzf cmake-3.10.0.tar.gz
cd cmake-3.10.0
./bootstrap
make
sudo make install
# make the freshly installed cmake visible to this shell
export PATH=$PATH:/usr/local/bin/
cd ..
# install arrow
wget http://archive.apache.org/dist/arrow/arrow-$ARROW_VERSION/apache-arrow-$ARROW_VERSION.tar.gz
tar -xvzf apache-arrow-$ARROW_VERSION.tar.gz
cd apache-arrow-$ARROW_VERSION/cpp
# out-of-tree build directory
mkdir release
cd release
cmake -DARROW_BUILD_TESTS=ON -DARROW_PARQUET=ON ..
# `make install` both builds and installs the default targets
sudo make install
popd
fi
# make R's own package builds use parallel make by patching Renviron
sudo sed -i 's/make/make -j 8/g' /usr/lib64/R/etc/Renviron
# set unix environment variables (Hadoop/Java paths) for all login shells
sudo su << BASH_SCRIPT
echo '
export HADOOP_HOME=/usr/lib/hadoop
export HADOOP_CMD=/usr/bin/hadoop
export HADOOP_STREAMING=/usr/lib/hadoop-mapreduce/hadoop-streaming.jar
export JAVA_HOME=/etc/alternatives/jre
' >> /etc/profile
BASH_SCRIPT
# NOTE(review): sourcing /etc/profile inside a child `sh -c` cannot affect this
# script's environment — this line is effectively a no-op; confirm before removal.
sudo sh -c "source /etc/profile"
# fix hadoop tmp permission
sudo chmod 777 -R /mnt/var/lib/hadoop/tmp
# fix java binding - R and packages have to be compiled with the same java version as hadoop
sudo R CMD javareconf
# install rstudio
# only run if master node
# Use two [ ] tests joined by && instead of the obsolescent -a operator.
if [ "$IS_MASTER" = true ] && [ "$RSTUDIO" = true ]; then
  # install Rstudio server from the RPM at $RSTUDIO_URL
  # please check and update for latest RStudio version
  RSTUDIO_FILE=$(basename "$RSTUDIO_URL")
  wget "$RSTUDIO_URL"
  sudo yum install --nogpgcheck -y "$RSTUDIO_FILE"
  # change port - 8787 will not work for many companies
  sudo sh -c "echo 'www-port=$RSTUDIOPORT' >> /etc/rstudio/rserver.conf"
  # allow low-uid logins (EMR's hadoop uid 498 is below RStudio's default 500 floor)
  sudo sh -c "echo 'auth-minimum-user-id=$MIN_USER_ID' >> /etc/rstudio/rserver.conf"
  # lower the PAM minimum uid for the same reason
  sudo perl -p -i -e "s/= 5../= 100/g" /etc/pam.d/rstudio
  # restart so the config changes take effect; stop may fail if not yet running
  sudo rstudio-server stop || true
  sudo rstudio-server start
fi
# add examples to user
# only run if master node
# Use two [ ] tests joined by && instead of the obsolescent -a operator.
if [ "$IS_MASTER" = true ] && [ "$REXAMPLES" = true ]; then
  # and copy R example scripts to user's home dir and set permission
  wget --no-check-certificate https://raw.githubusercontent.com/tomz/emr-bootstrap-actions/master/R/Hadoop/examples/rmr2_example.R
  wget --no-check-certificate https://raw.githubusercontent.com/tomz/emr-bootstrap-actions/master/R/Hadoop/examples/biganalyses_example.R
  wget --no-check-certificate https://raw.githubusercontent.com/tomz/emr-bootstrap-actions/master/R/Hadoop/examples/change_pw.R
  #sudo cp -p *.R /home/$USER/.
  sudo mv *.R "/home/$USER/."
  sudo chown "$USER:$USER" -Rf "/home/$USER"
fi
# install base CRAN packages and rmr2 on every node (skipped with --nopackages)
if [ "$INSTALL_PACKAGES" = true ]; then
# install required packages
sudo R --no-save << R_SCRIPT
install.packages(c('RJSONIO', 'itertools', 'digest', 'Rcpp', 'functional', 'httr', 'plyr', 'stringr', 'reshape2', 'caTools', 'rJava', 'devtools', 'DBI', 'ggplot2', 'dplyr', 'R.methodsS3', 'Hmisc', 'memoise', 'rjson'),
repos="http://cran.rstudio.com")
# here you can add your required packages which should be installed on ALL nodes
# install.packages(c(''), repos="http://cran.rstudio.com", INSTALL_opts=c('--byte-compile') )
R_SCRIPT
# install rmr2 package
pushd .
rm -rf RHadoop
mkdir RHadoop
cd RHadoop
curl --insecure -L https://github.com/RevolutionAnalytics/rmr2/releases/download/3.3.1/rmr2_3.3.1.tar.gz | tar zx
sudo R CMD INSTALL --byte-compile rmr2
popd
fi
# install rhdfs package
if [ "$RHDFS" = true ]; then
curl --insecure -L https://raw.github.com/RevolutionAnalytics/rhdfs/master/build/rhdfs_1.0.8.tar.gz | tar zx
# --no-test-load skips loading the package after install — presumably because
# rhdfs needs a Hadoop environment at load time; confirm
sudo R CMD INSTALL --byte-compile --no-test-load rhdfs
fi
# install plyrmr package
if [ "$PLYRMR" = true ]; then
curl --insecure -L https://github.com/RevolutionAnalytics/plyrmr/releases/download/0.6.0/plyrmr_0.6.0.tar.gz | tar zx
sudo R CMD INSTALL --byte-compile plyrmr
fi
# install the CloudyR AWS packages from their drat repository
if [ "$CLOUDYR" = true ]; then
sudo R --no-save << R_SCRIPT
install.packages(c("base64enc","drat"),repos = "http://cran.us.r-project.org")
drat::addRepo("cloudyr", "http://cloudyr.github.io/drat")
install.packages(c("aws.signature","aws.ec2metadata","aws.efs"), repos = c(cloudyr = "http://cloudyr.github.io/drat"))
R_SCRIPT
fi
# The following code runs as a background child process which waits for cluster
# dependencies (Spark, HDFS) to be installed before proceeding.
child_process() {
if [ "$SPARKR" = true ] || [ "$SPARKLYR" = true ]; then
# Quoted 'EOF' keeps ${PWD}/${PATH} unexpanded in the file written below —
# presumably R expands the ${...} references itself when it reads Renviron
# at startup; confirm against R's Renviron documentation.
cat << 'EOF' > /tmp/Renvextra
JAVA_HOME="/etc/alternatives/jre"
HADOOP_HOME_WARN_SUPPRESS="true"
HADOOP_HOME="/usr/lib/hadoop"
HADOOP_PREFIX="/usr/lib/hadoop"
HADOOP_MAPRED_HOME="/usr/lib/hadoop-mapreduce"
HADOOP_YARN_HOME="/usr/lib/hadoop-yarn"
HADOOP_COMMON_HOME="/usr/lib/hadoop"
HADOOP_HDFS_HOME="/usr/lib/hadoop-hdfs"
YARN_HOME="/usr/lib/hadoop-yarn"
HADOOP_CONF_DIR="/usr/lib/hadoop/etc/hadoop/"
YARN_CONF_DIR="/usr/lib/hadoop/etc/hadoop/"
HIVE_HOME="/usr/lib/hive"
HIVE_CONF_DIR="/usr/lib/hive/conf"
HBASE_HOME="/usr/lib/hbase"
HBASE_CONF_DIR="/usr/lib/hbase/conf"
SPARK_HOME="/usr/lib/spark"
SPARK_CONF_DIR="/usr/lib/spark/conf"
PATH=${PWD}:${PATH}
EOF
cat /tmp/Renvextra | sudo tee -a /usr/lib64/R/etc/Renviron
# wait for the Spark history-server pid file as a proxy for "Spark is installed"
while [ ! -f /var/run/spark/spark-history-server.pid ]
do
sleep 5
done
fi
# install SparkR from the local Spark distribution
if [ "$SPARKR" = true ]; then
# scratch directories Spark expects to be writable by everyone
sudo mkdir /mnt/spark
sudo chmod a+rwx /mnt/spark
if [ -d /mnt1 ]; then
sudo mkdir /mnt1/spark
sudo chmod a+rwx /mnt1/spark
fi
sudo R --no-save << R_SCRIPT
library(devtools)
install('/usr/lib/spark/R/lib/SparkR')
# here you can add your required packages which should be installed on ALL nodes
# install.packages(c(''), repos="http://cran.rstudio.com", INSTALL_opts=c('--byte-compile') )
R_SCRIPT
fi
# sparklyr from GitHub, plus sample-data packages used in its tutorials
if [ "$SPARKLYR" = true ]; then
sudo R --no-save << R_SCRIPT
library(devtools)
devtools::install_github("rstudio/sparklyr")
install.packages(c('nycflights13', 'Lahman', 'data.table'),
repos="http://cran.rstudio.com" )
R_SCRIPT
fi
# remaining CloudyR packages (aws.s3 / aws.ec2)
if [ "$CLOUDYR" = true ]; then
sudo R --no-save << R_SCRIPT
install.packages(c("aws.s3","aws.ec2"), repos = c(cloudyr = "http://cloudyr.github.io/drat"))
R_SCRIPT
fi
if [ "$IS_MASTER" = true ]; then
if [ "$SHINY" = true ]; then
# install Shiny server
SHINY_FILE=$(basename $SHINY_URL)
wget $SHINY_URL
sudo yum install --nogpgcheck -y $SHINY_FILE
sudo R --no-save <<R_SCRIPT
install.packages(c('shiny','rmarkdown'),
repos="http://cran.rstudio.com")
R_SCRIPT
fi
# remove the "BA running" marker created at the start of the script
sudo rm -f /tmp/rstudio_sparklyr_emr5.tmp
# the following are needed only when not logging in as hadoop
if [ "$USER" != "hadoop" ]; then
# wait for the HDFS namenode before creating the user's HDFS home directory
while [ ! -f /var/run/hadoop-hdfs/hadoop-hdfs-namenode.pid ]
do
sleep 5
done
sudo -u hdfs hdfs dfs -mkdir /user/$USER
sudo -u hdfs hdfs dfs -chown $USER:$USER /user/$USER
sudo -u hdfs hdfs dfs -chmod -R 777 /user/$USER
fi
sudo rstudio-server restart || true
fi # IS_MASTER
echo "rstudio server and packages installation completed"
} # end of child_process
# Run the long installs in the background so the bootstrap action returns
# before EMR's BA timeout; the child continues after this script exits.
child_process &
# fixed typo: "spwaning" -> "spawning"
echo "bootstrap action completed after spawning child process"