Documentation for setting up and using Hadoop with Flume, Spark, and Hive for web server log analysis.
OS: Linux (Arch and Ubuntu)
Install the Java 8 JDK:
sudo apt install openjdk-8-jdk    # Ubuntu
sudo pacman -S jdk8-openjdk       # Arch
Check the installed JDK directory under /usr/lib/jvm/, then point JAVA_HOME at it in your shell config (~/.bashrc):
export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64   # /usr/lib/jvm/java-8-openjdk on Arch
export PATH=$PATH:$JAVA_HOME/bin
export HADOOP_HOME=~/bigdata/hadoop-3.3.6
export PATH=$PATH:$HADOOP_HOME/bin
export PATH=$PATH:$HADOOP_HOME/sbin
export HADOOP_MAPRED_HOME=$HADOOP_HOME
export YARN_HOME=$HADOOP_HOME
export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop
export HADOOP_COMMON_LIB_NATIVE_DIR=$HADOOP_HOME/lib/native
export HADOOP_OPTS="-Djava.library.path=$HADOOP_HOME/lib/native"
export HADOOP_STREAMING=$HADOOP_HOME/share/hadoop/tools/lib/hadoop-streaming-3.3.6.jar
export HADOOP_LOG_DIR=$HADOOP_HOME/logs
export PDSH_RCMD_TYPE=ssh
SSH (Secure Shell) is the protocol Hadoop's start/stop scripts use to log in to each node, even on a single-node setup. Install it:
sudo apt-get install ssh              # Ubuntu
sudo pacman -S openssh                # Arch
sudo systemctl enable --now sshd      # Arch: start the SSH daemon
Download Hadoop 3.3.6 from hadoop.apache.org and extract the tarball:
mkdir -p ~/bigdata
tar -zxvf ~/Downloads/hadoop-3.3.6.tar.gz -C ~/bigdata
cd ~/bigdata/hadoop-3.3.6/etc/hadoop
nano hadoop-env.sh
Set: export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64 (use /usr/lib/jvm/java-8-openjdk on Arch)
core-site.xml (in this and the other configuration files below, replace $USER with your actual username; Hadoop, Flume, and Hive do not expand shell variables in their config files):
<configuration>
<property>
<name>fs.defaultFS</name>
<value>hdfs://localhost:9000</value>
</property>
<property>
<name>hadoop.proxyuser.dataflair.groups</name>
<value>*</value>
</property>
<property>
<name>hadoop.proxyuser.dataflair.hosts</name>
<value>*</value>
</property>
<property>
<name>hadoop.proxyuser.server.hosts</name>
<value>*</value>
</property>
<property>
<name>hadoop.proxyuser.server.groups</name>
<value>*</value>
</property>
<property>
<name>hadoop.proxyuser.hive.hosts</name>
<value>*</value>
</property>
<property>
<name>hadoop.proxyuser.hive.groups</name>
<value>*</value>
</property>
<property>
<name>hadoop.proxyuser.$USER.hosts</name>
<value>*</value>
</property>
<property>
<name>hadoop.proxyuser.$USER.groups</name>
<value>*</value>
</property>
</configuration>
hdfs-site.xml:
<configuration>
<property>
<name>dfs.replication</name>
<value>1</value>
</property>
<property>
<name>dfs.namenode.name.dir</name>
<value>file:///home/$USER/bigdata/hadoop-3.3.6/data/namenode</value>
</property>
<property>
<name>dfs.datanode.data.dir</name>
<value>file:///home/$USER/bigdata/hadoop-3.3.6/data/datanode</value>
</property>
</configuration>
mapred-site.xml:
<configuration>
<property>
<name>mapreduce.framework.name</name>
<value>yarn</value>
</property>
<property>
<name>mapreduce.application.classpath</name>
<value>$HADOOP_MAPRED_HOME/share/hadoop/mapreduce/*:$HADOOP_MAPRED_HOME/share/hadoop/mapreduce/lib/*</value>
</property>
</configuration>
yarn-site.xml:
<configuration>
<property>
<name>yarn.nodemanager.aux-services</name>
<value>mapreduce_shuffle</value>
</property>
<property>
<name>yarn.nodemanager.env-whitelist</name>
<value>JAVA_HOME,HADOOP_COMMON_HOME,HADOOP_HDFS_HOME,HADOOP_CONF_DIR,CLASSPATH_PREPEND_DISTCACHE,HADOOP_YARN_HOME,HADOOP_MAPRED_HOME</value>
</property>
</configuration>
Generate an SSH key for passwordless Hadoop communication with localhost:
ssh localhost
ssh-keygen -t rsa -P '' -f ~/.ssh/id_rsa
cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys
chmod 0600 ~/.ssh/authorized_keys
Format the NameNode, then start the services:
export PDSH_RCMD_TYPE=ssh
hdfs namenode -format
start-all.sh   # Start all services
stop-all.sh    # Stop all services
Verify: check the HDFS web UI at http://localhost:9870.
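Optionally, reachability of the NameNode web UI can also be checked from a script (a minimal sketch; 9870 is the default NameNode HTTP port in Hadoop 3.x):
# namenode_check.py -- optional check that the NameNode web UI responds.
import urllib.request

resp = urllib.request.urlopen("http://localhost:9870", timeout=5)
print("NameNode web UI reachable" if resp.status == 200 else "Unexpected HTTP status: %d" % resp.status)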
Create your user directory in HDFS:
hdfs dfs -mkdir -p /user/$USER
Download and extract Apache Flume 1.11.0:
wget https://downloads.apache.org/flume/1.11.0/apache-flume-1.11.0-bin.tar.gz -P ~/bigdata
tar -xzf ~/bigdata/apache-flume-1.11.0-bin.tar.gz -C ~/bigdata
echo 'export FLUME_HOME=~/bigdata/apache-flume-1.11.0-bin' >> ~/.bashrc
echo 'export PATH=$PATH:$FLUME_HOME/bin' >> ~/.bashrc
source ~/.bashrc
Create ~/bigdata/config/flume.conf (remember to replace $USER with your username):
agent.sources = log-source
agent.channels = memory-channel
agent.sinks = hdfs-sink
# Source: Monitor log directory
agent.sources.log-source.type = spooldir
agent.sources.log-source.spoolDir = /home/$USER/bigdata/logs
agent.sources.log-source.fileHeader = true
# Channel: In-memory
agent.channels.memory-channel.type = memory
agent.channels.memory-channel.capacity = 1000
# Sink: Write to HDFS
agent.sinks.hdfs-sink.type = hdfs
agent.sinks.hdfs-sink.hdfs.path = hdfs://localhost:9000/user/$USER/logs
agent.sinks.hdfs-sink.hdfs.filePrefix = web-log
# Write plain text instead of the default SequenceFile so the raw lines stay readable
agent.sinks.hdfs-sink.hdfs.fileType = DataStream
# Roll a new file every 300 s or 10 MB, whichever comes first; rollCount = 0 disables count-based rolling
agent.sinks.hdfs-sink.hdfs.rollInterval = 300
agent.sinks.hdfs-sink.hdfs.rollSize = 10485760
agent.sinks.hdfs-sink.hdfs.rollCount = 0
# Connect components
agent.sources.log-source.channels = memory-channel
agent.sinks.hdfs-sink.channel = memory-channel
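The spooldir source ingests whole files dropped into the directory and renames them with a .COMPLETED suffix once consumed; files must not be modified after being dropped, and file names must not be reused. Once the agent is running (started below), extra test data can be fed in with a small helper like this hypothetical script:
# make_test_log.py -- hypothetical helper that drops a test file into the spool directory.
import os
import time

SPOOL_DIR = os.path.expanduser("~/bigdata/logs")  # must match agent.sources.log-source.spoolDir

lines = [
    '10.0.0.1 - - [01/Jul/1995:12:00:00 -0400] "GET /test HTTP/1.0" 200 42',
    '10.0.0.2 - - [01/Jul/1995:12:00:01 -0400] "GET /missing HTTP/1.0" 404 0',
]

# Unique, write-once file name: the spooldir source rejects reused names.
path = os.path.join(SPOOL_DIR, "test-%d.log" % int(time.time()))
with open(path, "w") as f:
    f.write("\n".join(lines) + "\n")
print("wrote", path)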
Create the HDFS target directory and download the sample NASA web server logs:
hdfs dfs -mkdir -p /user/$USER/logs
wget ftp://ita.ee.lbl.gov/traces/NASA_access_log_Jul95.gz -P ~/bigdata/logs
gunzip ~/bigdata/logs/NASA_access_log_Jul95.gz
mv ~/bigdata/logs/NASA_access_log_Jul95 ~/bigdata/logs/access.log
hdfs dfs -chmod -R 777 /    # permissive; acceptable only on a local test cluster
flume-ng agent --conf ~/bigdata/config --conf-file ~/bigdata/config/flume.conf --name agent -Dflume.root.logger=INFO,console
Verify:
hdfs dfs -ls /user/$USER/logs
hdfs dfs -cat /user/$USER/logs/web-log.* | head
Output: files like web-log.1234567890 containing the raw log lines, e.g.:
199.72.81.55 - - [01/Jul/1995:00:00:01 -0400] "GET /history/apollo/ HTTP/1.0" 200 6245
unicomp6.unicomp.net - - [01/Jul/1995:00:00:06 -0400] "GET /shuttle/countdown/ HTTP/1.0" 200 3985
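Each line is in NCSA Common Log Format: host, identity, user, [timestamp], "request", status, bytes. For reference, a small sketch of how the fields split apart (the Spark job below does the same thing at scale):
# Parse one Common Log Format line (reference only).
import re

LOG_PATTERN = re.compile(r'^(\S+) (\S+) (\S+) \[([^\]]+)\] "([^"]*)" (\d{3}) (\S+)')

line = '199.72.81.55 - - [01/Jul/1995:00:00:01 -0400] "GET /history/apollo/ HTTP/1.0" 200 6245'
host, ident, user, ts, request, status, size = LOG_PATTERN.match(line).groups()
print(host, request.split()[1], status, size)  # 199.72.81.55 /history/apollo/ 200 6245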
Download and extract Apache Spark 3.5.7 (pre-built for Hadoop 3):
wget https://downloads.apache.org/spark/spark-3.5.7/spark-3.5.7-bin-hadoop3.tgz -P ~/bigdata
tar -xzf ~/bigdata/spark-3.5.7-bin-hadoop3.tgz -C ~/bigdata
echo 'export SPARK_HOME=~/bigdata/spark-3.5.7-bin-hadoop3' >> ~/.bashrc
echo 'export PATH=$PATH:$SPARK_HOME/bin' >> ~/.bashrc
source ~/.bashrc
Create ~/bigdata/spark-3.5.7-bin-hadoop3/conf/spark-env.sh:
export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64 # Ubuntu
# OR: export JAVA_HOME=/usr/lib/jvm/java-8-openjdk # Arch
export HADOOP_HOME=~/bigdata/hadoop-3.3.6
export SPARK_MASTER_HOST=localhost
export SPARK_MASTER_PORT=7077
Verify: spark-submit --version
Create ~/bigdata/log_analysis.py. (Uploaded)
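The analysis script itself was uploaded separately. For reference, a minimal sketch that is consistent with the --input/--output arguments used below and with the Parquet layout the Hive tables expect (the parsing regex, column names, and subdirectory names here are assumptions, not necessarily the uploaded script):
# log_analysis.py -- minimal sketch, not necessarily identical to the uploaded script.
import argparse

from pyspark.sql import SparkSession
from pyspark.sql import functions as F

# NCSA Common Log Format: host ident authuser [timestamp] "method url protocol" status bytes
LOG_PATTERN = r'^(\S+) \S+ \S+ \[([^\]]+)\] "(\S+) (\S+)[^"]*" (\d{3}) (\S+)'

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--input", required=True)
    parser.add_argument("--output", required=True)
    args = parser.parse_args()

    spark = SparkSession.builder.appName("web-log-analysis").getOrCreate()

    raw = spark.read.text(args.input)
    logs = raw.select(
        F.regexp_extract("value", LOG_PATTERN, 1).alias("ip"),
        F.regexp_extract("value", LOG_PATTERN, 2).alias("ts"),
        F.regexp_extract("value", LOG_PATTERN, 4).alias("url"),
        F.regexp_extract("value", LOG_PATTERN, 5).alias("status"),
    ).where(F.col("ip") != "")  # drop lines that do not match the pattern

    # "01/Jul/1995:00:00:01 -0400" -> DATE (the first 11 characters hold the day)
    logs = logs.withColumn("date", F.to_date(F.substring("ts", 1, 11), "dd/MMM/yyyy"))

    # Each aggregate goes to its own subdirectory; the Hive external tables below point at these.
    logs.groupBy("url").count().write.mode("overwrite").parquet(args.output + "/top_urls")
    logs.groupBy("ip").count().write.mode("overwrite").parquet(args.output + "/top_ips")
    logs.groupBy("status").count().write.mode("overwrite").parquet(args.output + "/status_counts")
    logs.groupBy("date").count().write.mode("overwrite").parquet(args.output + "/daily_hits")

    spark.stop()

if __name__ == "__main__":
    main()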
spark-submit --master local[*] ~/bigdata/log_analysis.py --input hdfs://localhost:9000/user/$USER/logs/* --output hdfs://localhost:9000/user/$USER/logs_output_full
Verify: hdfs dfs -ls /user/$USER/logs_output_full
Download and extract Apache Hive 3.1.3:
wget https://downloads.apache.org/hive/hive-3.1.3/apache-hive-3.1.3-bin.tar.gz -P ~/bigdata
tar -xzf ~/bigdata/apache-hive-3.1.3-bin.tar.gz -C ~/bigdata
echo 'export HIVE_HOME=~/bigdata/apache-hive-3.1.3-bin' >> ~/.bashrc
echo 'export PATH=$PATH:$HIVE_HOME/bin' >> ~/.bashrc
source ~/.bashrc
Create ~/bigdata/apache-hive-3.1.3-bin/conf/hive-site.xml:
<configuration>
<property>
<name>javax.jdo.option.ConnectionURL</name>
<value>jdbc:derby:;databaseName=/home/$USER/bigdata/apache-hive-3.1.3-bin/metastore_db;create=true</value>
</property>
<property>
<name>javax.jdo.option.ConnectionDriverName</name>
<value>org.apache.derby.jdbc.EmbeddedDriver</value>
</property>
<property>
<name>hive.metastore.warehouse.dir</name>
<value>hdfs://localhost:9000/user/hive/warehouse</value>
</property>
</configuration>
Create the Hive warehouse directory in HDFS:
hdfs dfs -mkdir -p /user/hive/warehouse
hdfs dfs -chmod -R 777 /user/hive
Initialize the metastore schema, then start HiveServer2:
schematool -initSchema -dbType derby
hiveserver2 &
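HiveServer2 can take a little while to start; before connecting with Beeline it can be confirmed to be listening (a minimal sketch; 10000 is the default HiveServer2 port):
# hs2_check.py -- optional check that HiveServer2 accepts TCP connections.
import socket

with socket.create_connection(("localhost", 10000), timeout=5):
    print("HiveServer2 is listening on port 10000")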
beeline -u jdbc:hive2://localhost:10000
Create the database and tables (in the LOCATION paths, replace $USER with your username; Beeline does not expand shell variables):
CREATE DATABASE log_analysis;
USE log_analysis;
CREATE EXTERNAL TABLE top_urls (url STRING, count BIGINT)
STORED AS PARQUET
LOCATION 'hdfs://localhost:9000/user/$USER/logs_output_full/top_urls';
CREATE EXTERNAL TABLE top_ips (ip STRING, count BIGINT)
STORED AS PARQUET
LOCATION 'hdfs://localhost:9000/user/$USER/logs_output_full/top_ips';
CREATE EXTERNAL TABLE status_counts (status STRING, count BIGINT)
STORED AS PARQUET
LOCATION 'hdfs://localhost:9000/user/$USER/logs_output_full/status_counts';
CREATE EXTERNAL TABLE daily_hits (`date` DATE, count BIGINT)
STORED AS PARQUET
LOCATION 'hdfs://localhost:9000/user/$USER/logs_output_full/daily_hits';
Query the top 10 URLs:
SELECT url, count FROM top_urls ORDER BY count DESC LIMIT 10;
Export to CSV:
beeline -u jdbc:hive2://localhost:10000 --outputformat=csv2 -e \
"USE log_analysis; SELECT url, count FROM top_urls ORDER BY count DESC LIMIT 10;" > ~/bigdata/top_urls.csvSimilarly for other outputs.