Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Local scratch and benchmark output
tmp
.idea
results
# Gradle build artifacts
**/build/**
**/.gradle/**
# IntelliJ IDEA module files
**.iml
2 changes: 2 additions & 0 deletions .lfsconfig
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Git LFS endpoint for this repository (large CSV inputs are stored in LFS).
[lfs]
	url = https://github.com/AdvancedJavaLabs/lab3-xGodness.git/info/lfs
8 changes: 8 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Convenience targets: build the job jar, then run the Hadoop stack.
# Targets are not real files, so declare them phony to avoid accidental
# collisions with directories of the same name.
.PHONY: build up down

# Build the sales-analyzer jar consumed by hadoop/Dockerfile.
build:
	@cd sales-analyzer && ./gradlew build

# Always tear down first so `up` recreates the stack from a clean state.
up: down
	@docker compose --env-file hadoop.env up --build

down:
	@docker compose down
12 changes: 12 additions & 0 deletions docker-compose.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Single-node Hadoop service; parameter values come from hadoop.env
# (run via `docker compose --env-file hadoop.env up`).
services:
  hadoop:
    build:
      context: .
      dockerfile: hadoop/Dockerfile
    environment:
      # Forwarded into the container for the start scripts / MapReduce job.
      MAPRED_NUM_REDUCE_TASKS: $MAPRED_NUM_REDUCE_TASKS
      SPLIT_MINSIZE: $SPLIT_MINSIZE
    ports:
      # HDFS NameNode web UI.
      - "9870:9870"
    volumes:
      # start.sh / start_benchmark.sh write timing and job output here.
      - ./results:/results
2 changes: 2 additions & 0 deletions hadoop.env
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Default job parameters, consumed via `docker compose --env-file hadoop.env`.
# Number of reduce tasks passed to org.itmo.MainJob.
MAPRED_NUM_REDUCE_TASKS=5
# Minimum input split size (presumably bytes — confirm against MainJob).
SPLIT_MINSIZE=4096
69 changes: 69 additions & 0 deletions hadoop/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
# syntax=docker/dockerfile:1

# --- Download hadoop --------------------------------------------------------
FROM eclipse-temurin:17-jdk AS base

ENV HADOOP_VERSION=3.4.2
# The "-lean" tarball omits the bundled AWS SDK, keeping the image smaller.
# NOTE(review): consider verifying the download against the published
# SHA-512 checksum for reproducibility.
RUN wget -O /tmp/hadoop.tar.gz "https://dlcdn.apache.org/hadoop/common/hadoop-$HADOOP_VERSION/hadoop-$HADOOP_VERSION-lean.tar.gz"


# --- Install OS dependencies ------------------------------------------------
FROM base AS deps

# ssh/pdsh are required by start-dfs.sh / start-yarn.sh; clean the apt lists
# in the same layer so the cache does not bloat the image.
RUN apt-get update && \
    apt-get install -y --no-install-recommends pdsh ssh sudo && \
    rm -rf /var/lib/apt/lists/*


# --- Install hadoop ---------------------------------------------------------
FROM deps AS hadoop-installation

ENV HADOOP_HOME=/opt/hadoop
ENV HADOOP_CONFIG_DIR=$HADOOP_HOME/etc/hadoop
ENV PATH=$PATH:$HADOOP_HOME/bin:$HADOOP_HOME/sbin
ENV HADOOP_USER=hadoop
ENV HADOOP_UID=1001
ENV HADOOP_GID=1001

# Dedicated non-root user with a stable UID/GID; passwordless sudo is needed
# so the start scripts can launch sshd and fix /results ownership at runtime.
RUN groupadd -g $HADOOP_GID $HADOOP_USER && \
    useradd -u $HADOOP_UID -g $HADOOP_GID -r -m -s /bin/bash $HADOOP_USER && \
    usermod -aG sudo $HADOOP_USER && \
    echo "$HADOOP_USER ALL=(ALL) NOPASSWD:ALL" >> /etc/sudoers

RUN tar -xzf /tmp/hadoop.tar.gz -C /opt && \
    mv /opt/hadoop-$HADOOP_VERSION /opt/hadoop && \
    rm /tmp/hadoop.tar.gz


# --- Configure hadoop -------------------------------------------------------
FROM hadoop-installation AS hadoop-configuration

COPY hadoop/core-site.xml $HADOOP_CONFIG_DIR/core-site.xml
COPY hadoop/hdfs-site.xml $HADOOP_CONFIG_DIR/hdfs-site.xml


# --- Prepare runtime --------------------------------------------------------
FROM hadoop-configuration AS runtime

# One layer instead of four. The *_USER variables are only consulted by the
# Hadoop start scripts when running as root, but are kept for completeness.
# (was HDFS_HADOOP_USER, which Hadoop does not recognize — the secondary
# namenode variable is the one the start scripts actually check)
RUN { \
      echo "export JAVA_HOME=$JAVA_HOME"; \
      echo "export HDFS_NAMENODE_USER=$HADOOP_USER"; \
      echo "export HDFS_DATANODE_USER=$HADOOP_USER"; \
      echo "export HDFS_SECONDARYNAMENODE_USER=$HADOOP_USER"; \
    } >> $HADOOP_CONFIG_DIR/hadoop-env.sh

ENV ARTIFACT_NAME=sales-analyzer
ENV JAR_NAME=$ARTIFACT_NAME-1.0.jar
ENV BUILD_PATH=$ARTIFACT_NAME/build/libs
# Jar and input data live in / on purpose: the start scripts rely on the
# default working directory, and `results/...` there resolves to the
# /results volume mount. Do not add a WORKDIR without updating them.
COPY $BUILD_PATH/$JAR_NAME .

COPY *.csv .

# --chmod sets the execute bit at copy time (BuildKit; enabled by the
# syntax directive above), avoiding two extra chmod layers.
COPY --chmod=755 hadoop/start.sh start.sh
COPY --chmod=755 hadoop/start_benchmark.sh start_benchmark.sh

USER $HADOOP_USER

# Passwordless ssh to localhost, required by start-dfs.sh / start-yarn.sh.
RUN ssh-keygen -t rsa -P "" -f ~/.ssh/id_rsa && \
    cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys && \
    chmod 600 ~/.ssh/authorized_keys

ENTRYPOINT ["./start.sh"]
# Swap the entrypoint to run the full parameter sweep instead:
#ENTRYPOINT ["./start_benchmark.sh"]
6 changes: 6 additions & 0 deletions hadoop/core-site.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
<configuration>
    <!-- Single-node setup: HDFS namenode listens on localhost:9000. -->
    <property>
        <name>fs.defaultFS</name>
        <value>hdfs://localhost:9000</value>
    </property>
</configuration>
10 changes: 10 additions & 0 deletions hadoop/hdfs-site.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
<configuration>
    <!-- Only one datanode, so block replication is disabled. -->
    <property>
        <name>dfs.replication</name>
        <value>1</value>
    </property>
    <!-- Lab convenience: skip HDFS permission checks entirely. -->
    <property>
        <name>dfs.permissions.enabled</name>
        <value>false</value>
    </property>
</configuration>
40 changes: 40 additions & 0 deletions hadoop/start.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
#!/bin/bash
# Single-run entrypoint: start HDFS/YARN, run the sales-analyzer job once
# with the parameters taken from the environment, and store the timing and
# job output under /results (host-mounted volume).
set -e

timestamp=$(date +"%Y-%m-%d_%H-%M-%S")
dir="$timestamp-tasks-$MAPRED_NUM_REDUCE_TASKS-split-$SPLIT_MINSIZE"

# sshd is required by the Hadoop start scripts; /results is a host mount
# that may be owned by root.
sudo service ssh start
sudo chmod o+w /results

# Format the namenode only on first start; a formatted namenode already has
# a "current" directory.
NAMENODE_DIR=/opt/hadoop/data/namenode
if [ ! -d "$NAMENODE_DIR/current" ]; then
    hdfs namenode -format
fi

start-dfs.sh
start-yarn.sh

# -p / -f keep restarts idempotent: without them, an already-existing /csv
# makes these commands fail and `set -e` kills the container.
hadoop fs -mkdir -p /csv
hadoop fs -put -f *.csv /csv

echo "Initialization completed"
echo "Starting job..."

# Wall-clock the job in nanoseconds, report milliseconds.
start_time=$(date +%s%N)
hadoop jar sales-analyzer-1.0.jar org.itmo.MainJob "$MAPRED_NUM_REDUCE_TASKS" out /csv/*.csv
end_time=$(date +%s%N)

duration_ms=$(($((end_time - start_time)) / 1000000))

mkdir -p "results/$dir"

echo "$duration_ms" > "results/$dir/time_elapsed_ms"

hadoop fs -get out/part-r-00000 "results/$dir/mapred.csv"
# Prepend a CSV header line for downstream tooling.
sed -i '1i category,revenue,quantity' "results/$dir/mapred.csv"

# -f: do not fail (under set -e) when the directory does not exist —
# mapred_out in particular is not always produced.
hadoop fs -rm -r -f out
hadoop fs -rm -r -f mapred_out

# Uncomment to keep the container alive for debugging:
#tail -f /dev/null
49 changes: 49 additions & 0 deletions hadoop/start_benchmark.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
#!/bin/bash
# Benchmark entrypoint: start HDFS/YARN once, then sweep all combinations of
# reducer count and split size, recording each run's duration and output
# under /results.
set -e

sudo service ssh start
sudo chmod o+w /results

# Format the namenode only on first start.
NAMENODE_DIR=/opt/hadoop/data/namenode
if [ ! -d "$NAMENODE_DIR/current" ]; then
    hdfs namenode -format
fi

start-dfs.sh
start-yarn.sh

# -p / -f keep restarts idempotent under `set -e`.
hadoop fs -mkdir -p /csv
hadoop fs -put -f *.csv /csv

echo "Initialization completed"

MAPRED_NUM_REDUCE_TASKS_SET=(1 3 5 10 15 20)
SPLIT_MINSIZE_SET=(1024 2048 8192 32768 131072 262144)

for MAPRED_NUM_REDUCE_TASKS in "${MAPRED_NUM_REDUCE_TASKS_SET[@]}"; do
    for SPLIT_MINSIZE in "${SPLIT_MINSIZE_SET[@]}"; do
        timestamp=$(date +"%Y-%m-%d_%H-%M-%S")
        dir="$timestamp-tasks-$MAPRED_NUM_REDUCE_TASKS-split-$SPLIT_MINSIZE"

        echo "Running with MAPRED_NUM_REDUCE_TASKS=$MAPRED_NUM_REDUCE_TASKS SPLIT_MINSIZE=$SPLIT_MINSIZE"
        # export so the hadoop child process sees the current sweep values.
        # (the previous bare `VAR=... VAR2=...` line was a no-op: assignments
        # without a command never reach a child's environment)
        export MAPRED_NUM_REDUCE_TASKS SPLIT_MINSIZE

        start_time=$(date +%s%N)
        hadoop jar sales-analyzer-1.0.jar org.itmo.MainJob "$MAPRED_NUM_REDUCE_TASKS" out /csv/*.csv
        end_time=$(date +%s%N)

        # Nanoseconds -> milliseconds.
        duration_ms=$(($((end_time - start_time)) / 1000000))

        mkdir -p "results/$dir"

        echo "$duration_ms" > "results/$dir/time_elapsed_ms"

        hadoop fs -get out/part-r-00000 "results/$dir/mapred.csv"
        # Prepend a CSV header line for downstream tooling.
        sed -i '1i category,revenue,quantity' "results/$dir/mapred.csv"

        # -f: do not abort the sweep when a directory is missing —
        # mapred_out in particular is not always produced.
        hadoop fs -rm -r -f out
        hadoop fs -rm -r -f mapred_out
    done
done

# Uncomment to keep the container alive for debugging:
#tail -f /dev/null
21 changes: 21 additions & 0 deletions sales-analyzer/build.gradle.kts
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
// Build script for the sales-analyzer MapReduce job.
plugins {
    id("java")
}

group = "org.itmo"
version = "1.0"

// Compile with a Java 17 toolchain (matches the eclipse-temurin runtime image).
java {
    toolchain {
        languageVersion = JavaLanguageVersion.of(17)
    }
}

repositories {
    mavenCentral()
}

dependencies {
    // Provided by the Hadoop runtime at execution time, hence compile-only.
    compileOnly("org.apache.hadoop:hadoop-common:3.4.2")
    compileOnly("org.apache.hadoop:hadoop-mapreduce-client-core:3.4.2")
}
Binary file added sales-analyzer/gradle/wrapper/gradle-wrapper.jar
Binary file not shown.
6 changes: 6 additions & 0 deletions sales-analyzer/gradle/wrapper/gradle-wrapper.properties
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Gradle wrapper configuration (generated) — pins the build to Gradle 8.8.
#Sat Dec 13 15:56:51 MSK 2025
distributionBase=GRADLE_USER_HOME
distributionPath=wrapper/dists
distributionUrl=https\://services.gradle.org/distributions/gradle-8.8-bin.zip
zipStoreBase=GRADLE_USER_HOME
zipStorePath=wrapper/dists
Loading