Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Build output directories at any depth (presumably Gradle's build/ folders)
**/build
# Gradle's project-local cache directory
.gradle
# Editor-specific workspace settings
.vscode
2 changes: 1 addition & 1 deletion .lfsconfig
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
# Git LFS endpoint used for large files tracked by this repository.
# NOTE(review): two `url` entries are visible here; this looks like the removed
# and the added line of a one-line change rendered together — confirm that only
# the lab3-kek-flip URL remains in the committed file.
[lfs]
url = https://github.com/AdvancedJavaLabs/lab4-parallel.git/info/lfs
url = https://github.com/AdvancedJavaLabs/lab3-kek-flip.git/info/lfs
9 changes: 9 additions & 0 deletions docker-compose.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Single-service stack: builds the Hadoop image and runs the benchmark script.
services:
  hadoop:
    build:
      # The repository root is the build context because hadoop/Dockerfile
      # copies the application jar and the *.csv inputs from outside hadoop/.
      context: .
      dockerfile: hadoop/Dockerfile
    ports:
      # Host:container mapping (presumably the NameNode web UI port).
      - "9870:9870"
    volumes:
      # start.sh writes per-run timings and output under /results.
      - ./results:/results
66 changes: 66 additions & 0 deletions hadoop/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
# Single-node Hadoop image that runs the sales-analyzer benchmark (start.sh).
#
# Stage chain: base -> deps -> hadoop-installation -> hadoop-configuration -> runtime
# Every stage extends the previous one, so anything written to a layer in an
# early stage is still shipped in the final image.

# Base JDK image and the Hadoop release used by the later stages
FROM eclipse-temurin:17-jdk AS base

ENV HADOOP_VERSION=3.4.2


# Install dependencies
FROM base AS deps

# sudo and the ssh service are used by start.sh; pdsh is presumably picked up by
# Hadoop's start-*.sh helpers. The apt lists are removed in the same layer so
# they never end up in the image.
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        pdsh \
        ssh \
        sudo && \
    rm -rf /var/lib/apt/lists/*


# Install hadoop
FROM deps AS hadoop-installation

# HADOOP_HOME is set in its own instruction because the variables below expand it.
ENV HADOOP_HOME=/opt/hadoop
ENV HADOOP_CONFIG_DIR=$HADOOP_HOME/etc/hadoop \
    PATH=$PATH:$HADOOP_HOME/bin:$HADOOP_HOME/sbin \
    HADOOP_USER=hadoop \
    HADOOP_UID=1001 \
    HADOOP_GID=1001

# Unprivileged account the container runs as. It gets passwordless sudo because
# start.sh starts sshd and fixes permissions on /results at container start.
RUN groupadd -g "$HADOOP_GID" "$HADOOP_USER" && \
    useradd -u "$HADOOP_UID" -g "$HADOOP_GID" -r -m -s /bin/bash "$HADOOP_USER" && \
    usermod -aG sudo "$HADOOP_USER" && \
    echo "$HADOOP_USER ALL=(ALL) NOPASSWD:ALL" >> /etc/sudoers

# Download, unpack and delete the tarball in ONE layer: removing it in a later
# layer would not shrink the image. The tree is handed to the runtime user so
# the daemons it starts can create files (e.g. logs) under HADOOP_HOME.
RUN wget -nv -O /tmp/hadoop.tar.gz "https://dlcdn.apache.org/hadoop/common/hadoop-$HADOOP_VERSION/hadoop-$HADOOP_VERSION-lean.tar.gz" && \
    tar -xzf /tmp/hadoop.tar.gz -C /opt && \
    mv "/opt/hadoop-$HADOOP_VERSION" "$HADOOP_HOME" && \
    rm /tmp/hadoop.tar.gz && \
    chown -R "$HADOOP_UID:$HADOOP_GID" "$HADOOP_HOME"


# Configure hadoop
FROM hadoop-installation AS hadoop-configuration

COPY hadoop/core-site.xml hadoop/hdfs-site.xml $HADOOP_CONFIG_DIR/


# Prepare runtime
FROM hadoop-configuration AS runtime

# Append the environment the Hadoop scripts read, in a single layer.
# NOTE(review): HDFS_HADOOP_USER is kept as-is from the original; it may have
# been meant as HDFS_SECONDARYNAMENODE_USER — confirm.
RUN { \
        echo "export JAVA_HOME=$JAVA_HOME"; \
        echo "export HDFS_NAMENODE_USER=$HADOOP_USER"; \
        echo "export HDFS_DATANODE_USER=$HADOOP_USER"; \
        echo "export HDFS_HADOOP_USER=$HADOOP_USER"; \
    } >> "$HADOOP_CONFIG_DIR/hadoop-env.sh"

# The jar, the CSV inputs and start.sh live in / and start.sh writes to the
# relative path results/, which docker-compose mounts at /results. Make that
# working directory explicit instead of inheriting it from the base image.
WORKDIR /

ENV ARTIFACT_NAME=sales-analyzer
ENV JAR_NAME=${ARTIFACT_NAME}-1.0.jar \
    BUILD_PATH=${ARTIFACT_NAME}/build/libs
COPY $BUILD_PATH/$JAR_NAME ./

# A wildcard may match several files, so the destination is written as a directory.
COPY *.csv ./

COPY hadoop/start.sh start.sh
RUN chmod +x start.sh

# Documents the port that docker-compose.yaml publishes.
EXPOSE 9870

USER $HADOOP_USER

# Passwordless SSH to the container itself for the Hadoop start scripts.
# The key pair only authorises logins to this same image's hadoop account.
RUN ssh-keygen -t rsa -P "" -f ~/.ssh/id_rsa && \
    cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys && \
    chmod 600 ~/.ssh/authorized_keys

ENTRYPOINT ["./start.sh"]
6 changes: 6 additions & 0 deletions hadoop/core-site.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
<!-- Core Hadoop settings for the single-node container. -->
<configuration>
    <!-- URI of the default filesystem: HDFS on this same host, port 9000. -->
    <property>
        <name>fs.defaultFS</name>
        <value>hdfs://localhost:9000</value>
    </property>
</configuration>
10 changes: 10 additions & 0 deletions hadoop/hdfs-site.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
<!-- HDFS settings for the single-node container. -->
<configuration>
    <!-- Keep a single copy of each block. -->
    <property>
        <name>dfs.replication</name>
        <value>1</value>
    </property>
    <!-- Turn HDFS permission checking off. -->
    <property>
        <name>dfs.permissions.enabled</name>
        <value>false</value>
    </property>
</configuration>
45 changes: 45 additions & 0 deletions hadoop/start.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
#!/bin/bash
# Container entrypoint: starts a single-node HDFS/YARN, uploads the CSV inputs,
# then runs org.itmo.MainJob once for every combination of reducer count and
# minimum split size. For each run it stores under /results/<run>/:
#   exec_time_ms - wall-clock duration of the `hadoop jar` call, in milliseconds
#   data.csv     - part-r-00000 of the job output with a CSV header prepended
#
# The script is safe to re-run against an existing /results volume and an
# already-initialised HDFS: every step overwrites or tolerates earlier state.
set -euo pipefail

# The jar and the CSV inputs sit next to this script in the image.
cd "$(dirname "$0")"

RESULTS_DIR=/results

sudo service ssh start
sudo chmod o+w "$RESULTS_DIR"

# Format the NameNode only on first start. The storage directory is read from
# the effective configuration instead of being hard-coded, because the shipped
# *-site.xml files do not set it. -nonInteractive guarantees that an unexpected
# existing directory makes the command fail rather than wait for a Y/N answer.
NAMENODE_DIR=$(hdfs getconf -confKey dfs.namenode.name.dir)
NAMENODE_DIR=${NAMENODE_DIR#file://}
if [ ! -d "$NAMENODE_DIR/current" ]; then
    hdfs namenode -format -nonInteractive
fi

start-dfs.sh
start-yarn.sh

# Writes are rejected while the NameNode is still in safe mode.
hdfs dfsadmin -safemode wait

# -p / -f keep these steps idempotent when the container is restarted.
hadoop fs -mkdir -p /csv
hadoop fs -put -f *.csv /csv

# Remove output left behind by a previous, interrupted run; the job would
# otherwise refuse to start because its output directory already exists.
hadoop fs -rm -r -f out mapred_out

MAPRED_NUM_REDUCE_TASKS_SET=(1 3 5 10 15 20)
SPLIT_MINSIZE_SET=(1024 2048 8192 32768 524288)

for MAPRED_NUM_REDUCE_TASKS in "${MAPRED_NUM_REDUCE_TASKS_SET[@]}"; do
    for SPLIT_MINSIZE in "${SPLIT_MINSIZE_SET[@]}"; do
        run_dir="$RESULTS_DIR/tasks-${MAPRED_NUM_REDUCE_TASKS}_split-${SPLIT_MINSIZE}"
        mkdir -p "$run_dir"

        # The input glob is quoted so it reaches Hadoop untouched and is
        # expanded against HDFS, never against the local filesystem.
        start_time=$(date +%s%N)
        hadoop jar sales-analyzer-1.0.jar org.itmo.MainJob \
            "$MAPRED_NUM_REDUCE_TASKS" "$SPLIT_MINSIZE" out '/csv/*.csv'
        end_time=$(date +%s%N)

        # date +%s%N is in nanoseconds; store milliseconds.
        duration_ms=$(( (end_time - start_time) / 1000000 ))
        echo "$duration_ms" > "$run_dir/exec_time_ms"

        # -f overwrites data.csv from an earlier run kept in the mounted volume.
        # NOTE(review): only part-r-00000 is fetched — this assumes the final
        # stage of the job always writes a single part file; confirm.
        hadoop fs -get -f out/part-r-00000 "$run_dir/data.csv"
        sed -i '1i category,revenue,quantity' "$run_dir/data.csv"

        # Clear both job directories so the next combination starts clean.
        hadoop fs -rm -r -f out mapred_out
    done
done

# Uncomment to keep the container alive after the benchmark finishes.
#tail -f /dev/null
21 changes: 21 additions & 0 deletions sales-analyzer/build.gradle.kts
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
// Build script for the sales-analyzer MapReduce job.
plugins {
    java
}

group = "org.itmo"
version = "1.0"

// Hadoop release the job is compiled against.
val hadoopVersion = "3.4.2"

// Compile with a Java 17 toolchain.
java.toolchain.languageVersion.set(JavaLanguageVersion.of(17))

repositories {
    mavenCentral()
}

dependencies {
    // compileOnly: the Hadoop classes are not packaged into the job jar.
    compileOnly("org.apache.hadoop:hadoop-common:$hadoopVersion")
    compileOnly("org.apache.hadoop:hadoop-mapreduce-client-core:$hadoopVersion")
}
Binary file added sales-analyzer/gradle/wrapper/gradle-wrapper.jar
Binary file not shown.
6 changes: 6 additions & 0 deletions sales-analyzer/gradle/wrapper/gradle-wrapper.properties
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Gradle wrapper settings: which distribution to download and where to store it.
#Sat Dec 13 15:56:51 MSK 2025
distributionBase=GRADLE_USER_HOME
distributionPath=wrapper/dists
# Pins the wrapper to the Gradle 8.8 binary distribution.
distributionUrl=https\://services.gradle.org/distributions/gradle-8.8-bin.zip
zipStoreBase=GRADLE_USER_HOME
zipStorePath=wrapper/dists
Loading