2 changes: 2 additions & 0 deletions .gitignore
@@ -0,0 +1,2 @@
/target/
/.idea/
1 change: 1 addition & 0 deletions README.md
@@ -1,3 +1,4 @@
[![Review Assignment Due Date](https://classroom.github.com/assets/deadline-readme-button-22041afd0340ce965d47ae6ef1cefeee28c7c493a6346c4f15d667ab976d596c.svg)](https://classroom.github.com/a/uyodabcP)
## Lab: Implementing MapReduce for Sales Data Analysis with Hadoop
# Objective

84 changes: 84 additions & 0 deletions docker-compose.yml
@@ -0,0 +1,84 @@
version: '3.8'

services:
namenode:
image: bde2020/hadoop-namenode:latest
container_name: namenode
environment:
- CLUSTER_NAME=hadoop_namenode
- CORE_CONF_fs_defaultFS=hdfs://namenode:9000
- HDFS_CONF_dfs_replication=1
- HDFS_CONF_dfs_permissions_enabled=false
ports:
- "9870:9870" # Web UI HDFS
- "9000:9000"
- "8020:9000"
volumes:
- namenode_data:/hadoop/dfs/name
healthcheck:
test: [ "CMD", "curl", "-f", "http://localhost:9870" ]
interval: 30s
timeout: 10s
retries: 300

datanode:
image: bde2020/hadoop-datanode:latest
container_name: datanode
environment:
- CORE_CONF_fs_defaultFS=hdfs://namenode:9000
- HDFS_CONF_dfs_datanode_data_dir=file:///hadoop/dfs/data
volumes:
- datanode_data:/hadoop/dfs/data
depends_on:
namenode:
condition: service_healthy
ports:
- "9864:9864"

resourcemanager:
image: bde2020/hadoop-resourcemanager:latest
container_name: resourcemanager
environment:
- CORE_CONF_fs_defaultFS=hdfs://namenode:9000
- YARN_CONF_yarn_resourcemanager_hostname=resourcemanager
- YARN_CONF_yarn_nodemanager_aux-services=mapreduce_shuffle
- YARN_CONF_yarn_resourcemanager_address=resourcemanager:8032
- YARN_CONF_yarn_resourcemanager_scheduler_address=resourcemanager:8030
- YARN_CONF_yarn_resourcemanager_resource_tracker_address=resourcemanager:8031
- YARN_CONF_yarn_resourcemanager_admin_address=resourcemanager:8033
- YARN_CONF_yarn_resourcemanager_webapp_address=0.0.0.0:8088
- HADOOP_CONF_DIR=/etc/hadoop
ports:
- "8088:8088"
- "8032:8032"
- "8030:8030"
- "8031:8031"
depends_on:
- namenode
- datanode

nodemanager:
image: bde2020/hadoop-nodemanager:latest
container_name: nodemanager
environment:
- CORE_CONF_fs_defaultFS=hdfs://namenode:9000
- YARN_CONF_yarn_resourcemanager_hostname=resourcemanager
- YARN_CONF_yarn_nodemanager_resource_memory_mb=4096
- YARN_CONF_yarn_nodemanager_resource_cpu_vcores=4
- YARN_CONF_yarn_nodemanager_aux-services=mapreduce_shuffle
- YARN_CONF_yarn_nodemanager_address=0.0.0.0:8041
- YARN_CONF_yarn_nodemanager_webapp_address=0.0.0.0:8042
- YARN_CONF_yarn_resourcemanager_address=resourcemanager:8032
- YARN_CONF_yarn_resourcemanager_scheduler_address=resourcemanager:8030
- YARN_CONF_yarn_resourcemanager_resource_tracker_address=resourcemanager:8031
- YARN_CONF_yarn_resourcemanager_admin_address=resourcemanager:8033
- HADOOP_CONF_DIR=/etc/hadoop
ports:
- "8042:8042"
depends_on:
- resourcemanager
- datanode

volumes:
namenode_data:
datanode_data:
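
With the services defined above, a quick health check from the host confirms the pseudo-cluster is actually wired together. A minimal sketch; container names match the compose file, and the commands are standard Hadoop CLI:

```bash
# Start the stack in the background.
docker-compose up -d

# The NameNode should report one live DataNode.
docker exec namenode hdfs dfsadmin -report

# The ResourceManager should list the NodeManager as RUNNING.
docker exec resourcemanager yarn node -list -all
```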
100 changes: 100 additions & 0 deletions pom.xml
@@ -0,0 +1,100 @@
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
<modelVersion>4.0.0</modelVersion>

<groupId>ru.ifmo</groupId>
<artifactId>lab3</artifactId>
<version>1.0</version>
<packaging>jar</packaging>

<name>lab3</name>

<parent>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-parent</artifactId>
<version>3.4.12</version>
</parent>

<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<start-class>ru.ifmo.sales.App</start-class>
<java.version>8</java.version>
<maven.compiler.release>8</maven.compiler.release>
<maven.compiler.source>8</maven.compiler.source>
<maven.compiler.target>8</maven.compiler.target>
<lombok.version>1.18.30</lombok.version>
<hadoop.version>3.3.6</hadoop.version>
</properties>

<dependencies>
<dependency>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
<version>${lombok.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-databind</artifactId>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-core</artifactId>
</dependency>

<!-- Hadoop dependencies -->
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>${hadoop.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
<version>${hadoop.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-hdfs</artifactId>
<version>${hadoop.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-mapreduce-client-core</artifactId>
<version>${hadoop.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-mapreduce-client-common</artifactId>
<version>${hadoop.version}</version>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-plugin</artifactId>
<configuration>
<argLine>--add-opens java.base/java.lang=ALL-UNNAMED</argLine>
</configuration>
</plugin>
<!-- <plugin>-->
<!-- <groupId>org.springframework.boot</groupId>-->
<!-- <artifactId>spring-boot-maven-plugin</artifactId>-->
<!-- <configuration>-->
<!-- <jvmArguments>-->
<!-- &#45;&#45;add-opens java.base/java.lang=ALL-UNNAMED-->
<!-- &#45;&#45;add-opens java.base/java.util=ALL-UNNAMED-->
<!-- </jvmArguments>-->
<!-- </configuration>-->
<!-- <dependencies>-->
<!-- <dependency>-->
<!-- <groupId>org.springframework</groupId>-->
<!-- <artifactId>springloaded</artifactId>-->
<!-- <version>3.4.12</version>-->
<!-- </dependency>-->
<!-- </dependencies>-->
<!-- </plugin>-->
</plugins>
</build>
</project>
79 changes: 79 additions & 0 deletions rerun.sh
@@ -0,0 +1,79 @@
#!/bin/bash

set -e

echo "Запуск MapReduce для анализа продаж"

REDUCERS=4
BLOCK_SIZE_KB=1024

read -p "Количество reducer-ов [$REDUCERS]: " user_reducers
read -p "Размер блока в KB [$BLOCK_SIZE_KB]: " user_block_size

REDUCERS=${user_reducers:-$REDUCERS}
BLOCK_SIZE_KB=${user_block_size:-$BLOCK_SIZE_KB}

echo "1. Запуск Hadoop кластера..."
docker-compose up -d

echo "Ожидание запуска NameNode..."
while ! curl -s http://localhost:9870 > /dev/null; do
sleep 2
echo -n "."
done
echo " Готово!"

echo "Ожидание запуска DataNode..."
while ! curl -s http://localhost:9864 > /dev/null; do
sleep 2
echo -n "."
done
echo " Готово!"

echo "Ожидание запуска ResourceManager..."
while ! curl -s http://localhost:8088 > /dev/null; do
sleep 2
echo -n "."
done
echo " Готово!"

echo "Ожидание запуска NodeManager..."
while ! curl -s http://localhost:8042 > /dev/null; do
sleep 2
echo -n "."
done
echo " Готово!"

echo "2. Сборка проекта..."
mvn clean package

echo "3. Подготовка HDFS..."

docker exec namenode hdfs dfs -rm -r /output /output-temp 2>/dev/null || true

echo "5. Подготовка JAR файла..."
docker cp ./target/lab3-1.0.jar namenode:/tmp/sales-app.jar

echo "6. Запуск MapReduce задания на YARN..."

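# The four arguments are presumably consumed by the driver as <input dir> <output dir> <reducer count> <block size in KB>.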
docker exec namenode hadoop jar /tmp/sales-app.jar /input /output $REDUCERS $BLOCK_SIZE_KB

echo "7. Получение результатов..."

if docker exec namenode hdfs dfs -test -e /output/_SUCCESS; then
    echo "Job completed successfully!"
    > results.txt
    echo "Category,Revenue,Quantity" >> results.txt

    docker exec namenode hdfs dfs -ls /output/part-r-* 2>/dev/null | \
        awk '{print $NF}' | \
        while read -r file; do
            docker exec namenode hdfs dfs -cat "$file" 2>/dev/null >> results.txt
            echo "" >> results.txt
        done

    echo "Results saved to results.txt"
else
    echo "Error: the job did not finish successfully"
    exit 1
fi
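
Note that rerun.sh clears /output but never repopulates /input, so it assumes a previous run of run.sh (below) already loaded the CSV files. A quick pre-flight check before invoking it:

```bash
# Abort early if there is nothing to process in HDFS.
docker exec namenode hdfs dfs -test -d /input \
    && docker exec namenode hdfs dfs -ls /input \
    || echo "/input is missing - run run.sh first"
```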
101 changes: 101 additions & 0 deletions run.sh
@@ -0,0 +1,101 @@
#!/bin/bash

set -e

echo "Запуск MapReduce для анализа продаж"

REDUCERS=4
BLOCK_SIZE_KB=1024

read -p "Количество reducer-ов [$REDUCERS]: " user_reducers
read -p "Размер блока в KB [$BLOCK_SIZE_KB]: " user_block_size

REDUCERS=${user_reducers:-$REDUCERS}
BLOCK_SIZE_KB=${user_block_size:-$BLOCK_SIZE_KB}

echo "1. Запуск Hadoop кластера..."
docker-compose up -d

echo "Ожидание запуска NameNode..."
while ! curl -s http://localhost:9870 > /dev/null; do
sleep 2
echo -n "."
done
echo " Готово!"

echo "Ожидание запуска DataNode..."
while ! curl -s http://localhost:9864 > /dev/null; do
sleep 2
echo -n "."
done
echo " Готово!"

echo "Ожидание запуска ResourceManager..."
while ! curl -s http://localhost:8088 > /dev/null; do
sleep 2
echo -n "."
done
echo " Готово!"

echo "Ожидание запуска NodeManager..."
while ! curl -s http://localhost:8042 > /dev/null; do
sleep 2
echo -n "."
done
echo " Готово!"

echo "2. Сборка проекта..."
mvn clean package

echo "3. Подготовка HDFS..."

docker exec namenode hdfs dfs -rm -r /input /output /output-temp 2>/dev/null || true
docker exec namenode hdfs dfs -mkdir -p /input


echo "4. Копирование CSV файлов в HDFS..."

docker exec namenode rm -rf /tmp/*.csv 2>/dev/null || true

for csv_file in ./*.csv; do
    if [ -f "$csv_file" ]; then
        filename=$(basename "$csv_file")
        echo "Copying $filename..."

        docker cp "$csv_file" namenode:/tmp/
        docker exec namenode hdfs dfs -put -f /tmp/"$filename" /input/
    fi
done

echo "5. Подготовка JAR файла..."
docker cp ./target/lab3-1.0.jar namenode:/tmp/sales-app.jar

echo "6. Запуск MapReduce задания на YARN..."

docker exec namenode hadoop jar /tmp/sales-app.jar /input /output $REDUCERS $BLOCK_SIZE_KB

echo "7. Получение результатов..."

if docker exec namenode hdfs dfs -test -e /output/_SUCCESS; then
    echo "Job completed successfully!"
    > results.txt
    echo "Category,Revenue,Quantity" >> results.txt

    docker exec namenode hdfs dfs -ls /output/part-r-* 2>/dev/null | \
        awk '{print $NF}' | \
        while read -r file; do
            docker exec namenode hdfs dfs -cat "$file" 2>/dev/null >> results.txt
            echo "" >> results.txt
        done

    echo "Results saved to results.txt"
else
    echo "Error: the job did not finish successfully"
    exit 1
fi

read -p "Остановить Hadoop кластер? (y/N): " stop_cluster
if [[ $stop_cluster == "y" || $stop_cluster == "Y" ]]; then
echo "8. Остановка кластера..."
docker-compose down
fi
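
For unattended use, the script's three prompts (reducer count, block size, and the shutdown question) can be answered from stdin. A sketch that keeps the defaults and leaves the cluster running:

```bash
chmod +x run.sh
# Two empty lines accept REDUCERS=4 and BLOCK_SIZE_KB=1024; the trailing "n" declines the shutdown prompt.
printf '\n\nn\n' | ./run.sh
```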