Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
.gradle/
build/
!gradle/wrapper/gradle-wrapper.jar
.idea/

docker/*.jar

docker/output/*
!docker/output/.keep

docker/reports/*
!docker/reports/.keep

input-data/*
!input-data/.keep

*.log
logs/
3 changes: 0 additions & 3 deletions 0.csv

This file was deleted.

3 changes: 0 additions & 3 deletions 1.csv

This file was deleted.

3 changes: 0 additions & 3 deletions 2.csv

This file was deleted.

3 changes: 0 additions & 3 deletions 3.csv

This file was deleted.

3 changes: 0 additions & 3 deletions 4.csv

This file was deleted.

3 changes: 0 additions & 3 deletions 5.csv

This file was deleted.

3 changes: 0 additions & 3 deletions 6.csv

This file was deleted.

3 changes: 0 additions & 3 deletions 7.csv

This file was deleted.

1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
[![Review Assignment Due Date](https://classroom.github.com/assets/deadline-readme-button-22041afd0340ce965d47ae6ef1cefeee28c7c493a6346c4f15d667ab976d596c.svg)](https://classroom.github.com/a/uyodabcP)
## Лабораторная работа: Реализация MapReduce для анализа данных о продажах с использованием HADOOP!!!
# Цель работы

Expand Down
42 changes: 42 additions & 0 deletions build.gradle.kts
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
// Gradle build script (Kotlin DSL) for the sales-analytics MapReduce job JAR.
plugins {
java
}

group = "com.vitaya.para"
version = "1.0-SNAPSHOT"

repositories {
mavenCentral()
}

// Compile against and emit bytecode for Java 8.
java {
sourceCompatibility = JavaVersion.VERSION_1_8
targetCompatibility = JavaVersion.VERSION_1_8
}

// compileOnly: the Hadoop libraries are on the compile classpath only and are
// not a runtime dependency of this project.
// NOTE(review): presumably the Hadoop 3.2.1 cluster supplies them when the job
// runs — confirm against the runtime images.
dependencies {
compileOnly("org.apache.hadoop:hadoop-client:3.2.1")
compileOnly("org.apache.hadoop:hadoop-common:3.2.1")
compileOnly("org.apache.hadoop:hadoop-mapreduce-client-core:3.2.1")
}

// Configure the JAR: set the Main-Class manifest attribute and the archive
// base name (combined with `version` above this yields
// sales-analytics-1.0-SNAPSHOT.jar).
tasks.jar {
manifest {
attributes(
"Main-Class" to "com.vitaya.para.sales.SalesDriver"
)
}
archiveBaseName.set("sales-analytics")

// When the same entry path appears more than once, keep only the first one.
duplicatesStrategy = DuplicatesStrategy.EXCLUDE
}

// Copies the built JAR into the `docker` directory of the project.
tasks.register<Copy>("copyJarToDocker") {
dependsOn(tasks.jar)
from(tasks.jar.get().archiveFile)
into("docker")
}

// Make `build` also run the copy task above.
tasks.build {
dependsOn("copyJarToDocker")
}
15 changes: 15 additions & 0 deletions docker/Dockerfile.build
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Builder image: compiles the project JAR with Gradle 7.6 on JDK 8.
FROM gradle:7.6-jdk8

WORKDIR /app

# Copy gradle files first for caching
COPY build.gradle.kts settings.gradle.kts ./

# Copy source code
COPY src ./src

# Build the JAR with the Gradle installation bundled in the base image.
# The wrapper script (gradlew) and the gradle/ directory are not copied into
# this image, so `./gradlew` does not exist here and must not be invoked.
RUN gradle clean build --no-daemon

# The JAR will be in /app/build/libs/
24 changes: 24 additions & 0 deletions docker/build.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
#!/bin/bash

# Build the sales-analytics JAR using Docker (Java 8 environment)
# This script builds the project in a container and copies the JAR to the docker directory
#
# Usage: ./build.sh
# Requires: a working `docker` CLI.

set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_DIR="$(dirname "$SCRIPT_DIR")"
readonly SCRIPT_DIR PROJECT_DIR
readonly IMAGE_NAME="sales-analytics-builder"
readonly JAR_NAME="sales-analytics-1.0-SNAPSHOT.jar"

# ID of the temporary container used to extract the JAR; empty until created.
CONTAINER_ID=""

# Remove the temporary container on every exit path, so a failing `docker cp`
# does not leave a stopped container behind.
cleanup() {
  if [[ -n "$CONTAINER_ID" ]]; then
    # Best effort: a failed removal must not mask the script's real exit status.
    docker rm "$CONTAINER_ID" >/dev/null || true
  fi
}
trap cleanup EXIT

echo "=== Building Sales Analytics JAR ==="
echo "Project directory: $PROJECT_DIR"

# Build in Docker container
docker build -t "$IMAGE_NAME" -f "$SCRIPT_DIR/Dockerfile.build" "$PROJECT_DIR"

# Extract the JAR from the container (created but never started)
CONTAINER_ID=$(docker create "$IMAGE_NAME")
docker cp "$CONTAINER_ID:/app/build/libs/$JAR_NAME" "$SCRIPT_DIR/"

echo ""
echo "=== Build complete ==="
echo "JAR file: $SCRIPT_DIR/$JAR_NAME"
98 changes: 98 additions & 0 deletions docker/compose.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
services:
namenode:
image: bde2020/hadoop-namenode:2.0.0-hadoop3.2.1-java8
container_name: namenode
hostname: namenode
ports:
- "9870:9870"
- "9000:9000"
environment:
- CLUSTER_NAME=sales-analytics
env_file:
- ./hadoop.env
volumes:
- namenode_data:/hadoop/dfs/name
networks:
- hadoop

datanode:
image: bde2020/hadoop-datanode:2.0.0-hadoop3.2.1-java8
container_name: datanode
hostname: datanode
environment:
- SERVICE_PRECONDITION=namenode:9870
env_file:
- ./hadoop.env
volumes:
- datanode_data:/hadoop/dfs/data
networks:
- hadoop
depends_on:
- namenode

resourcemanager:
image: bde2020/hadoop-resourcemanager:2.0.0-hadoop3.2.1-java8
container_name: resourcemanager
hostname: resourcemanager
ports:
- "8088:8088"
environment:
- SERVICE_PRECONDITION=namenode:9000 namenode:9870 datanode:9864
env_file:
- ./hadoop.env
volumes:
- ./sales-analytics-1.0-SNAPSHOT.jar:/opt/hadoop/jobs/sales-analytics-1.0-SNAPSHOT.jar:ro
- ../input-data:/opt/hadoop/input:ro
- ./reports:/opt/hadoop/reports
- ./output:/opt/hadoop/output
- ./run-job.sh:/opt/hadoop/run-job.sh:ro
networks:
- hadoop
depends_on:
- namenode
- datanode

nodemanager:
image: bde2020/hadoop-nodemanager:2.0.0-hadoop3.2.1-java8
container_name: nodemanager
hostname: nodemanager
ports:
- "8042:8042"
environment:
- SERVICE_PRECONDITION=namenode:9000 namenode:9870 datanode:9864 resourcemanager:8088
env_file:
- ./hadoop.env
networks:
- hadoop
depends_on:
- namenode
- datanode
- resourcemanager

historyserver:
image: bde2020/hadoop-historyserver:2.0.0-hadoop3.2.1-java8
container_name: historyserver
hostname: historyserver
ports:
- "8188:8188"
environment:
- SERVICE_PRECONDITION=namenode:9000 namenode:9870 datanode:9864 resourcemanager:8088
env_file:
- ./hadoop.env
volumes:
- historyserver_data:/hadoop/yarn/timeline
networks:
- hadoop
depends_on:
- namenode
- datanode
- resourcemanager

volumes:
namenode_data:
datanode_data:
historyserver_data:

networks:
hadoop:
driver: bridge
48 changes: 48 additions & 0 deletions docker/hadoop.env
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# Core configuration
CORE_CONF_fs_defaultFS=hdfs://namenode:9000
CORE_CONF_hadoop_http_staticuser_user=root
CORE_CONF_hadoop_proxyuser_hue_hosts=*
CORE_CONF_hadoop_proxyuser_hue_groups=*
CORE_CONF_io_compression_codecs=org.apache.hadoop.io.compress.SnappyCodec

# HDFS configuration
HDFS_CONF_dfs_webhdfs_enabled=true
HDFS_CONF_dfs_permissions_enabled=false
HDFS_CONF_dfs_replication=1
HDFS_CONF_dfs_namenode_datanode_registration_ip___hostname___check=false

# YARN configuration
YARN_CONF_yarn_log___aggregation___enable=true
YARN_CONF_yarn_log_server_url=http://historyserver:8188/applicationhistory/logs/
YARN_CONF_yarn_resourcemanager_recovery_enabled=true
YARN_CONF_yarn_resourcemanager_store_class=org.apache.hadoop.yarn.server.resourcemanager.recovery.FileSystemRMStateStore
YARN_CONF_yarn_resourcemanager_scheduler_class=org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacityScheduler
YARN_CONF_yarn_scheduler_capacity_root_default_maximum___allocation___mb=8192
YARN_CONF_yarn_scheduler_capacity_root_default_maximum___allocation___vcores=4
YARN_CONF_yarn_resourcemanager_fs_state___store_uri=/rmstate
YARN_CONF_yarn_resourcemanager_system___metrics___publisher_enabled=true
YARN_CONF_yarn_resourcemanager_hostname=resourcemanager
YARN_CONF_yarn_resourcemanager_address=resourcemanager:8032
YARN_CONF_yarn_resourcemanager_scheduler_address=resourcemanager:8030
YARN_CONF_yarn_resourcemanager_resource__tracker_address=resourcemanager:8031
YARN_CONF_yarn_timeline___service_enabled=true
YARN_CONF_yarn_timeline___service_generic___application___history_enabled=true
YARN_CONF_yarn_timeline___service_hostname=historyserver
YARN_CONF_mapreduce_map_output_compress=true
YARN_CONF_mapred_map_output_compress_codec=org.apache.hadoop.io.compress.SnappyCodec
YARN_CONF_yarn_nodemanager_resource_memory___mb=8192
YARN_CONF_yarn_nodemanager_resource_cpu___vcores=4
YARN_CONF_yarn_nodemanager_disk___health___checker_max___disk___utilization___per___disk___percentage=98.5
YARN_CONF_yarn_nodemanager_remote___app___log___dir=/app-logs
YARN_CONF_yarn_nodemanager_aux___services=mapreduce_shuffle

# MapReduce configuration
MAPRED_CONF_mapreduce_framework_name=yarn
MAPRED_CONF_mapred_child_java_opts=-Xmx1536m
MAPRED_CONF_mapreduce_map_memory_mb=2048
MAPRED_CONF_mapreduce_reduce_memory_mb=2048
MAPRED_CONF_mapreduce_map_java_opts=-Xmx1536m
MAPRED_CONF_mapreduce_reduce_java_opts=-Xmx1536m
MAPRED_CONF_yarn_app_mapreduce_am_env=HADOOP_MAPRED_HOME=/opt/hadoop-3.2.1/
MAPRED_CONF_mapreduce_map_env=HADOOP_MAPRED_HOME=/opt/hadoop-3.2.1/
MAPRED_CONF_mapreduce_reduce_env=HADOOP_MAPRED_HOME=/opt/hadoop-3.2.1/
Empty file added docker/output/.keep
Empty file.
Empty file added docker/reports/.keep
Empty file.
45 changes: 45 additions & 0 deletions docker/run-job.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
#!/bin/bash

# Sales Analytics MapReduce Job Runner
#
# Uploads the CSV files from the local input directory to HDFS, runs the job
# JAR, prints the job output and saves a copy to a local results file.
#
# Environment:
#   NUM_REDUCERS  reducer count passed to the job as its third argument
#                 (default: 2)

# -u catches misspelled variable names; pipefail makes the final
# `hdfs dfs -cat | tee` pipeline fail when reading the job output fails
# (without it only tee's exit status would count).
set -euo pipefail

NUM_REDUCERS=${NUM_REDUCERS:-2}

readonly JAR_PATH="/opt/hadoop/jobs/sales-analytics-1.0-SNAPSHOT.jar"
readonly INPUT_LOCAL="/opt/hadoop/input"
readonly OUTPUT_LOCAL="/opt/hadoop/output"
readonly HDFS_INPUT="/input"
readonly HDFS_OUTPUT="/output"

echo "=== Sales Analytics MapReduce Job ==="
echo "Number of reducers: $NUM_REDUCERS"
echo ""

# Fail fast on bad configuration instead of after the cluster wait.
if ! [[ "$NUM_REDUCERS" =~ ^[0-9]+$ ]]; then
  echo "NUM_REDUCERS must be a non-negative integer, got: '$NUM_REDUCERS'" >&2
  exit 2
fi

# Resolve the local input files up front so that an empty input directory is
# reported clearly rather than as a failed upload of a literal '*.csv' path.
shopt -s nullglob
csv_files=("$INPUT_LOCAL"/*.csv)
shopt -u nullglob
if (( ${#csv_files[@]} == 0 )); then
  echo "No .csv files found in $INPUT_LOCAL" >&2
  exit 1
fi

# Wait for HDFS to be ready
# Errors are silenced on purpose: the command is retried until it succeeds.
echo "Waiting for HDFS..."
until hdfs dfsadmin -safemode wait 2>/dev/null; do
  sleep 5
done
echo "HDFS is ready."

# Setup HDFS
# The glob is quoted so that it is expanded by the HDFS shell against HDFS
# paths, not by bash against the local filesystem. Clean-up is best effort.
hdfs dfs -mkdir -p "$HDFS_INPUT"
hdfs dfs -rm -r -f "$HDFS_INPUT/*" 2>/dev/null || true
hdfs dfs -rm -r -f "$HDFS_OUTPUT" 2>/dev/null || true

# Copy input data
echo "Copying input data to HDFS..."
hdfs dfs -put "${csv_files[@]}" "$HDFS_INPUT/"

# Run MapReduce job
echo ""
hadoop jar "$JAR_PATH" "$HDFS_INPUT" "$HDFS_OUTPUT" "$NUM_REDUCERS"

# Print the job output and save a local copy. No sorting is done here; the
# lines appear in whatever order the part files contain them.
echo ""
echo "=== Results ==="
hdfs dfs -cat "$HDFS_OUTPUT/part-*" | tee "$OUTPUT_LOCAL/results.txt"

echo ""
echo "Results saved to: $OUTPUT_LOCAL/results.txt"
Binary file added gradle/wrapper/gradle-wrapper.jar
Binary file not shown.
7 changes: 7 additions & 0 deletions gradle/wrapper/gradle-wrapper.properties
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
distributionBase=GRADLE_USER_HOME
distributionPath=wrapper/dists
distributionUrl=https\://services.gradle.org/distributions/gradle-8.14-bin.zip
networkTimeout=10000
validateDistributionUrl=true
zipStoreBase=GRADLE_USER_HOME
zipStorePath=wrapper/dists
Loading