diff --git a/.gitignore b/.gitignore
index 68bc17f9..b41c7669 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,7 @@
+####### Large files from missions
+w2/sentiment_analysis/tweets.csv
+w2/sentiment_analysis/webtoon_analysis/webtoon_comments.db
+
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
diff --git a/missions/W1/mtcars.csv b/missions/W1/mtcars.csv
deleted file mode 100644
index a22b9c21..00000000
--- a/missions/W1/mtcars.csv
+++ /dev/null
@@ -1,33 +0,0 @@
-"","mpg","cyl","disp","hp","drat","wt","qsec","vs","am","gear","carb"
-"Mazda RX4",21,6,160,110,3.9,2.62,16.46,0,1,4,4
-"Mazda RX4 Wag",21,6,160,110,3.9,2.875,17.02,0,1,4,4
-"Datsun 710",22.8,4,108,93,3.85,2.32,18.61,1,1,4,1
-"Hornet 4 Drive",21.4,6,258,110,3.08,3.215,19.44,1,0,3,1
-"Hornet Sportabout",18.7,8,360,175,3.15,3.44,17.02,0,0,3,2
-"Valiant",18.1,6,225,105,2.76,3.46,20.22,1,0,3,1
-"Duster 360",14.3,8,360,245,3.21,3.57,15.84,0,0,3,4
-"Merc 240D",24.4,4,146.7,62,3.69,3.19,20,1,0,4,2
-"Merc 230",22.8,4,140.8,95,3.92,3.15,22.9,1,0,4,2
-"Merc 280",19.2,6,167.6,123,3.92,3.44,18.3,1,0,4,4
-"Merc 280C",17.8,6,167.6,123,3.92,3.44,18.9,1,0,4,4
-"Merc 450SE",16.4,8,275.8,180,3.07,4.07,17.4,0,0,3,3
-"Merc 450SL",17.3,8,275.8,180,3.07,3.73,17.6,0,0,3,3
-"Merc 450SLC",15.2,8,275.8,180,3.07,3.78,18,0,0,3,3
-"Cadillac Fleetwood",10.4,8,472,205,2.93,5.25,17.98,0,0,3,4
-"Lincoln Continental",10.4,8,460,215,3,5.424,17.82,0,0,3,4
-"Chrysler Imperial",14.7,8,440,230,3.23,5.345,17.42,0,0,3,4
-"Fiat 128",32.4,4,78.7,66,4.08,2.2,19.47,1,1,4,1
-"Honda Civic",30.4,4,75.7,52,4.93,1.615,18.52,1,1,4,2
-"Toyota Corolla",33.9,4,71.1,65,4.22,1.835,19.9,1,1,4,1
-"Toyota Corona",21.5,4,120.1,97,3.7,2.465,20.01,1,0,3,1
-"Dodge Challenger",15.5,8,318,150,2.76,3.52,16.87,0,0,3,2
-"AMC Javelin",15.2,8,304,150,3.15,3.435,17.3,0,0,3,2
-"Camaro Z28",13.3,8,350,245,3.73,3.84,15.41,0,0,3,4
-"Pontiac Firebird",19.2,8,400,175,3.08,3.845,17.05,0,0,3,2
-"Fiat X1-9",27.3,4,79,66,4.08,1.935,18.9,1,1,4,1
-"Porsche 914-2",26,4,120.3,91,4.43,2.14,16.7,0,1,5,2
-"Lotus Europa",30.4,4,95.1,113,3.77,1.513,16.9,1,1,5,2
-"Ford Pantera L",15.8,8,351,264,4.22,3.17,14.5,0,1,5,4
-"Ferrari Dino",19.7,6,145,175,3.62,2.77,15.5,0,1,5,6
-"Maserati Bora",15,8,301,335,3.54,3.57,14.6,0,1,5,8
-"Volvo 142E",21.4,4,121,109,4.11,2.78,18.6,1,1,4,2
diff --git a/slides/W1 Introduction to Data Engineering.pdf b/slides/W1 Introduction to Data Engineering.pdf
deleted file mode 100644
index 50229c2d..00000000
Binary files a/slides/W1 Introduction to Data Engineering.pdf and /dev/null differ
diff --git a/slides/W2 Introduction to Big Data.pdf b/slides/W2 Introduction to Big Data.pdf
deleted file mode 100644
index 6511b2c6..00000000
Binary files a/slides/W2 Introduction to Big Data.pdf and /dev/null differ
diff --git a/w5/m2/W5M2.ipynb b/w5/m2/W5M2.ipynb
new file mode 100644
index 00000000..ba19af4a
--- /dev/null
+++ b/w5/m2/W5M2.ipynb
@@ -0,0 +1,157 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# W5M2 - Optimization"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Library and Session Setup"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pyspark import SparkConf\n",
+    "from pyspark.sql import SparkSession, Row\n",
+    "from pyspark.sql.functions import isnull, avg, min, date_format\n",
+    "from operator import add\n",
+    "\n",
+    "spark = SparkSession.builder \\\n",
+    "    .master('spark://spark-master:7077') \\\n",
+    "    .appName('W5M2') \\\n",
+    "    .config('spark.executor.memory', '4gb') \\\n",
+    "    .config(\"spark.executor.cores\", \"5\") \\\n",
+    "    .getOrCreate()\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Data Loading"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "TLC_data_path = 'hdfs://spark-master:9000/user/hduser/hdfs_data/fhvhv_tripdata_2023-01.parquet'\n",
+    "weather_data_path = 'hdfs://spark-master:9000/user/hduser/hdfs_data/weather.csv'\n",
+    "output_dir_path = 'hdfs://spark-master:9000/user/spark_user/W5M2_output/'\n",
+    "tlc_ext = 'parquet'\n",
+    "weather_ext = 'csv'\n",
+    "\n",
+    "def load_dataframe(spark_session, file_path, extension):\n",
+    "    if extension == \"csv\":\n",
+    "        df = spark_session.read.csv(file_path, header=True, inferSchema=True)\n",
+    "    elif extension == \"parquet\":\n",
+    "        df = spark_session.read.parquet(file_path)\n",
+    "    else:\n",
+    "        raise NotImplementedError(\"Unsupported file extension.\")\n",
+    "    return df\n",
+    "\n",
+    "df = load_dataframe(spark, TLC_data_path, tlc_ext)\n",
+    "print(\"- The schema of the TLC DataFrame - \\n\", df.schema)\n",
+    "df.show(1, vertical=True)\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Data Cleaning"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Drop rows with null values and filter out non-positive fares and driver pay\n",
+    "df = df.na.drop('any').filter(df.driver_pay > 0).filter(df.base_passenger_fare > 0)\n",
+    "df.show(5)  # Check the top 5 rows after cleaning"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Data Transformation"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Derive a pickup date column, project the needed columns, and cache for reuse\n",
+    "df = df.withColumn(\"pickup_date\", date_format(df.pickup_datetime, 'yyyy-MM-dd'))\n",
+    "df = df.select(\"pickup_date\", \"base_passenger_fare\", \"trip_miles\")\n",
+    "df.cache()\n",
+    "\n",
+    "short_trip_df = df.filter(df.trip_miles < 10)\n",
+    "per_day_total_revenue_df = df.groupBy(\"pickup_date\").sum(\"base_passenger_fare\").orderBy(\"pickup_date\")\n",
+    "per_day_avg_trip_miles_df = df.groupBy(\"pickup_date\").mean(\"trip_miles\").orderBy(\"pickup_date\")\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Data Actions and Saving"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Execute actions to trigger the transformations\n",
+    "print(\"Sample Short Trip Data: \", short_trip_df.take(1))\n",
+    "print(\"Sample Per Day Total Revenue: \", per_day_total_revenue_df.take(1))\n",
+    "print(\"Sample Per Day Average Trip Miles: \", per_day_avg_trip_miles_df.take(1))\n",
+    "\n",
+    "# Save the results as single-partition CSV files\n",
+    "df.coalesce(1).write.mode('overwrite').csv(output_dir_path + \"df\")\n",
+    "short_trip_df.coalesce(1).write.mode('overwrite').csv(output_dir_path + \"short_trip_df\")\n",
+    "per_day_total_revenue_df.coalesce(1).write.mode('overwrite').csv(output_dir_path + \"per_day_total_revenue_df\")\n",
+    "per_day_avg_trip_miles_df.coalesce(1).write.mode('overwrite').csv(output_dir_path + \"per_day_avg_trip_miles_df\")\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "base",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
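# --- Reviewer note (illustrative sketch, not part of the patch) ---
# The notebook caches `df` before deriving three child DataFrames from it, so
# each subsequent action reuses the materialized parent instead of re-reading
# the Parquet source and re-running the cleaning/transformation lineage. A
# minimal, self-contained sketch of that pattern on synthetic data (the column
# names mirror the notebook; the local master and sample rows are assumptions
# made up for illustration):
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[*]").appName("cache-sketch").getOrCreate()

df = spark.createDataFrame(
    [("2023-01-01", 25.0, 3.2), ("2023-01-01", 40.0, 12.5), ("2023-01-02", 18.0, 1.1)],
    ["pickup_date", "base_passenger_fare", "trip_miles"],
)
df.cache()  # filled on the first action below, reused by every later branch

short_trip_df = df.filter(df.trip_miles < 10)
per_day_total_revenue_df = df.groupBy("pickup_date").sum("base_passenger_fare")

short_trip_df.show()             # triggers the cache fill
per_day_total_revenue_df.show()  # served from the cache, not from source

spark.stop()
# --- end reviewer note ---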
"nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/w5/m2/docker-compose.yml b/w5/m2/docker-compose.yml new file mode 100644 index 00000000..d521b2d4 --- /dev/null +++ b/w5/m2/docker-compose.yml @@ -0,0 +1,78 @@ +services: + spark-master: + container_name: spark-master + hostname: spark-master + build: . + image: spark-standalone-cluster + entrypoint: ['./entrypoint.sh', 'master'] + volumes: + - spark-logs:/home/spark_user/spark/spark-events + - W4M2:/home/spark_user/code + - namenode:/home/hduser/data + ports: + - '8080:8080' + - '7077:7077' + - '8888:8888' + - "9870:9870" + - "8088:8088" + networks: + - spark + + spark-history-server: + container_name: spark-history + hostname: spark-history-server + build: . + entrypoint: ['./entrypoint.sh', 'history'] + depends_on: + - spark-master + volumes: + - spark-logs:/home/spark_user/spark/spark-events + - datanode0:/home/hduser/data + ports: + - '18080:18080' + networks: + - spark + + spark-worker1: + container_name: spark-worker1 + hostname: spark-worker1 + build: . + entrypoint: ['./entrypoint.sh', 'worker'] + depends_on: + - spark-master + volumes: + - spark-logs:/home/spark_user/spark/spark-events + - datanode1:/home/hduser/data + + ports: + - '11111:8081' + networks: + - spark + + spark-worker2: + container_name: spark-worker2 + hostname: spark-worker2 + build: . + entrypoint: ['./entrypoint.sh', 'worker'] + depends_on: + - spark-master + volumes: + - spark-logs:/home/spark_user/spark/spark-events + - datanode2:/home/hduser/data + + ports: + - '22222:8081' + networks: + - spark + +volumes: + spark-logs: + W4M2: + namenode: + datanode0: + datanode1: + datanode2: + +networks: + spark: + driver: bridge \ No newline at end of file