From a36e20902317683c023a445a3e2a7cfde283e033 Mon Sep 17 00:00:00 2001 From: Fan Xu Date: Fri, 13 Dec 2024 19:55:10 -0800 Subject: [PATCH] final changes --- README.md | 54 ++++++++++++++++++++++++++++- __pycache__/script.cpython-312.pyc | Bin 2704 -> 2564 bytes lib.py | 8 ++--- output.md | 36 +++++++++---------- script.py | 20 ++--------- test_lib.py | 2 +- 6 files changed, 77 insertions(+), 43 deletions(-) diff --git a/README.md b/README.md index b204246..179a24c 100644 --- a/README.md +++ b/README.md @@ -1 +1,53 @@ -# Pyspark \ No newline at end of file +# PySpark + +[![CI](https://github.com/nogibjj/fan_xu_pyspark/actions/workflows/cicd.yml/badge.svg)](https://github.com/nogibjj/fan_xu_pyspark/actions/workflows/cicd.yml) + +The purpose of this project is to demonstrate PySpark functionality on a dataset about the statistics of NBA Players. The dataset is queried and a transformation is done with the output shown in a markdown file. + +## Requirements + +- Use PySpark to perform data processing on a large dataset + +- Include at least one Spark SQL query and one data transformation + +## Project Structure + +``` +📦 fan_xu_pyspark +.github +workflows +cicd.yml +Makefile +NBA_24_stats.csv +README.md +__pycache__ +script.cpython-312.pyc +gitignore +lib.py +output.md +requirements.txt +script.py +test_lib.py +``` +©generated by [Project Tree Generator](https://woochanleee.github.io/project-tree-generator) + +# Highlights + +- EDA + +The first 3 rows are displayed along with summary statistics for the age, assists, and steals columns + +- Query + +The top 10 highest-scoring players are queried + +- Transformation + +A column is added to show the assist/turnover ratio of the players + +# Installation + +Requirements: +- Python +- PySpark +- Java \ No newline at end of file diff --git a/__pycache__/script.cpython-312.pyc b/__pycache__/script.cpython-312.pyc index 56c17d4b3e0dc6f404de18d3c5ba23f49701cf40..f8df64ecccd31ce812ecc4a2f30b18db7712b21c 100644 GIT binary
patch delta 476 zcmYLFO-sW-5Y41XQ>&3uEm#qaqK9D7wiHni!Ov1q5!8bsme@_T&^C3F`f>19JgYf) z5$_(gH-CbjyyjO(dk{Q`vr%zj-_E=@v-5VZVvSz)RaL_Rt>>@3!IWvsLeNJ3?SxyghqBEMC-B_J}o&%i&nW_sp0@p*vH|2s^|~<ET8i;ogX z{~Ofppi#Ap7FM`g#WZPI7dwhfrD$CTs3Z^^!v%P zArm_k!mw&L9 zZ#T(y%%XZQNO$ihVHU-BnJm1d{H%?#>x75m_1o* M!R+Zehs|=%U(0EVtN;K2 diff --git a/lib.py b/lib.py index 8f3312c..358ac4c 100644 --- a/lib.py +++ b/lib.py @@ -5,8 +5,6 @@ def save_to_markdown(filename, content): """ Save the given content to a markdown file. - :param filename: Name of the markdown file. - :param content: Content to write into the file. """ with open(filename, "w") as f: f.write(content) @@ -39,14 +37,14 @@ def main(): process_output = process_data(spark, df) markdown_content += "## SQL Queries\n\n" markdown_content += ( - "### Query 1: Top 10 High-Scoring Games\n\n" + process_output["query"] + "\n\n" + "### Top 10 Highest Scoring Players\n\n" + process_output["query"] + "\n\n" ) # Transformation 1 winner_output = transformation(df) - markdown_content += "## Data Transformation 1\n\n" + markdown_content += "## Data Transformation\n\n" markdown_content += ( - "### Creating a winner column for each game and calculate the point difference (1st 10 rows)\n\n" + "### Creating a column to calculate Assist/Turnover Ratio (1st 10 rows)\n\n" + winner_output + "\n\n" ) diff --git a/output.md b/output.md index 4b525f8..5cbd207 100644 --- a/output.md +++ b/output.md @@ -28,24 +28,24 @@ Data loaded successfully. 
## SQL Queries -### Query 1: Top 10 High-Scoring Games - -| Player | Team | Pos | -|:------------------------|:-------|:------| -| De'Aaron Fox | SAC | PG | -| Jayson Tatum | BOS | PF | -| Kevin Durant | PHO | PF | -| Devin Booker | PHO | PG | -| Jalen Brunson | NYK | PG | -| Shai Gilgeous-Alexander | OKC | PG | -| Giannis Antetokounmpo | MIL | PF | -| Luka Dončić | DAL | PG | -| Ja Morant | MEM | PG | -| Kyrie Irving | DAL | SG | - -## Data Transformation 1 - -### Creating a winner column for each game and calculate the point difference (1st 10 rows) +### Top 10 Highest Scoring Players + +| Player | Team | Pos | PTS | +|:------------------------|:-------|:------|------:| +| Joel Embiid | PHI | C | 34.7 | +| Luka Dončić | DAL | PG | 33.9 | +| Giannis Antetokounmpo | MIL | PF | 30.4 | +| Shai Gilgeous-Alexander | OKC | PG | 30.1 | +| Jalen Brunson | NYK | PG | 28.7 | +| Devin Booker | PHO | PG | 27.1 | +| Kevin Durant | PHO | PF | 27.1 | +| Jayson Tatum | BOS | PF | 26.9 | +| De'Aaron Fox | SAC | PG | 26.6 | +| Donovan Mitchell | CLE | SG | 26.6 | + +## Data Transformation + +### Creating a column to calculate Assist/Turnover Ratio (1st 10 rows) | Player | Team | Pos | AST/TOV | |:------------------------|:-------|:------|----------:| diff --git a/script.py b/script.py index a896bc2..51e4549 100644 --- a/script.py +++ b/script.py @@ -3,22 +3,6 @@ def load_data(spark, local_path="NBA_24_stats.csv"): Load data from a local CSV file into a PySpark DataFrame, clean up column names, and remove duplicate columns. 
""" df = spark.read.csv(local_path, header=True, inferSchema=True) - - # # Clean column names - # for column_name in df.columns: - # new_col_name = re.sub(r"[^a-zA-Z0-9]", "", column_name) - # df = df.withColumnRenamed(column_name, new_col_name) - - # # Remove duplicate columns - # column_set = set() - # duplicate_columns = [ - # col for col in df.columns if col in column_set or column_set.add(col) - # ] - # if duplicate_columns: - # print(f"Duplicate Columns Detected: {duplicate_columns}") - # for col in duplicate_columns: - # df = df.drop(col) - return df @@ -56,10 +40,10 @@ def process_data(spark, df): # Query 1 query1_df = spark.sql( """ - SELECT Player, Team, Pos + SELECT Player, Team, Pos, PTS FROM nba_players WHERE PTS > 25 - ORDER BY Rk DESC + ORDER BY PTS DESC LIMIT 10 """ ) diff --git a/test_lib.py b/test_lib.py index a8933f7..00c06e5 100644 --- a/test_lib.py +++ b/test_lib.py @@ -35,7 +35,7 @@ def test_explore_data(sample_data): output = explore_data(sample_data) assert "first_rows" in output assert "row_count" in output - assert output["row_count"] == 3 # Check sample data row count + assert output["row_count"] == 3 assert "summary_stats" in output