From a36e20902317683c023a445a3e2a7cfde283e033 Mon Sep 17 00:00:00 2001
From: Fan Xu <fanxu2017@gmail.com>
Date: Fri, 13 Dec 2024 19:55:10 -0800
Subject: [PATCH] final changes

---
 README.md                          |  54 ++++++++++++++++++++++++++++-
 __pycache__/script.cpython-312.pyc | Bin 2704 -> 2564 bytes
 lib.py                             |   8 ++---
 output.md                          |  36 +++++++++----------
 script.py                          |  20 ++---------
 test_lib.py                        |   2 +-
 6 files changed, 77 insertions(+), 43 deletions(-)

diff --git a/README.md b/README.md
index b204246..179a24c 100644
--- a/README.md
+++ b/README.md
@@ -1 +1,53 @@
-# Pyspark
\ No newline at end of file
+# PySpark
+
+[![CI](https://github.com/nogibjj/fan_xu_pyspark/actions/workflows/cicd.yml/badge.svg)](https://github.com/nogibjj/fan_xu_pyspark/actions/workflows/cicd.yml)
+
+This project demonstrates PySpark functionality on a dataset of NBA player statistics. The dataset is queried with Spark SQL, a transformation is applied, and the results are written to a Markdown file (`output.md`).
+
+## Requirements
+
+- Use PySpark to perform data processing on a large dataset
+
+- Include at least one Spark SQL query and one data transformation
+
+## Project Structure
+
+```
+📦 fan_xu_pyspark
+├─ .github
+│  └─ workflows
+│     └─ cicd.yml
+├─ Makefile
+├─ NBA_24_stats.csv
+├─ README.md
+├─ __pycache__
+│  └─ script.cpython-312.pyc
+├─ gitignore
+├─ lib.py
+├─ output.md
+├─ requirements.txt
+├─ script.py
+└─ test_lib.py
+```
+©generated by [Project Tree Generator](https://woochanleee.github.io/project-tree-generator)
+
+# Highlights
+
+- EDA
+
+The first 3 rows are displayed, along with summary statistics for the age, assists, and steals columns.
+
+- Query
+
+A Spark SQL query returns the 10 highest-scoring players (ordered by `PTS`, restricted to players with `PTS` above 25).
+
+- Transformation
+
+A column (`AST/TOV`) is added to show each player's assist-to-turnover ratio.
+
+# Installation
+
+Requirements:
+- Python
+- PySpark
+- Java
\ No newline at end of file
diff --git a/__pycache__/script.cpython-312.pyc b/__pycache__/script.cpython-312.pyc
index 56c17d4b3e0dc6f404de18d3c5ba23f49701cf40..f8df64ecccd31ce812ecc4a2f30b18db7712b21c 100644
GIT binary patch
delta 476
zcmYLFO-sW-5Y41XQ>&3uEm#qaqK9D7wiHni!Ov1q5!8bsme@_T&^C3F`f>19JgYf)
z5$_(gH-CbjyyjO(dk{Q`vr%zj-_E=@v-5VZVvSz)RaL_Rt>>@3!<lO}J~c?mt-yi|
z8n7!cu_5dN3lNG-A`G0%y7C!WUzpFQGkM!F9DAZ*pFG|jfZRo`&<u!cqT<N~t6p{7
zZApW$dm@eYb&}|HnP;`Kg=U@^j)5aY#hXvk3&5p&5ge8i+Q!zVdmmgFrc+y5-dqf8
zziTzOmfO-&NuOB3$c-|djZ;7N6X+6#c+6jdLj+#Oaoxo8RGt-WiE@4k{y3<iRWNP)
z$AM1Tq>IWvsLeNJ3?SxyghqBEMC-B_J}o&%i&nW_sp0@p*vH|2s^|~&LTET8i;ogX
z{~Ofppi#Ap7FM`g#WZPI7dwhfrD$CT<kjSy|C;|5$ldsL4+8Qc-|=T?wqmh**_^`+
faX{PV7TyB@&=zCR-AeXAw3UcKw4DsXwBPy-jK*xs

delta 587
zcmYk2J#W)s5XYbM%M&|ILR%6+C0g359W1J-L`5nQ;-yHiKnFTuAtK9<L!<g7y~n16
zr5*ACKprYS0&FNBfPn?U#A<|?*-*P6Chl<uILUwA{e8OMb2o0jYSq8g>s3Z^^!v%P
z<Ja}M!+TU5Gr<IeECLrI&DA2^)j@e$Xhf!Kmbo5smkUc6V{j|N95dGvoYWQ-(&|We
z?4N3~qZh`(ZV(rypJZ`b=rW5OC~7-H5BGKgDF;cs?Q5!);(hQK%xOya<Rd)POf)FZ
zFBpr^$Ty9Zi|42;YQc+Pm|)NoUh3g0SqqcY_zZBA-{MPoz_+wVd-*#)56%1&zjA+`
zI&gu+F|elchLUBH!rspU99`{+zEq?|3Or*BG4bhjXN)!aw~==%Phn>Arm_k!mw&L9
zZ#T(y%%XZQNO$ihVHU-BnJm1d<R4WlfrW)$I!F{sZL2`}lHD|~QSl(}+Wo^B${ct(
zy}fm(*V*XF)JtXCm(QK4qBR^zb+ESF58M4L_GwZqQCA@Ka5PDZssFl=D@Vfs)j7*6
zp;`kK?=3?GD_8wv-G~yAg~9ck*d-I_lj^9ybO89lI$v4mJ6rk2>{H%?#>x75m_1o*
M!R+Zehs|=%U(0EVtN;K2

diff --git a/lib.py b/lib.py
index 8f3312c..358ac4c 100644
--- a/lib.py
+++ b/lib.py
@@ -5,8 +5,6 @@
 def save_to_markdown(filename, content):
     """
     Save the given content to a markdown file.
-    :param filename: Name of the markdown file.
-    :param content: Content to write into the file.
     """
     with open(filename, "w") as f:
         f.write(content)
@@ -39,14 +37,14 @@ def main():
     process_output = process_data(spark, df)
     markdown_content += "## SQL Queries\n\n"
     markdown_content += (
-        "### Query 1: Top 10 High-Scoring Games\n\n" + process_output["query"] + "\n\n"
+        "### Top 10 Highest Scoring Players\n\n" + process_output["query"] + "\n\n"
     )
 
     # Transformation 1
     winner_output = transformation(df)
-    markdown_content += "## Data Transformation 1\n\n"
+    markdown_content += "## Data Transformation\n\n"
     markdown_content += (
-        "### Creating a winner column for each game and calculate the point difference (1st 10 rows)\n\n"
+        "### Creating a column to calculate Assist/Turnover Ratio (1st 10 rows)\n\n"
         + winner_output
         + "\n\n"
     )
diff --git a/output.md b/output.md
index 4b525f8..5cbd207 100644
--- a/output.md
+++ b/output.md
@@ -28,24 +28,24 @@ Data loaded successfully.
 
 ## SQL Queries
 
-### Query 1: Top 10 High-Scoring Games
-
-| Player                  | Team   | Pos   |
-|:------------------------|:-------|:------|
-| De'Aaron Fox            | SAC    | PG    |
-| Jayson Tatum            | BOS    | PF    |
-| Kevin Durant            | PHO    | PF    |
-| Devin Booker            | PHO    | PG    |
-| Jalen Brunson           | NYK    | PG    |
-| Shai Gilgeous-Alexander | OKC    | PG    |
-| Giannis Antetokounmpo   | MIL    | PF    |
-| Luka Dončić             | DAL    | PG    |
-| Ja Morant               | MEM    | PG    |
-| Kyrie Irving            | DAL    | SG    |
-
-## Data Transformation 1
-
-### Creating a winner column for each game and calculate the point difference (1st 10 rows)
+### Top 10 Highest Scoring Players
+
+| Player                  | Team   | Pos   |   PTS |
+|:------------------------|:-------|:------|------:|
+| Joel Embiid             | PHI    | C     |  34.7 |
+| Luka Dončić             | DAL    | PG    |  33.9 |
+| Giannis Antetokounmpo   | MIL    | PF    |  30.4 |
+| Shai Gilgeous-Alexander | OKC    | PG    |  30.1 |
+| Jalen Brunson           | NYK    | PG    |  28.7 |
+| Devin Booker            | PHO    | PG    |  27.1 |
+| Kevin Durant            | PHO    | PF    |  27.1 |
+| Jayson Tatum            | BOS    | PF    |  26.9 |
+| De'Aaron Fox            | SAC    | PG    |  26.6 |
+| Donovan Mitchell        | CLE    | SG    |  26.6 |
+
+## Data Transformation
+
+### Creating a column to calculate Assist/Turnover Ratio (1st 10 rows)
 
 | Player                  | Team   | Pos   |   AST/TOV |
 |:------------------------|:-------|:------|----------:|
diff --git a/script.py b/script.py
index a896bc2..51e4549 100644
--- a/script.py
+++ b/script.py
@@ -3,22 +3,6 @@ def load_data(spark, local_path="NBA_24_stats.csv"):
     Load data from a local CSV file into a PySpark DataFrame, clean up column names, and remove duplicate columns.
     """
     df = spark.read.csv(local_path, header=True, inferSchema=True)
-
-    # # Clean column names
-    # for column_name in df.columns:
-    #     new_col_name = re.sub(r"[^a-zA-Z0-9]", "", column_name)
-    #     df = df.withColumnRenamed(column_name, new_col_name)
-
-    # # Remove duplicate columns
-    # column_set = set()
-    # duplicate_columns = [
-    #     col for col in df.columns if col in column_set or column_set.add(col)
-    # ]
-    # if duplicate_columns:
-    #     print(f"Duplicate Columns Detected: {duplicate_columns}")
-    #     for col in duplicate_columns:
-    #         df = df.drop(col)
-
     return df
 
 
@@ -56,10 +40,10 @@ def process_data(spark, df):
     # Query 1
     query1_df = spark.sql(
         """
-        SELECT Player, Team, Pos
+        SELECT Player, Team, Pos, PTS
         FROM nba_players
         WHERE PTS > 25
-        ORDER BY Rk DESC
+        ORDER BY PTS DESC
         LIMIT 10
     """
     )
diff --git a/test_lib.py b/test_lib.py
index a8933f7..00c06e5 100644
--- a/test_lib.py
+++ b/test_lib.py
@@ -35,7 +35,7 @@ def test_explore_data(sample_data):
     output = explore_data(sample_data)
     assert "first_rows" in output
     assert "row_count" in output
-    assert output["row_count"] == 3  # Check sample data row count
+    assert output["row_count"] == 3
     assert "summary_stats" in output