generated from nogibjj/fanxu_template
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscript.py
61 lines (48 loc) · 1.53 KB
/
script.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
def load_data(spark, local_path="NBA_24_stats.csv"):
    """
    Read a local CSV file into a PySpark DataFrame.

    The file's first line is treated as the header and column types are
    inferred by the reader. The DataFrame is returned exactly as read: no
    column renaming or duplicate-column removal happens in this function.
    """
    reader = spark.read
    return reader.csv(local_path, header=True, inferSchema=True)
def explore_data(df):
    """
    Collect a quick overview of the DataFrame.

    Returns a dict with three entries, in this order:
      - "first_rows": the first 3 rows rendered as a markdown table
      - "row_count": the total number of rows
      - "summary_stats": describe() output for the Age, AST and STL columns
        rendered as a markdown table, or an "Error: ..." string if that
        step raises
    """
    preview_md = df.limit(3).toPandas().to_markdown(index=False)
    total_rows = df.count()

    # Best effort: a failure while summarising is reported in the output
    # rather than aborting the whole exploration.
    try:
        described = df.select("Age", "AST", "STL").describe()
        stats_md = described.toPandas().to_markdown(index=False)
    except Exception as err:
        stats_md = f"Error: {err}"

    return {
        "first_rows": preview_md,
        "row_count": total_rows,
        "summary_stats": stats_md,
    }
def process_data(spark, df):
    """
    Run a SQL query against the DataFrame and return the result as markdown.

    The DataFrame is registered as the temporary view "nba_players". The
    query selects Player, Team, Pos and PTS for rows with PTS above 25,
    ordered by PTS descending and capped at 10 rows. The resulting markdown
    table is returned under the "query" key of the output dict.
    """
    # Make the DataFrame addressable by name from Spark SQL
    df.createOrReplaceTempView("nba_players")

    top_scorers_sql = """
        SELECT Player, Team, Pos, PTS
        FROM nba_players
        WHERE PTS > 25
        ORDER BY PTS DESC
        LIMIT 10
    """
    top_scorers = spark.sql(top_scorers_sql)

    return {"query": top_scorers.toPandas().to_markdown(index=False)}
def transformation(df):
    """
    Compute an assist-to-turnover ratio and return a markdown preview.

    Adds an "AST/TOV" column (AST divided by TOV), keeps the Player, Team,
    Pos and AST/TOV columns, limits the result to 10 rows and returns it
    rendered as a markdown table.

    NOTE(review): the result for rows where TOV is 0 presumably depends on
    the Spark session's ANSI setting (NULL vs. a runtime error) - confirm
    against the deployed Spark version.
    """
    ratio = df["AST"] / df["TOV"]
    with_ratio = df.withColumn("AST/TOV", ratio)
    preview = with_ratio.select("Player", "Team", "Pos", "AST/TOV").limit(10)
    return preview.toPandas().to_markdown(index=False)