
Commit 02e3486

332 improve spark integration (#333)
* Refactor demo notebook structure by converting a code cell to markdown for improved documentation. Removed a redundant markdown cell and updated execution metadata for clarity. This enhances the user experience by providing better context and organization within the notebook.

* Refactor Spark session creation in notebook and JavaScript model
  - Renamed `create_spark_dev` to `create_spark` in `startup.py` to simplify the function name.
  - Made `create_spark` available in IPython's global namespace for easier access.
  - Removed the default Spark instance creation to allow for manual session management.
  - Updated `SparkModel.js` to use the new `create_spark` function for initializing Spark sessions, enhancing integration with the backend API.

* Enhance Spark session creation and configuration handling
  - Updated the `create_spark` function in `startup.py` to accept an optional `notebook_path` parameter, allowing for dynamic configuration retrieval based on the notebook context.
  - Improved error handling in `create_spark` to log errors and fall back to a default Spark configuration when the API request fails.
  - Modified `SparkModel.js` to pass the `notebookPath` to the `create_spark` function, ensuring proper session initialization.
  - Cleaned up the demo notebook by removing outdated code cells, enhancing clarity and usability.

* Refactor Spark configuration keys in create_spark function
  - Updated the configuration keys in the `create_spark` function within `startup.py` to use a more concise naming convention, changing 'spark.executor.memory', 'spark.executor.cores', and 'spark.executor.instances' to 'executor_memory', 'executor_cores', and 'executor_instances' respectively.
  - Adjusted the corresponding references in the Spark session creation logic to align with the new key names, improving consistency and readability of the configuration handling.

* Refactor Spark configuration keys in create_spark function
  - Updated the configuration keys in the `create_spark` function within `startup.py` back to the correct Spark naming convention, changing 'executor_memory', 'executor_cores', and 'executor_instances' to 'spark.executor.memory', 'spark.executor.cores', and 'spark.executor.instances'.
  - Adjusted the corresponding references in the Spark session creation logic for improved consistency and clarity.
1 parent 0cf6d19 commit 02e3486
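
With this change the kernel no longer builds a default Spark session at startup; instead, `create_spark` is injected into IPython's user namespace and a session is created on demand from a notebook cell. A minimal usage sketch, assuming a kernel started with the updated `startup.py`; the notebook path argument below is illustrative, not something this commit pins down:

# Fetch the per-notebook config from the server and build the session
spark = create_spark("user_0@gmail.com/demo.ipynb")

# Or omit the path to use the global config endpoint (falling back to the
# hard-coded defaults if the request fails)
spark = create_spark()
spark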

File tree

- docker/notebook/startup.py
- examples/user_0@gmail.com/demo.ipynb
- webapp/src/models/SparkModel.js

3 files changed: +23 -181 lines changed


docker/notebook/startup.py

Lines changed: 20 additions & 8 deletions
@@ -106,13 +106,21 @@ def _repr_html_(self):
         </div>
         """
 
-def create_spark_dev():
+def create_spark(notebook_path=None):
     logger.info("Creating Spark session")
     try:
-        config_json = requests.get("http://server:5002/spark_app/config").json()
+        if notebook_path:
+            config_json = requests.get(f"http://server:5002/spark_app/{notebook_path}/config").json()
+        else:
+            config_json = requests.get("http://server:5002/spark_app/config").json()
     except Exception as e:
-        config_json = 'Error loading config: ' + str(e)
-
+        logger.error(f"Error loading config: {str(e)}. Using defaults.")
+        config_json = {
+            'spark.executor.memory': '1g',
+            'spark.executor.cores': 1,
+            'spark.executor.instances': 1
+        }
+
     spark = PawMarkSparkSession(
         config_json,
         SparkSession.builder \
@@ -125,12 +133,16 @@ def create_spark_dev():
             .config("spark.eventLog.dir", "/opt/data/spark-events") \
             .config("spark.history.fs.logDirectory", "/opt/data/spark-events") \
             .config("spark.sql.warehouse.dir", "/opt/data/spark-warehouse") \
-            .config("executor.memory", config_json['executor.memory']) \
-            .config("executor.cores", config_json['executor.cores']) \
+            .config("spark.executor.memory", config_json['spark.executor.memory']) \
+            .config("spark.executor.cores", config_json['spark.executor.cores']) \
             .config("spark.executor.instances", config_json['spark.executor.instances']) \
             .getOrCreate()
-    )
+    )
 
     return spark
 
-spark = create_spark_dev()
+# Make create_spark_dev available in IPython's global namespace
+ip = get_ipython()
+if ip is not None:
+    # Add to global namespace
+    ip.user_global_ns['create_spark'] = create_spark
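
The fallback branch above shows the only keys `create_spark` actually reads from the config endpoint. A sketch of the assumed payload shape, using the hard-coded defaults from the except branch as example values; any additional fields the server may return are outside this diff:

# Assumed shape of the /spark_app/.../config response consumed by create_spark
config_json = {
    'spark.executor.memory': '1g',    # passed to spark.executor.memory
    'spark.executor.cores': 1,        # passed to spark.executor.cores
    'spark.executor.instances': 1     # passed to spark.executor.instances
}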

examples/user_0@gmail.com/demo.ipynb

Lines changed: 0 additions & 150 deletions
@@ -1,70 +1,5 @@
 {
  "cells": [
-  {
-   "cell_type": "code",
-   "isExecuted": true,
-   "lastExecutionResult": "success",
-   "lastExecutionTime": "2024-12-10 03:27:50",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "\n",
-       " <div>\n",
-       " <p><b>SparkSession - in-memory</b></p>\n",
-       " \n",
-       " <div>\n",
-       " <p><b>SparkContext</b></p>\n",
-       "\n",
-       " <p><a href=\"http://8e207d700c27:4040\">Spark UI</a></p>\n",
-       "\n",
-       " <dl>\n",
-       " <dt>Version</dt>\n",
-       " <dd><code>v3.5.0</code></dd>\n",
-       " <dt>Master</dt>\n",
-       " <dd><code>spark://spark-master:7077</code></dd>\n",
-       " <dt>AppName</dt>\n",
-       " <dd><code>spark-1733801092185</code></dd>\n",
-       " </dl>\n",
-       " </div>\n",
-       " \n",
-       " </div>\n",
-       " "
-      ],
-      "text/plain": [
-       "<pyspark.sql.session.SparkSession at 0x7fffe2dc9ed0>"
-      ]
-     },
-     "execution_count": 7,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "\n",
-    "from pyspark.sql import SparkSession\n",
-    "\n",
-    "spark = SparkSession.builder\\\n",
-    "    .appName(\"spark-1733801270245\")\\\n",
-    "    .master(\"spark://spark-master:7077\")\\\n",
-    "    .config(\"spark.jars.packages\", \"io.delta:delta-spark_2.12:3.0.0\")\\\n",
-    "    .config(\"spark.sql.extensions\", \"io.delta.sql.DeltaSparkSessionExtension\")\\\n",
-    "    .config(\"spark.sql.catalog.spark_catalog\", \"org.apache.spark.sql.delta.catalog.DeltaCatalog\")\\\n",
-    "    .config(\"spark.eventLog.enabled\", \"true\")\\\n",
-    "    .config(\"spark.eventLog.dir\", \"/opt/data/spark-events\")\\\n",
-    "    .config(\"spark.history.fs.logDirectory\", \"/opt/data/spark-events\")\\\n",
-    "    .config(\"spark.sql.warehouse.dir\", \"/opt/data/spark-warehouse\")\\\n",
-    "    .config(\"spark.executor.memory\", \"1g\")\\\n",
-    "    .config(\"spark.executor.cores\", 1)\\\n",
-    "    .config(\"spark.executor.instances\", 1)\\\n",
-    "    .config(\"spark.driver.memory\", \"1g\")\\\n",
-    "    .config(\"spark.driver.cores\", 1)\\\n",
-    "    .getOrCreate()\n",
-    "\n",
-    "spark\n"
-   ]
-  },
   {
    "cell_type": "markdown",
    "isExecuted": true,
@@ -75,91 +10,6 @@
     "- This is just a demo notebook\n",
     "- For testing only"
    ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "isExecuted": false,
-   "lastExecutionResult": "success",
-   "lastExecutionTime": "2024-08-04 15:29:17",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "\n",
-       " <div style=\"border: 1px solid #e8e8e8; padding: 10px;\">\n",
-       " <h3>Spark Session Information</h3>\n",
-       " <p><strong>Application ID:</strong> app-20240804152430-0000</p>\n",
-       " <p><strong>Spark UI:</strong> <a href=\"http://localhost:18080/history/app-20240804152430-0000\">http://localhost:18080/history/app-20240804152430-0000</a></p>\n",
-       " </div>\n",
-       " "
-      ],
-      "text/plain": [
-       "Custom Spark Session (App ID: app-20240804152430-0000) - UI: http://66eef2d0ade3:4040"
-      ]
-     },
-     "execution_count": 4,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "# SparkSession is already defined in `spark` variable\n",
-    "spark"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "isExecuted": false,
-   "metadata": {},
-   "outputs": [
-    {
-     "ename": "NameError",
-     "evalue": "name 'a' is not defined",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
-      "Cell \u001b[0;32mIn[1], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[43ma\u001b[49m \u001b[38;5;241m+\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m2233666777888\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n",
-      "\u001b[0;31mNameError\u001b[0m: name 'a' is not defined"
-     ]
-    }
-   ],
-   "source": [
-    "print(a + \"2233666777888\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "isExecuted": false,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "66\n",
-      "77\n"
-     ]
-    }
-   ],
-   "source": [
-    "print(66)\n",
-    "print(77)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "isExecuted": false,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "spark.stop()"
-   ]
   }
  ],
  "metadata": {

webapp/src/models/SparkModel.js

Lines changed: 3 additions & 23 deletions
@@ -60,29 +60,9 @@ class SparkModel {
     // Generate a unique spark app ID
     const sparkAppId = `spark-${Date.now()}`;
 
-    // Create a cell with Spark initialization code that uses the config
-    const sparkInitCode = `
-from pyspark.sql import SparkSession
-
-spark = SparkSession.builder\\
-    .appName("${sparkAppId}")\\
-    .master("spark://spark-master:7077")\\
-    .config("spark.jars.packages", "io.delta:delta-spark_2.12:3.0.0")\\
-    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")\\
-    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")\\
-    .config("spark.eventLog.enabled", "true")\\
-    .config("spark.eventLog.dir", "/opt/data/spark-events")\\
-    .config("spark.history.fs.logDirectory", "/opt/data/spark-events")\\
-    .config("spark.sql.warehouse.dir", "/opt/data/spark-warehouse")\\
-    .config("spark.executor.memory", "${sparkConfig['spark.executor.memory']}")\\
-    .config("spark.executor.cores", ${sparkConfig['spark.executor.cores']})\\
-    .config("spark.executor.instances", ${sparkConfig['spark.executor.instances']})\\
-    .config("spark.driver.memory", "${sparkConfig['spark.driver.memory']}")\\
-    .config("spark.driver.cores", ${sparkConfig['spark.driver.cores']})\\
-    .getOrCreate()
-
-spark
-`;
+    // Create a cell with Spark initialization code that uses the existing spark instance
+    const sparkInitCode = `spark = create_spark("${notebookPath}")
+spark`;
 
     // Create the Spark session with this config
     const response = await fetch(`${config.serverBaseUrl}/spark_app/${sparkAppId}`, {
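
After template interpolation, the generated initialization cell reduces to two lines of Python handed to the kernel, relying on the `create_spark` helper that `startup.py` now exposes. For an illustrative `notebookPath` of `user_0@gmail.com/demo.ipynb` the cell would read:

# Generated sparkInitCode cell (the notebookPath shown here is illustrative)
spark = create_spark("user_0@gmail.com/demo.ipynb")
spark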
