
Commit 5aad3a1

chore: ensure colab sample notebooks are tested (#351)
* chore: ensure colab sample notebooks are tested
* make restore from backup robust to when the backup doesn't exist
* fix path to notebook params scripts
* exclude notebooks that need parameters other than project_id
* add missing dependencies
* notebook testing fixes
* add sleep to avoid some bucket flakiness
* Revert "add sleep to avoid some bucket flakiness"
  This reverts commit dfee838.
* exclude bq_dataframes_llm_code_generation sample
1 parent: 6795ed2

File tree

4 files changed (+168, -25 lines):

* notebooks/getting_started/getting_started_bq_dataframes.ipynb
* noxfile.py
* scripts/notebooks_fill_params.py
* scripts/notebooks_restore_from_backup.py

notebooks/getting_started/getting_started_bq_dataframes.ipynb

Lines changed: 12 additions & 3 deletions
@@ -532,6 +532,10 @@
 },
 "outputs": [],
 "source": [
+"# BigQuery DataFrames can read directly from GCS.\n",
+"fn = 'gs://cloud-samples-data/vertex-ai/bigframe/penguins.csv'\n",
+"\n",
+"# Or from a local file.\n",
 "# fn = 'penguins.csv'"
 ]
 },
@@ -580,7 +584,9 @@
 },
 "outputs": [],
 "source": [
-"df_from_local = bf.read_csv(fn)"
+"# If order is not important, use the \"bigquery\" engine to\n",
+"# allow BigQuery DataFrames to read directly from GCS.\n",
+"df_from_local = bf.read_csv(fn, engine=\"bigquery\")"
 ]
 },
 {
@@ -658,7 +664,10 @@
 },
 "outputs": [],
 "source": [
-"df_from_local.to_gbq(PROJECT_ID + \".\" + DATASET_ID + \".penguins\")"
+"df_from_local.to_gbq(\n",
+"    PROJECT_ID + \".\" + DATASET_ID + \".penguins\",\n",
+"    if_exists=\"replace\",\n",
+")"
 ]
 },
 {
@@ -771,7 +780,7 @@
 },
 "outputs": [],
 "source": [
-"bq_df[\"species\", \"body_mass_g\"].groupby(by=bq_df[\"species\"]).mean(numeric_only=True).head()"
+"bq_df[[\"species\", \"body_mass_g\"]].groupby(by=bq_df[\"species\"]).mean(numeric_only=True).head()"
 ]
 },
 {
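
Taken together, the notebook edits make the sample order-insensitive and idempotent. A minimal sketch of the updated cells, assuming `bf` is the notebook's `bigframes.pandas` import and using hypothetical PROJECT_ID/DATASET_ID values:

import bigframes.pandas as bf

PROJECT_ID = "my-project"  # hypothetical; the notebook fills this via a colab param
DATASET_ID = "my_dataset"  # hypothetical

# BigQuery DataFrames can read directly from GCS.
fn = "gs://cloud-samples-data/vertex-ai/bigframe/penguins.csv"

# The "bigquery" engine lets BigQuery read the file server-side;
# the trade-off is that row order is not preserved.
df = bf.read_csv(fn, engine="bigquery")

# if_exists="replace" makes the write idempotent, so re-running the
# notebook (or the test) does not fail on an existing table.
df.to_gbq(PROJECT_ID + "." + DATASET_ID + ".penguins", if_exists="replace")

# Selecting two columns takes a list (double brackets); the old cell's
# single brackets passed a tuple key instead.
df[["species", "body_mass_g"]].groupby(by=df["species"]).mean(
    numeric_only=True
).head()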

noxfile.py

Lines changed: 56 additions & 22 deletions
@@ -657,9 +657,23 @@ def system_prerelease(session: nox.sessions.Session):


 @nox.session(python=SYSTEM_TEST_PYTHON_VERSIONS)
-def notebook(session):
+def notebook(session: nox.Session):
+    GOOGLE_CLOUD_PROJECT = os.getenv("GOOGLE_CLOUD_PROJECT")
+    if not GOOGLE_CLOUD_PROJECT:
+        session.error(
+            "Set GOOGLE_CLOUD_PROJECT environment variable to run notebook session."
+        )
+
     session.install("-e", ".[all]")
-    session.install("pytest", "pytest-xdist", "pytest-retry", "nbmake")
+    session.install(
+        "pytest",
+        "pytest-xdist",
+        "pytest-retry",
+        "nbmake",
+        "google-cloud-aiplatform",
+        "matplotlib",
+        "seaborn",
+    )

     notebooks_list = list(Path("notebooks/").glob("*/*.ipynb"))

@@ -669,19 +683,22 @@ def notebook(session):
         # These notebooks contain special colab `param {type:"string"}`
         # comments, which make it easy for customers to fill in their
         # own information.
+        #
+        # With the notebooks_fill_params.py script, we are able to find and
+        # replace the PROJECT_ID parameter, but not the others.
+        #
         # TODO(ashleyxu): Test these notebooks by replacing parameters with
         # appropriate values and omitting cleanup logic that may break
         # our test infrastructure.
-        "notebooks/getting_started/getting_started_bq_dataframes.ipynb",
-        "notebooks/getting_started/ml_fundamentals_bq_dataframes.ipynb",
-        "notebooks/generative_ai/bq_dataframes_llm_code_generation.ipynb",
-        "notebooks/generative_ai/bq_dataframes_llm_kmeans.ipynb",
-        "notebooks/regression/bq_dataframes_ml_linear_regression.ipynb",
-        "notebooks/generative_ai/bq_dataframes_ml_drug_name_generation.ipynb",
-        "notebooks/vertex_sdk/sdk2_bigframes_pytorch.ipynb",
-        "notebooks/vertex_sdk/sdk2_bigframes_sklearn.ipynb",
-        "notebooks/vertex_sdk/sdk2_bigframes_tensorflow.ipynb",
-        "notebooks/visualization/bq_dataframes_covid_line_graphs.ipynb",
+        "notebooks/getting_started/ml_fundamentals_bq_dataframes.ipynb",  # Needs DATASET.
+        "notebooks/regression/bq_dataframes_ml_linear_regression.ipynb",  # Needs DATASET_ID.
+        "notebooks/generative_ai/bq_dataframes_ml_drug_name_generation.ipynb",  # Needs CONNECTION.
+        # TODO(swast): investigate why we get 404 errors, even though
+        # bq_dataframes_llm_code_generation creates a bucket in the sample.
+        "notebooks/generative_ai/bq_dataframes_llm_code_generation.ipynb",  # Needs BUCKET_URI.
+        "notebooks/vertex_sdk/sdk2_bigframes_pytorch.ipynb",  # Needs BUCKET_URI.
+        "notebooks/vertex_sdk/sdk2_bigframes_sklearn.ipynb",  # Needs BUCKET_URI.
+        "notebooks/vertex_sdk/sdk2_bigframes_tensorflow.ipynb",  # Needs BUCKET_URI.
         # The experimental notebooks imagine features that don't yet
         # exist or only exist as temporary prototypes.
         "notebooks/experimental/longer_ml_demo.ipynb",
@@ -709,9 +726,9 @@ def notebook(session):
         for nb, regions in notebooks_reg.items()
     }

-    # For some reason nbmake exits silently with "no tests ran" message if
+    # The pytest --nbmake exits silently with "no tests ran" message if
     # one of the notebook paths supplied does not exist. Let's make sure that
-    # each path exists
+    # each path exists.
     for nb in notebooks + list(notebooks_reg):
         assert os.path.exists(nb), nb

@@ -723,16 +740,33 @@ def notebook(session):
     pytest_command = [
         "py.test",
         "--nbmake",
-        "--nbmake-timeout=600",
+        "--nbmake-timeout=900",  # 15 minutes
     ]

-    # Run self-contained notebooks in single session.run
-    # achieve parallelization via -n
-    session.run(
-        *pytest_command,
-        "-nauto",
-        *notebooks,
-    )
+    try:
+        # Populate notebook parameters and make a backup so that the notebooks
+        # are runnable.
+        session.run(
+            "python",
+            CURRENT_DIRECTORY / "scripts" / "notebooks_fill_params.py",
+            *notebooks,
+        )
+
+        # Run self-contained notebooks in single session.run
+        # achieve parallelization via -n
+        session.run(
+            *pytest_command,
+            "-nauto",
+            *notebooks,
+        )
+    finally:
+        # Prevent our notebook changes from getting checked in to git
+        # accidentally.
+        session.run(
+            "python",
+            CURRENT_DIRECTORY / "scripts" / "notebooks_restore_from_backup.py",
+            *notebooks,
+        )

     # Run regionalized notebooks in parallel session.run's, since each notebook
     # takes a different region via env param.
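
The net effect of the noxfile changes is a fill-run-restore pipeline: parameters are injected before pytest runs, and the originals are restored afterwards even when the tests fail. A standalone sketch of that pattern, with subprocess standing in for nox's session.run and a hypothetical notebook path:

import pathlib
import subprocess
import sys

SCRIPTS = pathlib.Path(__file__).parent / "scripts"
notebooks = ["notebooks/getting_started/getting_started_bq_dataframes.ipynb"]

try:
    # Fill PROJECT_ID params, leaving a *.backup copy next to each notebook.
    subprocess.run(
        [sys.executable, SCRIPTS / "notebooks_fill_params.py", *notebooks],
        check=True,
    )
    # nbmake turns each notebook into a pytest test; -nauto parallelizes
    # across notebooks via pytest-xdist.
    subprocess.run(
        ["py.test", "--nbmake", "--nbmake-timeout=900", "-nauto", *notebooks],
        check=True,
    )
finally:
    # Always restore, so filled-in parameters never land in git.
    subprocess.run(
        [sys.executable, SCRIPTS / "notebooks_restore_from_backup.py", *notebooks],
        check=True,
    )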

scripts/notebooks_fill_params.py

Lines changed: 65 additions & 0 deletions
@@ -0,0 +1,65 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+import os
+import re
+import shutil
+import sys
+
+GOOGLE_CLOUD_PROJECT = os.environ["GOOGLE_CLOUD_PROJECT"]
+
+
+def make_backup(notebook_path: str):
+    shutil.copy(
+        notebook_path,
+        f"{notebook_path}.backup",
+    )
+
+
+def replace_project(line):
+    """
+    Notebooks contain special colab `param {type:"string"}`
+    comments, which make it easy for customers to fill in their
+    own information.
+    """
+    # Make sure we're robust to whitespace differences.
+    cleaned = re.sub(r"\s", "", line)
+    if cleaned == 'PROJECT_ID=""#@param{type:"string"}':
+        return f'PROJECT_ID = "{GOOGLE_CLOUD_PROJECT}" # @param {{type:"string"}}\n'
+    else:
+        return line
+
+
+def replace_params(notebook_path: str):
+    with open(notebook_path, "r", encoding="utf-8") as notebook_file:
+        notebook_json = json.load(notebook_file)
+
+    for cell in notebook_json["cells"]:
+        lines = cell.get("source", [])
+        new_lines = [replace_project(line) for line in lines]
+        cell["source"] = new_lines
+
+    with open(notebook_path, "w", encoding="utf-8") as notebook_file:
+        json.dump(notebook_json, notebook_file, indent=2, ensure_ascii=False)
+
+
+def main(notebook_paths):
+    for notebook_path in notebook_paths:
+        make_backup(notebook_path)
+        replace_params(notebook_path)
+
+
+if __name__ == "__main__":
+    main(sys.argv[1:])
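
Because the comparison strips all whitespace first, any spacing of the colab param line is recognized. A quick illustration of the matching rule (the project value is hypothetical):

import re

line = 'PROJECT_ID = ""  # @param {type:"string"}\n'
cleaned = re.sub(r"\s", "", line)
# All whitespace variants collapse to the same canonical key.
assert cleaned == 'PROJECT_ID=""#@param{type:"string"}'
# With GOOGLE_CLOUD_PROJECT="my-project", replace_project() rewrites the
# cell line to:
#     PROJECT_ID = "my-project" # @param {type:"string"}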

scripts/notebooks_restore_from_backup.py

Lines changed: 35 additions & 0 deletions
@@ -0,0 +1,35 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pathlib
+import shutil
+import sys
+
+
+def restore_from_backup(notebook_path):
+    backup_path = pathlib.Path(f"{notebook_path}.backup")
+    if backup_path.exists():
+        shutil.move(
+            backup_path,
+            notebook_path,
+        )
+
+
+def main(notebook_paths):
+    for notebook_path in notebook_paths:
+        restore_from_backup(notebook_path)
+
+
+if __name__ == "__main__":
+    main(sys.argv[1:])
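
The exists() guard is what the commit message calls being robust to a missing backup: restoring is a no-op when notebooks_fill_params.py never got as far as writing one. A hypothetical smoke check from the repo root:

import subprocess
import sys

# With no *.backup files on disk, the script should still exit 0.
result = subprocess.run(
    [
        sys.executable,
        "scripts/notebooks_restore_from_backup.py",
        "notebooks/getting_started/getting_started_bq_dataframes.ipynb",
    ],
)
assert result.returncode == 0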
