
Commit 5aad3a1

chore: ensure colab sample notebooks are tested (#351)
* chore: ensure colab sample notebooks are tested
* make restore from backup robust to when the backup doesn't exist
* fix path to notebook params scripts
* exclude notebooks that need parameters other than project_id
* add missing dependencies
* notebook testing fixes
* add sleep to avoid some bucket flakiness
* Revert "add sleep to avoid some bucket flakiness"
  This reverts commit dfee838.
* exclude bq_dataframes_llm_code_generation sample
1 parent: 6795ed2

File tree

4 files changed (+168, -25 lines):

* notebooks/getting_started/getting_started_bq_dataframes.ipynb
* noxfile.py
* scripts/notebooks_fill_params.py
* scripts/notebooks_restore_from_backup.py

notebooks/getting_started/getting_started_bq_dataframes.ipynb

Lines changed: 12 additions & 3 deletions
@@ -532,6 +532,10 @@
 },
 "outputs": [],
 "source": [
+"# BigQuery DataFrames can read directly from GCS.\n",
+"fn = 'gs://cloud-samples-data/vertex-ai/bigframe/penguins.csv'\n",
+"\n",
+"# Or from a local file.\n",
 "# fn = 'penguins.csv'"
 ]
 },
@@ -580,7 +584,9 @@
 },
 "outputs": [],
 "source": [
-"df_from_local = bf.read_csv(fn)"
+"# If order is not important, use the \"bigquery\" engine to\n",
+"# allow BigQuery DataFrames to read directly from GCS.\n",
+"df_from_local = bf.read_csv(fn, engine=\"bigquery\")"
 ]
 },
 {
@@ -658,7 +664,10 @@
 },
 "outputs": [],
 "source": [
-"df_from_local.to_gbq(PROJECT_ID + \".\" + DATASET_ID + \".penguins\")"
+"df_from_local.to_gbq(\n",
+"    PROJECT_ID + \".\" + DATASET_ID + \".penguins\",\n",
+"    if_exists=\"replace\",\n",
+")"
 ]
 },
 {
@@ -771,7 +780,7 @@
 },
 "outputs": [],
 "source": [
-"bq_df[\"species\", \"body_mass_g\"].groupby(by=bq_df[\"species\"]).mean(numeric_only=True).head()"
+"bq_df[[\"species\", \"body_mass_g\"]].groupby(by=bq_df[\"species\"]).mean(numeric_only=True).head()"
 ]
 },
 {
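
Taken together, the notebook edits make the sample order-insensitive and idempotent. A minimal sketch of the updated cells, assuming `bf` is the notebook's `bigframes.pandas` import and using hypothetical PROJECT_ID/DATASET_ID values:

import bigframes.pandas as bf

PROJECT_ID = "my-project"  # hypothetical; the notebook fills this via a colab param
DATASET_ID = "my_dataset"  # hypothetical

# BigQuery DataFrames can read directly from GCS.
fn = "gs://cloud-samples-data/vertex-ai/bigframe/penguins.csv"

# The "bigquery" engine lets BigQuery read the file server-side;
# the trade-off is that row order is not preserved.
df = bf.read_csv(fn, engine="bigquery")

# if_exists="replace" makes the write idempotent, so re-running the
# notebook (or the test) does not fail on an existing table.
df.to_gbq(PROJECT_ID + "." + DATASET_ID + ".penguins", if_exists="replace")

# Selecting two columns takes a list (double brackets); the old cell's
# single brackets passed a tuple key instead.
df[["species", "body_mass_g"]].groupby(by=df["species"]).mean(
    numeric_only=True
).head()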

noxfile.py

Lines changed: 56 additions & 22 deletions
@@ -657,9 +657,23 @@ def system_prerelease(session: nox.sessions.Session):


 @nox.session(python=SYSTEM_TEST_PYTHON_VERSIONS)
-def notebook(session):
+def notebook(session: nox.Session):
+    GOOGLE_CLOUD_PROJECT = os.getenv("GOOGLE_CLOUD_PROJECT")
+    if not GOOGLE_CLOUD_PROJECT:
+        session.error(
+            "Set GOOGLE_CLOUD_PROJECT environment variable to run notebook session."
+        )
+
     session.install("-e", ".[all]")
-    session.install("pytest", "pytest-xdist", "pytest-retry", "nbmake")
+    session.install(
+        "pytest",
+        "pytest-xdist",
+        "pytest-retry",
+        "nbmake",
+        "google-cloud-aiplatform",
+        "matplotlib",
+        "seaborn",
+    )

     notebooks_list = list(Path("notebooks/").glob("*/*.ipynb"))

@@ -669,19 +683,22 @@ def notebook(session):
         # These notebooks contain special colab `param {type:"string"}`
         # comments, which make it easy for customers to fill in their
         # own information.
+        #
+        # With the notebooks_fill_params.py script, we are able to find and
+        # replace the PROJECT_ID parameter, but not the others.
+        #
         # TODO(ashleyxu): Test these notebooks by replacing parameters with
         # appropriate values and omitting cleanup logic that may break
         # our test infrastructure.
-        "notebooks/getting_started/getting_started_bq_dataframes.ipynb",
-        "notebooks/getting_started/ml_fundamentals_bq_dataframes.ipynb",
-        "notebooks/generative_ai/bq_dataframes_llm_code_generation.ipynb",
-        "notebooks/generative_ai/bq_dataframes_llm_kmeans.ipynb",
-        "notebooks/regression/bq_dataframes_ml_linear_regression.ipynb",
-        "notebooks/generative_ai/bq_dataframes_ml_drug_name_generation.ipynb",
-        "notebooks/vertex_sdk/sdk2_bigframes_pytorch.ipynb",
-        "notebooks/vertex_sdk/sdk2_bigframes_sklearn.ipynb",
-        "notebooks/vertex_sdk/sdk2_bigframes_tensorflow.ipynb",
-        "notebooks/visualization/bq_dataframes_covid_line_graphs.ipynb",
+        "notebooks/getting_started/ml_fundamentals_bq_dataframes.ipynb",  # Needs DATASET.
+        "notebooks/regression/bq_dataframes_ml_linear_regression.ipynb",  # Needs DATASET_ID.
+        "notebooks/generative_ai/bq_dataframes_ml_drug_name_generation.ipynb",  # Needs CONNECTION.
+        # TODO(swast): investigate why we get 404 errors, even though
+        # bq_dataframes_llm_code_generation creates a bucket in the sample.
+        "notebooks/generative_ai/bq_dataframes_llm_code_generation.ipynb",  # Needs BUCKET_URI.
+        "notebooks/vertex_sdk/sdk2_bigframes_pytorch.ipynb",  # Needs BUCKET_URI.
+        "notebooks/vertex_sdk/sdk2_bigframes_sklearn.ipynb",  # Needs BUCKET_URI.
+        "notebooks/vertex_sdk/sdk2_bigframes_tensorflow.ipynb",  # Needs BUCKET_URI.
         # The experimental notebooks imagine features that don't yet
         # exist or only exist as temporary prototypes.
         "notebooks/experimental/longer_ml_demo.ipynb",
@@ -709,9 +726,9 @@ def notebook(session):
         for nb, regions in notebooks_reg.items()
     }

-    # For some reason nbmake exits silently with "no tests ran" message if
+    # The pytest --nbmake exits silently with "no tests ran" message if
     # one of the notebook paths supplied does not exist. Let's make sure that
-    # each path exists
+    # each path exists.
     for nb in notebooks + list(notebooks_reg):
         assert os.path.exists(nb), nb

@@ -723,16 +740,33 @@ def notebook(session):
     pytest_command = [
         "py.test",
         "--nbmake",
-        "--nbmake-timeout=600",
+        "--nbmake-timeout=900",  # 15 minutes
     ]

-    # Run self-contained notebooks in single session.run
-    # achieve parallelization via -n
-    session.run(
-        *pytest_command,
-        "-nauto",
-        *notebooks,
-    )
+    try:
+        # Populate notebook parameters and make a backup so that the notebooks
+        # are runnable.
+        session.run(
+            "python",
+            CURRENT_DIRECTORY / "scripts" / "notebooks_fill_params.py",
+            *notebooks,
+        )
+
+        # Run self-contained notebooks in single session.run
+        # achieve parallelization via -n
+        session.run(
+            *pytest_command,
+            "-nauto",
+            *notebooks,
+        )
+    finally:
+        # Prevent our notebook changes from getting checked in to git
+        # accidentally.
+        session.run(
+            "python",
+            CURRENT_DIRECTORY / "scripts" / "notebooks_restore_from_backup.py",
+            *notebooks,
+        )

     # Run regionalized notebooks in parallel session.run's, since each notebook
     # takes a different region via env param.
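
The net effect of the noxfile changes is a fill-run-restore pipeline: parameters are injected before pytest runs, and the originals are restored afterwards even when the tests fail. A standalone sketch of that pattern, with subprocess standing in for nox's session.run and a hypothetical notebook path:

import pathlib
import subprocess
import sys

SCRIPTS = pathlib.Path(__file__).parent / "scripts"
notebooks = ["notebooks/getting_started/getting_started_bq_dataframes.ipynb"]

try:
    # Fill PROJECT_ID params, leaving a *.backup copy next to each notebook.
    subprocess.run(
        [sys.executable, SCRIPTS / "notebooks_fill_params.py", *notebooks],
        check=True,
    )
    # nbmake turns each notebook into a pytest test; -nauto parallelizes
    # across notebooks via pytest-xdist.
    subprocess.run(
        ["py.test", "--nbmake", "--nbmake-timeout=900", "-nauto", *notebooks],
        check=True,
    )
finally:
    # Always restore, so filled-in parameters never land in git.
    subprocess.run(
        [sys.executable, SCRIPTS / "notebooks_restore_from_backup.py", *notebooks],
        check=True,
    )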

scripts/notebooks_fill_params.py

Lines changed: 65 additions & 0 deletions
@@ -0,0 +1,65 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+import os
+import re
+import shutil
+import sys
+
+GOOGLE_CLOUD_PROJECT = os.environ["GOOGLE_CLOUD_PROJECT"]
+
+
+def make_backup(notebook_path: str):
+    shutil.copy(
+        notebook_path,
+        f"{notebook_path}.backup",
+    )
+
+
+def replace_project(line):
+    """
+    Notebooks contain special colab `param {type:"string"}`
+    comments, which make it easy for customers to fill in their
+    own information.
+    """
+    # Make sure we're robust to whitespace differences.
+    cleaned = re.sub(r"\s", "", line)
+    if cleaned == 'PROJECT_ID=""#@param{type:"string"}':
+        return f'PROJECT_ID = "{GOOGLE_CLOUD_PROJECT}" # @param {{type:"string"}}\n'
+    else:
+        return line
+
+
+def replace_params(notebook_path: str):
+    with open(notebook_path, "r", encoding="utf-8") as notebook_file:
+        notebook_json = json.load(notebook_file)
+
+    for cell in notebook_json["cells"]:
+        lines = cell.get("source", [])
+        new_lines = [replace_project(line) for line in lines]
+        cell["source"] = new_lines
+
+    with open(notebook_path, "w", encoding="utf-8") as notebook_file:
+        json.dump(notebook_json, notebook_file, indent=2, ensure_ascii=False)
+
+
+def main(notebook_paths):
+    for notebook_path in notebook_paths:
+        make_backup(notebook_path)
+        replace_params(notebook_path)
+
+
+if __name__ == "__main__":
+    main(sys.argv[1:])
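
Because the comparison strips all whitespace first, any spacing of the colab param line is recognized. A quick illustration of the matching rule (the project value is hypothetical):

import re

line = 'PROJECT_ID = ""  # @param {type:"string"}\n'
cleaned = re.sub(r"\s", "", line)
# All whitespace variants collapse to the same canonical key.
assert cleaned == 'PROJECT_ID=""#@param{type:"string"}'
# With GOOGLE_CLOUD_PROJECT="my-project", replace_project() rewrites the
# cell line to:
#     PROJECT_ID = "my-project" # @param {type:"string"}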

scripts/notebooks_restore_from_backup.py

Lines changed: 35 additions & 0 deletions
@@ -0,0 +1,35 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pathlib
+import shutil
+import sys
+
+
+def restore_from_backup(notebook_path):
+    backup_path = pathlib.Path(f"{notebook_path}.backup")
+    if backup_path.exists():
+        shutil.move(
+            backup_path,
+            notebook_path,
+        )
+
+
+def main(notebook_paths):
+    for notebook_path in notebook_paths:
+        restore_from_backup(notebook_path)
+
+
+if __name__ == "__main__":
+    main(sys.argv[1:])
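
The exists() guard is what the commit message calls being robust to a missing backup: restoring is a no-op when notebooks_fill_params.py never got as far as writing one. A hypothetical smoke check from the repo root:

import subprocess
import sys

# With no *.backup files on disk, the script should still exit 0.
result = subprocess.run(
    [
        sys.executable,
        "scripts/notebooks_restore_from_backup.py",
        "notebooks/getting_started/getting_started_bq_dataframes.ipynb",
    ],
)
assert result.returncode == 0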
