
Commit 2ed98a5

Add more unit tests for data pipeline utils in agent example (#29)
* WIP
* WIP
* Remove lint workflow
* Add comments to utils
* Fix test
* Delete autogenerated comments
* Switch to .show() which is defined locally
* Add more tests for file loading & parsing logic in data pipeline
* Add more tests for file parsing
* Update agent_app_sample_code/tests/test_file_loading.py
* Add developer README

---------

Signed-off-by: Sid Murching <sid.murching@databricks.com>
1 parent dcbbe41 commit 2ed98a5

File tree

3 files changed: +61 −6 lines changed


agent_app_sample_code/02_data_pipeline.py

Lines changed: 0 additions & 1 deletion
@@ -216,7 +216,6 @@ def file_parser(
     """
     try:
         filename, file_extension = os.path.splitext(doc_path)
-        parsed_document = {}

         if file_extension == ".pdf":
             pdf = io.BytesIO(raw_doc_contents_bytes)
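The single change here deletes a `parsed_document = {}` accumulator that the rest of the function presumably never used. For orientation only, here is a stripped-down sketch of a parser with the same shape, pieced together from the signature and return fields that the new test's mock parser uses; the `pypdf`-based PDF branch and the error handling are illustrative assumptions, not the notebook's actual implementation.

```python
import io
import os
from datetime import datetime

from pypdf import PdfReader  # assumption: any PDF text extractor could be used here


def file_parser(
    raw_doc_contents_bytes: bytes,
    doc_path: str,
    modification_time: datetime,
    doc_bytes_length: int,
):
    """Parse one raw file into the fields expected downstream
    (doc_content, parser_status, doc_uri)."""
    try:
        filename, file_extension = os.path.splitext(doc_path)

        if file_extension == ".pdf":
            pdf = io.BytesIO(raw_doc_contents_bytes)
            doc_content = "\n".join(
                page.extract_text() or "" for page in PdfReader(pdf).pages
            )
        else:
            doc_content = raw_doc_contents_bytes.decode("utf-8")

        # The result dict is built directly at return time, which is why the
        # empty `parsed_document = {}` removed in this commit was dead code.
        return {
            "doc_content": doc_content,
            "parser_status": "SUCCESS",
            "doc_uri": doc_path,
        }
    except Exception as e:
        return {
            "doc_content": "",
            "parser_status": f"ERROR: {e}",
            "doc_uri": doc_path,
        }
```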

agent_app_sample_code/tests/test_file_loading.py

Lines changed: 51 additions & 4 deletions
@@ -1,26 +1,35 @@
+from datetime import datetime
+
 import pytest
 import pyspark
 import pandas as pd
+from typing import TypedDict

-from agent_app_sample_code.utils.file_loading import load_files_to_df
+from agent_app_sample_code.utils.file_loading import load_files_to_df, apply_parsing_udf
+from agent_app_sample_code.utils.typed_dicts_to_spark_schema import typed_dicts_to_spark_schema

 @pytest.fixture(scope="module")
 def spark():
     return (
         pyspark.sql.SparkSession.builder
         .master("local[1]")
-        # Uncomment the following line for testing on Apple silicon locally
         .config("spark.driver.bindAddress", "127.0.0.1")
         .config("spark.task.maxFailures", "1") # avoid retry failed spark tasks
         .getOrCreate()
-        )
+    )

-def test_load_files_to_df(spark, tmpdir):
+@pytest.fixture()
+def example_files_dir(tmpdir):
     temp_dir = tmpdir.mkdir("files_subdir")
     file_1 = temp_dir.join("file1.txt")
     file_2 = temp_dir.join("file2.txt")
     file_1.write("file1 content")
     file_2.write("file2 content")
+    yield temp_dir, file_1, file_2
+
+
+def test_load_files_to_df(spark, example_files_dir):
+    temp_dir, file_1, file_2 = example_files_dir
     raw_files_df = load_files_to_df(spark, str(temp_dir)).drop("modificationTime").orderBy("path")
     assert raw_files_df.count() == 2
     raw_pandas_df = raw_files_df.toPandas()
@@ -44,3 +53,41 @@ def test_load_files_to_df_throws_if_no_files(spark, tmpdir):
     temp_dir = tmpdir.mkdir("files_subdir")
     with pytest.raises(Exception, match="does not contain any files"):
         load_files_to_df(spark, str(temp_dir))
+
+class ParserReturnValue(TypedDict):
+    # Parsed content of the document
+    doc_content: str  # do not change this name
+    # The status of whether the parser succeeds or fails, used to exclude failed files downstream
+    parser_status: str  # do not change this name
+    # Unique ID of the document
+    doc_uri: str  # do not change this name
+
+def test_apply_parsing_udf(spark, example_files_dir):
+    def _mock_file_parser(
+        raw_doc_contents_bytes: bytes,
+        doc_path: str,
+        modification_time: datetime,
+        doc_bytes_length: int,
+    ):
+        return {
+            "doc_content": raw_doc_contents_bytes.decode("utf-8"),
+            "parser_status": "SUCCESS",
+            "doc_uri": doc_path,
+        }
+
+    temp_dir, file_1, file_2 = example_files_dir
+    raw_files_df = load_files_to_df(spark, str(temp_dir)).orderBy("path")
+    parsed_df = apply_parsing_udf(raw_files_df, _mock_file_parser, parsed_df_schema=typed_dicts_to_spark_schema(ParserReturnValue))
+    assert parsed_df.count() == 2
+    parsed_pandas_df = parsed_df.toPandas()
+    # Expected DataFrame
+    expected_df = pd.DataFrame([{
+        "doc_content": file_1.read_text(encoding="utf-8"),
+        "parser_status": "SUCCESS",
+        "doc_uri": f"file:{str(file_1)}",
+    }, {
+        "doc_content": file_2.read_text(encoding="utf-8"),
+        "parser_status": "SUCCESS",
+        "doc_uri": f"file:{str(file_2)}",
+    }])
+    pd.testing.assert_frame_equal(parsed_pandas_df, expected_df)
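The new `test_apply_parsing_udf` depends on two helpers that are not part of this diff, `typed_dicts_to_spark_schema` and `apply_parsing_udf`. A minimal sketch of what they might look like, assuming `load_files_to_df` reads files with Spark's `binaryFile` source (which yields `path`, `modificationTime`, `length`, and `content` columns and explains the `file:` prefix in the expected `doc_uri` values); the real helpers under `agent_app_sample_code/utils/` may differ:

```python
from datetime import datetime
from typing import get_type_hints

import pyspark.sql.functions as F
from pyspark.sql.types import (LongType, StringType, StructField, StructType,
                               TimestampType)

# Minimal Python-annotation -> Spark-type mapping; extend as needed.
_PY_TO_SPARK = {str: StringType(), int: LongType(), datetime: TimestampType()}


def typed_dicts_to_spark_schema(typed_dict: type) -> StructType:
    """Build a StructType whose fields mirror the TypedDict's annotations."""
    return StructType([
        StructField(name, _PY_TO_SPARK[py_type], nullable=True)
        for name, py_type in get_type_hints(typed_dict).items()
    ])


def apply_parsing_udf(raw_files_df, parser_fn, parsed_df_schema: StructType):
    """Run parser_fn over each raw file row and keep only the parsed columns."""
    parse_udf = F.udf(
        lambda content, path, mtime, length: parser_fn(
            raw_doc_contents_bytes=bytes(content),
            doc_path=path,
            modification_time=mtime,
            doc_bytes_length=length,
        ),
        returnType=parsed_df_schema,
    )
    return (
        raw_files_df
        .withColumn("parsed", parse_udf("content", "path", "modificationTime", "length"))
        .select("parsed.*")
    )
```

In this sketch, returning a plain dict from the UDF works because Spark coerces it into the declared `StructType` by field name, and keeping the schema's field order identical to `ParserReturnValue` is what lets `pd.testing.assert_frame_equal` compare against the expected DataFrame column-for-column.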

dev/README.md

Lines changed: 10 additions & 1 deletion
@@ -1,13 +1,22 @@
 # Databricks Mosaic Generative AI Cookbook

-To start working on this book:
+## Dev env setup
 - clone the repo; `cd cookbook`
 - use your preferred approach to starting a new python environment
 - in that environment, `pip install -r dev/dev_requirements.txt`
+
+## Updating website content
+To test updates to site content at ai-cookbook.io
 - build and preview the site with `jupyter-book build --all genai_cookbook`

 The homepage is at `genai_cookbook/index.md`

 The content pages are in `genai_cookbook/nbs/`

 Jupyter book is fairly flexible and offers a lot of different options for formatting, cross-referencing, adding formatted callouts, etc. Read more at the [Jupyter Book docs](https://jupyterbook.org/en/stable/intro.html).
+
+## Updating code
+Use the `databricks sync` CLI command ([docs](https://docs.databricks.com/en/dev-tools/cli/sync-commands.html)) to sync the code in this repo to
+your Databricks workspace. You can then iterate on code in your IDE and test changes in
+Databricks. Be sure to add unit tests (as of the time of writing, tests are under `agent_app_sample_code/tests`).
+You can run unit tests via `pytest`
