
Commit 2ed98a5

Add more unit tests for data pipeline utils in agent example (#29)
* WIP
* WIP
* Remove lint workflow
* Add comments to utils
* Fix test
* Delete autogenerated comments
* Switch to .show() which is defined locally
* Add more tests for file loading & parsing logic in data pipeline
* Add more tests for file parsing
* Update agent_app_sample_code/tests/test_file_loading.py
* Add developer README

---------

Signed-off-by: Sid Murching <sid.murching@databricks.com>
1 parent dcbbe41 commit 2ed98a5

File tree

3 files changed: +61 −6 lines changed


agent_app_sample_code/02_data_pipeline.py

Lines changed: 0 additions & 1 deletion
@@ -216,7 +216,6 @@ def file_parser(
     """
     try:
         filename, file_extension = os.path.splitext(doc_path)
-        parsed_document = {}

         if file_extension == ".pdf":
             pdf = io.BytesIO(raw_doc_contents_bytes)
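The single change here deletes a `parsed_document = {}` accumulator that the rest of the function presumably never used. For orientation only, here is a stripped-down sketch of a parser with the same shape, pieced together from the signature and return fields that the new test's mock parser uses; the `pypdf`-based PDF branch and the error handling are illustrative assumptions, not the notebook's actual implementation.

```python
import io
import os
from datetime import datetime

from pypdf import PdfReader  # assumption: any PDF text extractor could be used here


def file_parser(
    raw_doc_contents_bytes: bytes,
    doc_path: str,
    modification_time: datetime,
    doc_bytes_length: int,
):
    """Parse one raw file into the fields expected downstream
    (doc_content, parser_status, doc_uri)."""
    try:
        filename, file_extension = os.path.splitext(doc_path)

        if file_extension == ".pdf":
            pdf = io.BytesIO(raw_doc_contents_bytes)
            doc_content = "\n".join(
                page.extract_text() or "" for page in PdfReader(pdf).pages
            )
        else:
            doc_content = raw_doc_contents_bytes.decode("utf-8")

        # The result dict is built directly at return time, which is why the
        # empty `parsed_document = {}` removed in this commit was dead code.
        return {
            "doc_content": doc_content,
            "parser_status": "SUCCESS",
            "doc_uri": doc_path,
        }
    except Exception as e:
        return {
            "doc_content": "",
            "parser_status": f"ERROR: {e}",
            "doc_uri": doc_path,
        }
```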

agent_app_sample_code/tests/test_file_loading.py

Lines changed: 51 additions & 4 deletions
@@ -1,26 +1,35 @@
+from datetime import datetime
+
 import pytest
 import pyspark
 import pandas as pd
+from typing import TypedDict

-from agent_app_sample_code.utils.file_loading import load_files_to_df
+from agent_app_sample_code.utils.file_loading import load_files_to_df, apply_parsing_udf
+from agent_app_sample_code.utils.typed_dicts_to_spark_schema import typed_dicts_to_spark_schema

 @pytest.fixture(scope="module")
 def spark():
     return (
         pyspark.sql.SparkSession.builder
         .master("local[1]")
-        # Uncomment the following line for testing on Apple silicon locally
         .config("spark.driver.bindAddress", "127.0.0.1")
         .config("spark.task.maxFailures", "1") # avoid retry failed spark tasks
         .getOrCreate()
-        )
+    )

-def test_load_files_to_df(spark, tmpdir):
+@pytest.fixture()
+def example_files_dir(tmpdir):
     temp_dir = tmpdir.mkdir("files_subdir")
     file_1 = temp_dir.join("file1.txt")
     file_2 = temp_dir.join("file2.txt")
     file_1.write("file1 content")
     file_2.write("file2 content")
+    yield temp_dir, file_1, file_2
+
+
+def test_load_files_to_df(spark, example_files_dir):
+    temp_dir, file_1, file_2 = example_files_dir
     raw_files_df = load_files_to_df(spark, str(temp_dir)).drop("modificationTime").orderBy("path")
     assert raw_files_df.count() == 2
     raw_pandas_df = raw_files_df.toPandas()
@@ -44,3 +53,41 @@ def test_load_files_to_df_throws_if_no_files(spark, tmpdir):
     temp_dir = tmpdir.mkdir("files_subdir")
     with pytest.raises(Exception, match="does not contain any files"):
         load_files_to_df(spark, str(temp_dir))
+
+class ParserReturnValue(TypedDict):
+    # Parsed content of the document
+    doc_content: str  # do not change this name
+    # The status of whether the parser succeeds or fails, used to exclude failed files downstream
+    parser_status: str  # do not change this name
+    # Unique ID of the document
+    doc_uri: str  # do not change this name
+
+def test_apply_parsing_udf(spark, example_files_dir):
+    def _mock_file_parser(
+        raw_doc_contents_bytes: bytes,
+        doc_path: str,
+        modification_time: datetime,
+        doc_bytes_length: int,
+    ):
+        return {
+            "doc_content": raw_doc_contents_bytes.decode("utf-8"),
+            "parser_status": "SUCCESS",
+            "doc_uri": doc_path,
+        }
+
+    temp_dir, file_1, file_2 = example_files_dir
+    raw_files_df = load_files_to_df(spark, str(temp_dir)).orderBy("path")
+    parsed_df = apply_parsing_udf(raw_files_df, _mock_file_parser, parsed_df_schema=typed_dicts_to_spark_schema(ParserReturnValue))
+    assert parsed_df.count() == 2
+    parsed_pandas_df = parsed_df.toPandas()
+    # Expected DataFrame
+    expected_df = pd.DataFrame([{
+        "doc_content": file_1.read_text(encoding="utf-8"),
+        "parser_status": "SUCCESS",
+        "doc_uri": f"file:{str(file_1)}",
+    }, {
+        "doc_content": file_2.read_text(encoding="utf-8"),
+        "parser_status": "SUCCESS",
+        "doc_uri": f"file:{str(file_2)}",
+    }])
+    pd.testing.assert_frame_equal(parsed_pandas_df, expected_df)
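The new `test_apply_parsing_udf` depends on two helpers that are not part of this diff, `typed_dicts_to_spark_schema` and `apply_parsing_udf`. A minimal sketch of what they might look like, assuming `load_files_to_df` reads files with Spark's `binaryFile` source (which yields `path`, `modificationTime`, `length`, and `content` columns and explains the `file:` prefix in the expected `doc_uri` values); the real helpers under `agent_app_sample_code/utils/` may differ:

```python
from datetime import datetime
from typing import get_type_hints

import pyspark.sql.functions as F
from pyspark.sql.types import (LongType, StringType, StructField, StructType,
                               TimestampType)

# Minimal Python-annotation -> Spark-type mapping; extend as needed.
_PY_TO_SPARK = {str: StringType(), int: LongType(), datetime: TimestampType()}


def typed_dicts_to_spark_schema(typed_dict: type) -> StructType:
    """Build a StructType whose fields mirror the TypedDict's annotations."""
    return StructType([
        StructField(name, _PY_TO_SPARK[py_type], nullable=True)
        for name, py_type in get_type_hints(typed_dict).items()
    ])


def apply_parsing_udf(raw_files_df, parser_fn, parsed_df_schema: StructType):
    """Run parser_fn over each raw file row and keep only the parsed columns."""
    parse_udf = F.udf(
        lambda content, path, mtime, length: parser_fn(
            raw_doc_contents_bytes=bytes(content),
            doc_path=path,
            modification_time=mtime,
            doc_bytes_length=length,
        ),
        returnType=parsed_df_schema,
    )
    return (
        raw_files_df
        .withColumn("parsed", parse_udf("content", "path", "modificationTime", "length"))
        .select("parsed.*")
    )
```

In this sketch, returning a plain dict from the UDF works because Spark coerces it into the declared `StructType` by field name, and keeping the schema's field order identical to `ParserReturnValue` is what lets `pd.testing.assert_frame_equal` compare against the expected DataFrame column-for-column.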

dev/README.md

Lines changed: 10 additions & 1 deletion
@@ -1,13 +1,22 @@
 # Databricks Mosaic Generative AI Cookbook

-To start working on this book:
+## Dev env setup
 - clone the repo; `cd cookbook`
 - use your preferred approach to starting a new python environment
 - in that environment, `pip install -r dev/dev_requirements.txt`
+
+## Updating website content
+To test updates to site content at ai-cookbook.io
 - build and preview the site with `jupyter-book build --all genai_cookbook`

 The homepage is at `genai_cookbook/index.md`

 The content pages are in `genai_cookbook/nbs/`

 Jupyter book is fairly flexible and offers a lot of different options for formatting, cross-referencing, adding formatted callouts, etc. Read more at the [Jupyter Book docs](https://jupyterbook.org/en/stable/intro.html).
+
+## Updating code
+Use the `databricks sync` CLI command ([docs](https://docs.databricks.com/en/dev-tools/cli/sync-commands.html)) to sync the code in this repo to
+your Databricks workspace. You can then iterate on code in your IDE and test changes in
+Databricks. Be sure to add unit tests (as of the time of writing, tests are under `agent_app_sample_code/tests`).
+You can run unit tests via `pytest`
