Skip to content

Commit b22d4ae

Browse files
committed
add examples smoke tests
1 parent 22175cb commit b22d4ae

File tree

8 files changed

+130
-25
lines changed

8 files changed

+130
-25
lines changed

.github/workflows/benchmarks.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ env:
1111
FORCE_COLOR: "1"
1212

1313
jobs:
14-
build:
14+
run:
1515
if: ${{ github.event_name != 'pull_request' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks') }}
1616
runs-on: ubuntu-latest
1717

.github/workflows/examples.yml

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
# Smoke-test workflow: runs the nox `examples` session for every
# OS / Python combination listed in the matrix below.
name: Examples

on:
  workflow_dispatch:
  schedule:
    # Every day at 03:00 (GitHub Actions evaluates cron schedules in UTC).
    - cron: '0 3 * * *'
  push: # to remove

env:
  FORCE_COLOR: "1"

jobs:
  run:
    runs-on: ${{ matrix.os }}
    timeout-minutes: 60
    strategy:
      # Let the remaining matrix jobs finish even if one of them fails.
      fail-fast: false
      matrix:
        os:
          - ubuntu-latest-8-cores
          - macos-latest
          - windows-latest-8-cores
        pyv:
          - '3.9'
          - '3.12'
    steps:
      - uses: actions/checkout@v4

      - name: Set up Python ${{ matrix.pyv }}
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.pyv }}
          cache: 'pip'

      - name: Upgrade nox and uv
        run: |
          python -m pip install --upgrade 'nox[uv]'
          nox --version
          uv --version

      # Restrict nox to the interpreter selected for this matrix entry.
      - name: Run examples
        run: nox -s examples -p ${{ matrix.pyv }}

examples/get_started/json-csv-reader.py

Lines changed: 9 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ def main():
3636
print("========================================================================")
3737
uri = "gs://datachain-demo/jsonl/object.jsonl"
3838
jsonl_ds = DataChain.from_json(uri, meta_type="jsonl", show_schema=True)
39-
print(jsonl_ds.to_pandas())
39+
jsonl_ds.show()
4040

4141
print()
4242
print("========================================================================")
@@ -49,8 +49,7 @@ def main():
4949
json_pairs_ds = DataChain.from_json(
5050
uri, schema_from=schema_uri, jmespath="@", model_name="OpenImage"
5151
)
52-
print(json_pairs_ds.to_pandas())
53-
# print(list(json_pairs_ds.collect())[0])
52+
json_pairs_ds.show()
5453

5554
uri = "gs://datachain-demo/coco2017/annotations_captions/"
5655

@@ -72,14 +71,14 @@ def main():
7271
static_json_ds = DataChain.from_json(
7372
uri, jmespath="licenses", spec=LicenseFeature, nrows=3
7473
)
75-
print(static_json_ds.to_pandas())
74+
static_json_ds.show()
7675

7776
print()
7877
print("========================================================================")
7978
print("dynamic JSON schema test parsing 5K objects")
8079
print("========================================================================")
8180
dynamic_json_ds = DataChain.from_json(uri, jmespath="images", show_schema=True)
82-
print(dynamic_json_ds.to_pandas())
81+
dynamic_json_ds.show()
8382

8483
uri = "gs://datachain-demo/chatbot-csv/"
8584
print()
@@ -88,16 +87,16 @@ def main():
8887
print("========================================================================")
8988
static_csv_ds = DataChain.from_csv(uri, output=ChatDialog, object_name="chat")
9089
static_csv_ds.print_schema()
91-
print(static_csv_ds.to_pandas())
90+
static_csv_ds.show()
9291

93-
uri = "gs://datachain-demo/laion-aesthetics-csv"
92+
uri = "gs://datachain-demo/laion-aesthetics-csv/"
9493
print()
9594
print("========================================================================")
96-
print("dynamic CSV with header schema test parsing 3/3M objects")
95+
print("dynamic CSV with header schema test parsing 3M objects")
9796
print("========================================================================")
98-
dynamic_csv_ds = DataChain.from_csv(uri, object_name="laion", nrows=3)
97+
dynamic_csv_ds = DataChain.from_csv(uri, object_name="laion")
9998
dynamic_csv_ds.print_schema()
100-
print(dynamic_csv_ds.to_pandas())
99+
dynamic_csv_ds.show()
101100

102101

103102
if __name__ == "__main__":

examples/get_started/torch-loader.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,7 @@ def forward(self, x):
6464
optimizer = optim.Adam(model.parameters(), lr=0.001)
6565

6666
# Train the model
67-
num_epochs = 10
67+
num_epochs = 3
6868
for epoch in range(num_epochs):
6969
for i, data in enumerate(train_loader):
7070
inputs, labels = data

examples/multimodal/wds.py

Lines changed: 20 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,3 @@
1-
import pandas as pd
2-
31
from datachain import C, DataChain
42
from datachain.lib.webdataset import process_webdataset
53
from datachain.lib.webdataset_laion import WDSLaion, process_laion_meta
@@ -9,25 +7,36 @@
97
.filter(C("file.name").glob("00000000.tar"))
108
.settings(cache=True)
119
.gen(laion=process_webdataset(spec=WDSLaion), params="file")
10+
.save() # materialize chain to avoid downloading data multiple times
11+
)
12+
13+
meta_pq = (
14+
DataChain.from_parquet("gs://datachain-demo/datacomp-small/metadata/0020f*.parquet")
15+
.filter(
16+
C("uid").in_(values[0] for values in wds.select("laion.json.uid").collect())
17+
)
18+
.map(stem=lambda file: file.get_file_stem(), params=["source.file"], output=str)
19+
.save()
1220
)
1321

1422
meta_emd = (
15-
DataChain.from_storage("gs://datachain-demo/datacomp-small/metadata")
16-
.filter(C("file.name").glob("0020f*.npz"))
23+
DataChain.from_storage("gs://datachain-demo/datacomp-small/metadata/0020f*.npz")
1724
.gen(emd=process_laion_meta)
25+
.filter(
26+
C("emd.index").in_(
27+
values[0] for values in meta_pq.select("source.index").collect()
28+
)
29+
)
1830
.map(stem=lambda file: file.get_file_stem(), params=["emd.file"], output=str)
1931
)
2032

21-
meta_pq = DataChain.from_parquet(
22-
"gs://datachain-demo/datacomp-small/metadata/0020f*.parquet"
23-
).map(stem=lambda file: file.get_file_stem(), params=["source.file"], output=str)
2433

2534
meta = meta_emd.merge(
26-
meta_pq, on=["stem", "emd.index"], right_on=["stem", "source.index"]
35+
meta_pq,
36+
on=["stem", "emd.index"],
37+
right_on=["stem", "source.index"],
2738
)
2839

2940
res = wds.merge(meta, on="laion.json.uid", right_on="uid")
3041

31-
df = res.limit(10).to_pandas()
32-
with pd.option_context("display.max_columns", None):
33-
print(df)
42+
res.show()

noxfile.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,3 +74,14 @@ def dev(session: nox.Session) -> None:
7474

7575
python = os.path.join(venv_dir, "bin/python")
7676
session.run(python, "-m", "pip", "install", "-e", ".[dev]", external=True)
77+
78+
79+
@nox.session(python=["3.9", "3.10", "3.11", "3.12", "pypy3.9", "pypy3.10"])
def examples(session: nox.Session) -> None:
    """Run the tests marked ``examples`` under each listed interpreter.

    Installs the project together with its ``tests`` extra, then invokes
    pytest restricted to the ``examples`` marker. Any positional arguments
    passed to nox are forwarded to pytest unchanged.
    """
    session.install(".[tests]")
    pytest_command = ["pytest", "-m", "examples", *session.posargs]
    session.run(*pytest_command)

pyproject.toml

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -83,7 +83,8 @@ tests = [
8383
"hypothesis",
8484
"open_clip_torch",
8585
"aiotools>=1.7.0",
86-
"requests-mock"
86+
"requests-mock",
87+
"unstructured[all-docs]"
8788
]
8889
dev = [
8990
"datachain[docs,tests]",
@@ -110,7 +111,7 @@ namespaces = false
110111
[tool.setuptools_scm]
111112

112113
[tool.pytest.ini_options]
113-
addopts = "-rfEs -m 'not benchmark'"
114+
addopts = "-rfEs -m 'not benchmark and not examples'"
114115
markers = [
115116
"benchmark: benchmarks.",
116117
"e2e: End-to-end tests"

tests/examples/test_examples.py

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
import glob
2+
import os
3+
import subprocess
4+
import sys
5+
6+
import pytest
7+
8+
# Example scripts are discovered relative to the current working directory.
# Every collection is a sorted list rather than a lazy ``filter`` object:
# a list can be iterated any number of times (an iterator is empty after the
# first pass), and sorting makes the parametrization order reproducible
# because ``glob`` does not guarantee any particular ordering.
get_started_examples = sorted(
    glob.glob("examples/get_started/**/*.py", recursive=True)
)
llm_and_nlp_examples = sorted(
    filename
    for filename in glob.glob("examples/llm_and_nlp/**/*.py", recursive=True)
    # no anthropic token
    if "claude" not in filename
)
multimodal_examples = sorted(
    filename
    for filename in glob.glob("examples/multimodal/**/*.py", recursive=True)
    # no OpenAI token and hf download painfully slow
    if "openai" not in filename and "hf" not in filename
)
19+
20+
21+
def smoke_test(example: str):
    """Run one example script in a subprocess and check that it produced output.

    The script is executed with the current interpreter, using as working
    directory the folder two levels above the one that contains this file.

    Args:
        example: Path of the example script to execute.

    Raises:
        subprocess.CalledProcessError: If the script exits with a non-zero
            status. The captured output is replayed first so the test report
            shows why the example failed.
        AssertionError: If the script wrote nothing to stdout or nothing to
            stderr.
    """
    working_dir = os.path.abspath(os.path.join(__file__, "..", "..", ".."))
    try:
        completed_process = subprocess.run(  # noqa: S603
            [sys.executable, example],
            capture_output=True,
            cwd=working_dir,
            check=True,
        )
    except subprocess.CalledProcessError as error:
        # The exception message only carries the exit status; replay the
        # captured streams so they end up in the failure report.
        sys.stdout.write(error.stdout.decode(errors="replace"))
        sys.stderr.write(error.stderr.decode(errors="replace"))
        raise

    assert completed_process.stdout
    assert completed_process.stderr
32+
33+
@pytest.mark.examples
@pytest.mark.parametrize("example", get_started_examples)
def test_get_started_examples(example):
    """Smoke-test every script collected from ``examples/get_started``."""
    smoke_test(example=example)
37+
38+
39+
@pytest.mark.examples
@pytest.mark.parametrize("example", llm_and_nlp_examples)
def test_llm_and_nlp_examples(example):
    """Smoke-test every script collected from ``examples/llm_and_nlp``."""
    smoke_test(example=example)
43+
44+
45+
@pytest.mark.examples
@pytest.mark.parametrize("example", multimodal_examples)
def test_multimodal(example):
    """Smoke-test every script collected from ``examples/multimodal``."""
    smoke_test(example=example)

0 commit comments

Comments
 (0)