Skip to content

Commit bdede34

Browse files
committed
add examples smoke tests
1 parent 915e5c8 commit bdede34

File tree

10 files changed

+175
-30
lines changed

10 files changed

+175
-30
lines changed

.github/workflows/benchmarks.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ env:
1111
FORCE_COLOR: "1"
1212

1313
jobs:
14-
build:
14+
run:
1515
if: ${{ github.event_name != 'pull_request' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks') }}
1616
runs-on: ubuntu-latest
1717

.github/workflows/examples.yml

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
name: Examples
2+
3+
on:
4+
workflow_dispatch:
5+
schedule:
6+
- cron: '0 3 * * *'
7+
push: # to remove
8+
9+
env:
10+
FORCE_COLOR: "1"
11+
12+
jobs:
13+
run:
14+
runs-on: ${{ matrix.os }}
15+
timeout-minutes: 60
16+
strategy:
17+
fail-fast: false
18+
matrix:
19+
os: [ubuntu-latest-16-cores, macos-latest, windows-latest-8-cores]
20+
pyv: ['3.9', '3.12']
21+
steps:
22+
- uses: actions/checkout@v4
23+
24+
- name: Set up Python ${{ matrix.pyv }}
25+
uses: actions/setup-python@v5
26+
with:
27+
python-version: ${{ matrix.pyv }}
28+
cache: 'pip'
29+
30+
- name: Upgrade nox and uv
31+
run: |
32+
python -m pip install --upgrade 'nox[uv]'
33+
nox --version
34+
uv --version
35+
36+
- name: Run examples
37+
run: nox -s examples -p ${{ matrix.pyv }}

examples/computer_vision/iptc_exif_xmp_lib.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
# pip install defusedxml
12
import json
23

34
from PIL import (
@@ -63,7 +64,7 @@ def image_description(file):
6364
DataChain.from_storage(source, type="image")
6465
.settings(parallel=-1)
6566
.filter(C("file.name").glob("*.jpg"))
66-
.limit(10000)
67+
.limit(5000)
6768
.map(
6869
image_description,
6970
params=["file"],

examples/computer_vision/openimage-detect.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ def openimage_detect(args):
4848
yield fstream, bbox
4949

5050

51-
source = "gs://datachain-demo/openimages-v6-test-jsonpairs"
51+
source = "gs://datachain-demo/openimages-v6-test-jsonpairs/"
5252

5353
(
5454
DataChain.from_storage(source)

examples/get_started/json-csv-reader.py

Lines changed: 4 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ def main():
3636
print("========================================================================")
3737
uri = "gs://datachain-demo/jsonl/object.jsonl"
3838
jsonl_ds = DataChain.from_json(uri, meta_type="jsonl", print_schema=True)
39-
print(jsonl_ds.to_pandas())
39+
jsonl_ds.show()
4040

4141
print()
4242
print("========================================================================")
@@ -49,8 +49,7 @@ def main():
4949
json_pairs_ds = DataChain.from_json(
5050
uri, schema_from=schema_uri, jmespath="@", model_name="OpenImage"
5151
)
52-
print(json_pairs_ds.to_pandas())
53-
# print(list(json_pairs_ds.collect())[0])
52+
json_pairs_ds.show()
5453

5554
uri = "gs://datachain-demo/coco2017/annotations_captions/"
5655

@@ -72,7 +71,7 @@ def main():
7271
static_json_ds = DataChain.from_json(
7372
uri, jmespath="licenses", spec=LicenseFeature, nrows=3
7473
)
75-
print(static_json_ds.to_pandas())
74+
static_json_ds.show()
7675

7776
print()
7877
print("========================================================================")
@@ -88,16 +87,7 @@ def main():
8887
print("========================================================================")
8988
static_csv_ds = DataChain.from_csv(uri, output=ChatDialog, object_name="chat")
9089
static_csv_ds.print_schema()
91-
print(static_csv_ds.to_pandas())
92-
93-
uri = "gs://datachain-demo/laion-aesthetics-csv"
94-
print()
95-
print("========================================================================")
96-
print("dynamic CSV with header schema test parsing 3/3M objects")
97-
print("========================================================================")
98-
dynamic_csv_ds = DataChain.from_csv(uri, object_name="laion", nrows=3)
99-
dynamic_csv_ds.print_schema()
100-
print(dynamic_csv_ds.to_pandas())
90+
static_csv_ds.show()
10191

10292

10393
if __name__ == "__main__":

examples/get_started/torch-loader.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,7 @@ def forward(self, x):
6464
optimizer = optim.Adam(model.parameters(), lr=0.001)
6565

6666
# Train the model
67-
num_epochs = 10
67+
num_epochs = 1
6868
for epoch in range(num_epochs):
6969
for i, data in enumerate(train_loader):
7070
inputs, labels = data

examples/multimodal/wds.py

Lines changed: 20 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,3 @@
1-
import pandas as pd
2-
31
from datachain import C, DataChain
42
from datachain.lib.webdataset import process_webdataset
53
from datachain.lib.webdataset_laion import WDSLaion, process_laion_meta
@@ -9,25 +7,36 @@
97
.filter(C("file.name").glob("00000000.tar"))
108
.settings(cache=True)
119
.gen(laion=process_webdataset(spec=WDSLaion), params="file")
10+
.save() # materialize chain to avoid downloading data multiple times
11+
)
12+
13+
meta_pq = (
14+
DataChain.from_parquet("gs://datachain-demo/datacomp-small/metadata/0020f*.parquet")
15+
.filter(
16+
C("uid").in_(values[0] for values in wds.select("laion.json.uid").collect())
17+
)
18+
.map(stem=lambda file: file.get_file_stem(), params=["source.file"], output=str)
19+
.save()
1220
)
1321

1422
meta_emd = (
15-
DataChain.from_storage("gs://datachain-demo/datacomp-small/metadata")
16-
.filter(C("file.name").glob("0020f*.npz"))
23+
DataChain.from_storage("gs://datachain-demo/datacomp-small/metadata/0020f*.npz")
1724
.gen(emd=process_laion_meta)
25+
.filter(
26+
C("emd.index").in_(
27+
values[0] for values in meta_pq.select("source.index").collect()
28+
)
29+
)
1830
.map(stem=lambda file: file.get_file_stem(), params=["emd.file"], output=str)
1931
)
2032

21-
meta_pq = DataChain.from_parquet(
22-
"gs://datachain-demo/datacomp-small/metadata/0020f*.parquet"
23-
).map(stem=lambda file: file.get_file_stem(), params=["source.file"], output=str)
2433

2534
meta = meta_emd.merge(
26-
meta_pq, on=["stem", "emd.index"], right_on=["stem", "source.index"]
35+
meta_pq,
36+
on=["stem", "emd.index"],
37+
right_on=["stem", "source.index"],
2738
)
2839

2940
res = wds.merge(meta, on="laion.json.uid", right_on="uid")
3041

31-
df = res.limit(10).to_pandas()
32-
with pd.option_context("display.max_columns", None):
33-
print(df)
42+
res.show(3)

noxfile.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,3 +74,19 @@ def dev(session: nox.Session) -> None:
7474

7575
python = os.path.join(venv_dir, "bin/python")
7676
session.run(python, "-m", "pip", "install", "-e", ".[dev]", external=True)
77+
78+
79+
@nox.session(python=["3.9", "3.10", "3.11", "3.12", "pypy3.9", "pypy3.10"])
def examples(session: nox.Session) -> None:
    """Install the example dependencies and run the ``examples``-marked tests.

    The ``.[examples]`` extra is installed first. ``unstructured[all-docs]``
    is then installed on a best-effort basis: if that install fails, a
    warning is emitted and the session carries on without it.

    Args:
        session: The nox session used for installing and running pytest.
            ``session.posargs`` are appended to the pytest command line.
    """
    session.install(".[examples]")
    try:
        session.install("unstructured[all-docs]")
    except Exception as exc:  # noqa: BLE001
        # Best effort: keep going without the optional package, but report the
        # failure instead of hiding it. Catching ``Exception`` (not a bare
        # ``except``) lets Ctrl-C and SystemExit still abort the session.
        session.warn(f"could not install unstructured[all-docs]: {exc}")
    session.run("pytest", "-m", "examples", "-vvv", *session.posargs)

pyproject.toml

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,11 @@ dev = [
9393
"types-PyYAML",
9494
"types-requests"
9595
]
96+
examples = [
97+
"datachain[tests]",
98+
"defusedxml",
99+
"accelerate",
100+
]
96101

97102
[project.urls]
98103
Documentation = "https://datachain.dvc.ai"
@@ -110,7 +115,7 @@ namespaces = false
110115
[tool.setuptools_scm]
111116

112117
[tool.pytest.ini_options]
113-
addopts = "-rfEs -m 'not benchmark'"
118+
addopts = "-rfEs -m 'not benchmark and not examples'"
114119
markers = [
115120
"benchmark: benchmarks.",
116121
"e2e: End-to-end tests"

tests/examples/test_examples.py

Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
import glob
2+
import os
3+
import subprocess
4+
import sys
5+
6+
import pytest
7+
8+
NO_EXAMPLES = "no examples found"
9+
10+
11+
def can_import_unstructured():
12+
try:
13+
import unstructured # noqa: F401
14+
15+
return True
16+
except ImportError:
17+
return False
18+
19+
20+
# Every script under examples/get_started. Scripts with "torch" in their path
# are dropped only when the RUNNER_OS environment variable is "Linux".
_skip_torch = os.environ.get("RUNNER_OS") == "Linux"
get_started_examples = [
    path
    for path in glob.glob("examples/get_started/**/*.py", recursive=True)
    if not (_skip_torch and "torch" in path)
]
25+
26+
def _llm_and_nlp_runnable(path):
    """Return whether the LLM/NLP example at ``path`` should be smoke-tested."""
    # no anthropic token
    if "claude" in path:
        return False
    # The import probe runs only for scripts that mention "unstructured".
    return "unstructured" not in path or can_import_unstructured()


# Falls back to the NO_EXAMPLES placeholder when nothing is left to run.
llm_and_nlp_examples = list(
    filter(
        _llm_and_nlp_runnable,
        glob.glob("examples/llm_and_nlp/**/*.py", recursive=True),
    )
) or [NO_EXAMPLES]
33+
34+
# Scripts under examples/multimodal, minus the ones whose path contains one of
# these markers:
#   "openai" - no OpenAI token
#   "hf"     - hf download painfully slow
_MULTIMODAL_SKIP_MARKERS = ("openai", "hf")
multimodal_examples = [
    path
    for path in glob.glob("examples/multimodal/**/*.py", recursive=True)
    if not any(marker in path for marker in _MULTIMODAL_SKIP_MARKERS)
]
41+
42+
# Scripts under examples/computer_vision (not examples/multimodal, which has
# its own list), minus the ones whose path contains "image_desc" or
# "fashion_product_images".
computer_vision_examples = [
    filename
    for filename in glob.glob("examples/computer_vision/**/*.py", recursive=True)
    # fashion product images tutorial out of scope
    # and hf download painfully slow
    if "image_desc" not in filename and "fashion_product_images" not in filename
]
49+
50+
51+
def smoke_test(example: str):
52+
if example == NO_EXAMPLES:
53+
return
54+
55+
completed_process = subprocess.run( # noqa: S603
56+
[sys.executable, example],
57+
capture_output=True,
58+
cwd=os.path.abspath(os.path.join(__file__, "..", "..", "..")),
59+
check=True,
60+
)
61+
62+
assert completed_process.stdout
63+
assert completed_process.stderr
64+
65+
66+
@pytest.mark.examples
@pytest.mark.parametrize("example", get_started_examples)
def test_get_started_examples(example):
    """Smoke-test one script collected in ``get_started_examples``."""
    smoke_test(example=example)
70+
71+
72+
@pytest.mark.examples
@pytest.mark.parametrize("example", llm_and_nlp_examples)
def test_llm_and_nlp_examples(example):
    """Smoke-test one entry of ``llm_and_nlp_examples``."""
    smoke_test(example=example)
76+
77+
78+
@pytest.mark.examples
@pytest.mark.parametrize("example", multimodal_examples)
def test_multimodal(example):
    """Smoke-test one script collected in ``multimodal_examples``."""
    smoke_test(example=example)
82+
83+
84+
@pytest.mark.examples
@pytest.mark.parametrize("example", computer_vision_examples)
def test_computer_vision_examples(example):
    """Smoke-test one script collected in ``computer_vision_examples``."""
    smoke_test(example=example)

0 commit comments

Comments
 (0)