Skip to content

Commit 08f4625

Browse files
authored
do not pre-load dataset version preview from string (#642)
1 parent 049f718 commit 08f4625

File tree

2 files changed

+47
-4
lines changed

2 files changed

+47
-4
lines changed

src/datachain/dataset.py

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
import json
33
from dataclasses import dataclass, fields
44
from datetime import datetime
5+
from functools import cached_property
56
from typing import (
67
Any,
78
NewType,
@@ -11,6 +12,8 @@
1112
)
1213
from urllib.parse import urlparse
1314

15+
import orjson
16+
1417
from datachain.error import DatasetVersionNotFoundError
1518
from datachain.sql.types import NAME_TYPES_MAPPING, SQLType
1619

@@ -178,7 +181,7 @@ class DatasetVersion:
178181
schema: dict[str, Union[SQLType, type[SQLType]]]
179182
num_objects: Optional[int]
180183
size: Optional[int]
181-
preview: Optional[list[dict]]
184+
_preview_data: Optional[Union[str, list[dict]]]
182185
sources: str = ""
183186
query_script: str = ""
184187
job_id: Optional[str] = None
@@ -199,7 +202,7 @@ def parse( # noqa: PLR0913
199202
script_output: str,
200203
num_objects: Optional[int],
201204
size: Optional[int],
202-
preview: Optional[str],
205+
preview: Optional[Union[str, list[dict]]],
203206
schema: dict[str, Union[SQLType, type[SQLType]]],
204207
sources: str = "",
205208
query_script: str = "",
@@ -220,7 +223,7 @@ def parse( # noqa: PLR0913
220223
schema,
221224
num_objects,
222225
size,
223-
json.loads(preview) if preview else None,
226+
preview,
224227
sources,
225228
query_script,
226229
job_id,
@@ -260,9 +263,17 @@ def serialized_schema(self) -> dict[str, Any]:
260263
for c_name, c_type in self.schema.items()
261264
}
262265

266+
@cached_property
def preview(self) -> Optional[list[dict]]:
    """Return the dataset preview rows, decoding them on first access.

    ``_preview_data`` may hold a JSON-encoded string or an already decoded
    list of row dicts. A string is decoded with ``orjson`` only when this
    property is first read; ``cached_property`` stores the result.

    Returns:
        The preview rows, or ``None`` when no preview data is stored
        (``None``, an empty string, or an empty list).
    """
    data = self._preview_data
    # Check for emptiness before decoding: an empty string is not valid
    # JSON, so passing it to orjson.loads would raise instead of meaning
    # "no preview".
    if not data:
        return None
    if isinstance(data, str):
        return orjson.loads(data)
    return data
271+
263272
@classmethod
def from_dict(cls, d: dict[str, Any]) -> "DatasetVersion":
    """Build an instance from a plain dict.

    Only keys that match a dataclass field name are passed to the
    constructor. When ``d`` has no ``_preview_data`` key, the value of its
    ``preview`` key (or ``None`` if absent) is used for that field instead.

    Args:
        d: Mapping of field names to values; unrelated keys are ignored.

    Returns:
        A new instance of ``cls``.
    """
    kwargs = {f.name: d[f.name] for f in fields(cls) if f.name in d}
    # kwargs is a dict, so test key membership. hasattr() checks for an
    # attribute and is always False here, which would overwrite an
    # explicitly supplied ``_preview_data``.
    if "_preview_data" not in kwargs:
        kwargs["_preview_data"] = d.get("preview")
    return cls(**kwargs)
267278

268279

tests/unit/test_dataset.py

Lines changed: 33 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import json
12
from datetime import datetime, timezone
23

34
import pytest
@@ -6,7 +7,7 @@
67
from sqlalchemy.schema import CreateTable
78

89
from datachain.data_storage.schema import DataTable
9-
from datachain.dataset import DatasetDependency, DatasetDependencyType
10+
from datachain.dataset import DatasetDependency, DatasetDependencyType, DatasetVersion
1011
from datachain.sql.types import (
1112
JSON,
1213
Array,
@@ -106,3 +107,34 @@ def test_dataset_dependency_dataset_name(dep_name, dep_type, expected):
106107
)
107108

108109
assert dep.dataset_name == expected
110+
111+
112+
@pytest.mark.parametrize("use_string", [True, False])
def test_dataset_version_from_dict(use_string):
    """Check that ``from_dict`` yields the same preview for both input forms.

    The preview is supplied either as a JSON string or as a decoded list of
    dicts; ``DatasetVersion.preview`` must equal the decoded rows each time.
    """
    expected_rows = [{"id": 1, "thing": "a"}, {"id": 2, "thing": "b"}]

    raw_preview = expected_rows
    if use_string:
        raw_preview = json.dumps(expected_rows)

    started = datetime.fromisoformat("2023-10-01T12:00:00")
    payload = {
        "id": 1,
        "uuid": "98928be4-b6e8-4b7b-a7c5-2ce3b33130d8",
        "dataset_id": 40,
        "version": 2,
        "status": 1,
        "feature_schema": {},
        "created_at": started,
        "finished_at": None,
        "error_message": "",
        "error_stack": "",
        "script_output": "",
        "schema": {},
        "num_objects": 100,
        "size": 1000000,
        "preview": raw_preview,
    }

    version = DatasetVersion.from_dict(payload)
    assert version.preview == expected_rows

0 commit comments

Comments
 (0)