Skip to content

Commit b1f2fc5

Browse files
committed
pull _is_file checks to get_listing
The parsing utility should not hit remote servers at all. Also returning `None` does not make sense for `parse_listing_uri` when the uri is a file since there is no listing. It's better to keep this function naive than to add other logic into it. And I think `get_listing` is a better place for that, to have all kinds of logic with side-effects. The only other place where this function is used besides `get_listing` is in `DatasetDependency.dataset_name`. In that case, a dataset already exists and since there can be no listing for a file uri, it's safe to assume that the uri is either a glob or a directory path. https://github.com/iterative/datachain/blob/e45a62f43cad47a162345f01e3a2f9a1fff8f47a/src/datachain/dataset.py#L94 I stumbled upon this when I was investigating why a unit-test was slow. Turns out it was hitting the remote with a fake s3 path for `is_file()`, which was failing and taking 4-5 seconds to run. https://github.com/iterative/datachain/blob/e45a62f43cad47a162345f01e3a2f9a1fff8f47a/tests/unit/test_dataset.py#L88-L99
1 parent 740be82 commit b1f2fc5

File tree

6 files changed

+16
-21
lines changed

6 files changed

+16
-21
lines changed

src/datachain/dataset.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,7 @@ def dataset_name(self) -> str:
9191
if self.type == DatasetDependencyType.DATASET:
9292
return self.name
9393

94-
list_dataset_name, _, _ = parse_listing_uri(self.name.strip("/"), None, {})
94+
list_dataset_name, _, _ = parse_listing_uri(self.name.strip("/"), {})
9595
assert list_dataset_name
9696
return list_dataset_name
9797

src/datachain/lib/listing.py

Lines changed: 8 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -103,17 +103,12 @@ def _isfile(client: "Client", path: str) -> bool:
103103
return False
104104

105105

106-
def parse_listing_uri(uri: str, cache, client_config) -> tuple[Optional[str], str, str]:
106+
def parse_listing_uri(uri: str, client_config) -> tuple[str, str, str]:
107107
"""
108108
Parsing uri and returns listing dataset name, listing uri and listing path
109109
"""
110110
client_config = client_config or {}
111-
client = Client.get_client(uri, cache, **client_config)
112111
storage_uri, path = Client.parse_url(uri)
113-
telemetry.log_param("client", client.PREFIX)
114-
115-
if not uri.endswith("/") and _isfile(client, uri):
116-
return None, f"{storage_uri}/{path.lstrip('/')}", path
117112
if uses_glob(path):
118113
lst_uri_path = posixpath.dirname(path)
119114
else:
@@ -157,13 +152,15 @@ def get_listing(
157152
client_config = catalog.client_config
158153

159154
client = Client.get_client(uri, cache, **client_config)
160-
ds_name, list_uri, list_path = parse_listing_uri(uri, cache, client_config)
161-
listing = None
155+
telemetry.log_param("client", client.PREFIX)
162156

163-
# if we don't want to use cached dataset (e.g. for a single file listing)
164-
if not ds_name:
165-
return None, list_uri, list_path, False
157+
# we don't want to use cached dataset (e.g. for a single file listing)
158+
if not uri.endswith("/") and _isfile(client, uri):
159+
storage_uri, path = Client.parse_url(uri)
160+
return None, f"{storage_uri}/{path.lstrip('/')}", path, False
166161

162+
ds_name, list_uri, list_path = parse_listing_uri(uri, client_config)
163+
listing = None
167164
listings = [
168165
ls for ls in catalog.listings() if not ls.is_expired and ls.contains(ds_name)
169166
]

tests/func/test_catalog.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,9 +17,7 @@
1717

1818

1919
def listing_stats(uri, catalog):
20-
list_dataset_name, _, _ = parse_listing_uri(
21-
uri, catalog.cache, catalog.client_config
22-
)
20+
list_dataset_name, _, _ = parse_listing_uri(uri, catalog.client_config)
2321
dataset = catalog.get_dataset(list_dataset_name)
2422
return catalog.dataset_stats(dataset.name, dataset.latest_version)
2523

tests/func/test_datachain.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -110,7 +110,7 @@ def test_from_storage_reindex_expired(tmp_dir, test_session):
110110
os.mkdir(tmp_dir)
111111
uri = tmp_dir.as_uri()
112112

113-
lst_ds_name = parse_listing_uri(uri, catalog.cache, catalog.client_config)[0]
113+
lst_ds_name = parse_listing_uri(uri, catalog.client_config)[0]
114114

115115
pd.DataFrame({"name": ["Alice", "Bob"]}).to_parquet(tmp_dir / "test1.parquet")
116116
assert DataChain.from_storage(uri, session=test_session).count() == 1
@@ -138,7 +138,7 @@ def test_from_storage_partials(cloud_test_catalog):
138138
catalog = session.catalog
139139

140140
def _list_dataset_name(uri: str) -> str:
141-
name = parse_listing_uri(uri, catalog.cache, catalog.client_config)[0]
141+
name = parse_listing_uri(uri, catalog.client_config)[0]
142142
assert name
143143
return name
144144

@@ -182,7 +182,7 @@ def test_from_storage_partials_with_update(cloud_test_catalog):
182182
catalog = session.catalog
183183

184184
def _list_dataset_name(uri: str) -> str:
185-
name = parse_listing_uri(uri, catalog.cache, catalog.client_config)[0]
185+
name = parse_listing_uri(uri, catalog.client_config)[0]
186186
assert name
187187
return name
188188

tests/func/test_datasets.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -862,7 +862,7 @@ def test_dataset_storage_dependencies(cloud_test_catalog, cloud_type, indirect):
862862
ds_name = "some_ds"
863863
DataChain.from_storage(uri, session=session).save(ds_name)
864864

865-
lst_ds_name, _, _ = parse_listing_uri(uri, catalog.cache, catalog.client_config)
865+
lst_ds_name, _, _ = parse_listing_uri(uri, catalog.client_config)
866866
lst_dataset = catalog.metastore.get_dataset(lst_ds_name)
867867

868868
assert [

tests/func/test_listing.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ def test_parse_listing_uri(cloud_test_catalog, cloud_type):
3838
ctc = cloud_test_catalog
3939
catalog = ctc.catalog
4040
dataset_name, listing_uri, listing_path = parse_listing_uri(
41-
f"{ctc.src_uri}/dogs", catalog.cache, catalog.client_config
41+
f"{ctc.src_uri}/dogs", catalog.client_config
4242
)
4343
assert dataset_name == f"lst__{ctc.src_uri}/dogs/"
4444
assert listing_uri == f"{ctc.src_uri}/dogs/"
@@ -57,7 +57,7 @@ def test_parse_listing_uri_with_glob(cloud_test_catalog):
5757
ctc = cloud_test_catalog
5858
catalog = ctc.catalog
5959
dataset_name, listing_uri, listing_path = parse_listing_uri(
60-
f"{ctc.src_uri}/dogs/*", catalog.cache, catalog.client_config
60+
f"{ctc.src_uri}/dogs/*", catalog.client_config
6161
)
6262
assert dataset_name == f"lst__{ctc.src_uri}/dogs/"
6363
assert listing_uri == f"{ctc.src_uri}/dogs"

0 commit comments

Comments
 (0)