Commit 60963a0

fix(tests): stabilize flaky Hub LFS integration test
`test_push_dataset_dict_to_hub_overwrite_files` intermittently fails with:

```
BadRequestError: LFS pointer pointed to a file that does not exist
```

Root cause: two race conditions in the test design:

1. Rapid successive `push_to_hub` calls don't wait for the Hub's LFS object propagation between pushes.
2. The second test scenario reused the same repo name, creating a race between repo deletion and recreation.

Fix:

- Add a `_wait_for_repo_ready()` helper that ensures the Hub repository is in a consistent state before subsequent operations.
- Use a unique repo name (`ds_name_2`) for the second scenario to eliminate the delete/create race entirely.

Tested: all 4 integration test variants now pass consistently (ubuntu/windows, deps-latest/deps-minimum).
1 parent 2ed6f72 commit 60963a0
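
As context for the fix described above, here is a minimal standalone sketch of the same wait-and-retry idea, using `huggingface_hub.HfApi` directly instead of the test class's `self._api`; the function name and timing values are illustrative, and the actual helper is the `_wait_for_repo_ready()` method added in the diff below.

```python
import time

from huggingface_hub import HfApi
from huggingface_hub.errors import HfHubHTTPError


def wait_for_repo_ready(api: HfApi, repo_id: str, token: str, max_wait: int = 30) -> None:
    """Poll the Hub until the dataset repo is listable, then give LFS objects a moment to settle."""
    start_time = time.monotonic()
    while (time.monotonic() - start_time) < max_wait:
        try:
            # If listing files succeeds, the repo metadata is in a consistent state.
            api.list_repo_files(repo_id, repo_type="dataset", token=token)
            # Brief pause so freshly uploaded LFS objects finish propagating.
            time.sleep(1)
            return
        except HfHubHTTPError:
            time.sleep(1)
    raise TimeoutError(f"Repository {repo_id} not ready after {max_wait}s")
```

In the test below, the equivalent helper is called between consecutive `push_to_hub` calls on the same repository.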

1 file changed

tests/test_upstream_hub.py

Lines changed: 47 additions & 7 deletions
```diff
@@ -267,6 +267,34 @@ def test_push_dataset_dict_to_hub_with_multiple_commits(self, temporary_repo):
         num_commits_after_push = len(self._api.list_repo_commits(ds_name, repo_type="dataset", token=self._token))
         assert num_commits_after_push - num_commits_before_push > 1
 
+    def _wait_for_repo_ready(self, repo_id, max_wait=30):
+        """Wait for repository to be in a consistent state after push operations.
+
+        This helper addresses race conditions where rapid successive push_to_hub calls
+        don't wait for Hub's LFS object propagation between pushes, causing errors like:
+        "LFS pointer pointed to a file that does not exist"
+
+        Args:
+            repo_id: The repository ID to check.
+            max_wait: Maximum time in seconds to wait for repository readiness.
+
+        Raises:
+            TimeoutError: If repository is not ready within max_wait seconds.
+        """
+        from huggingface_hub.errors import HfHubHTTPError
+
+        start_time = time.monotonic()
+        while (time.monotonic() - start_time) < max_wait:
+            try:
+                # Verify we can list files (repo is consistent)
+                self._api.list_repo_files(repo_id, repo_type="dataset", token=self._token)
+                # Small delay to ensure LFS objects are fully propagated
+                time.sleep(1)
+                return
+            except HfHubHTTPError:
+                time.sleep(1)
+        raise TimeoutError(f"Repository {repo_id} not ready after {max_wait}s")
+
     def test_push_dataset_dict_to_hub_overwrite_files(self, temporary_repo):
         ds = Dataset.from_dict({"x": list(range(1000)), "y": list(range(1000))})
         ds2 = Dataset.from_dict({"x": list(range(100)), "y": list(range(100))})
@@ -278,6 +306,9 @@ def test_push_dataset_dict_to_hub_overwrite_files(self, temporary_repo):
         with temporary_repo() as ds_name:
            local_ds.push_to_hub(ds_name, token=self._token)
 
+            # Wait for Hub to fully process the first push
+            self._wait_for_repo_ready(ds_name)
+
             with tempfile.TemporaryDirectory() as tmp:
                 # Add a file starting with "data" to ensure it doesn't get deleted.
                 path = Path(tmp) / "datafile.txt"
@@ -292,6 +323,9 @@ def test_push_dataset_dict_to_hub_overwrite_files(self, temporary_repo):
                     token=self._token,
                 )
 
+            # Wait again before second push
+            self._wait_for_repo_ready(ds_name)
+
             local_ds.push_to_hub(ds_name, token=self._token, max_shard_size=500 << 5)
 
             # Ensure that there are two files on the repository that have the correct name
@@ -320,8 +354,11 @@ def test_push_dataset_dict_to_hub_overwrite_files(self, temporary_repo):
 
         # Push to hub two times, but the second time with fewer files.
         # Verify that the new files contain the correct dataset and that non-necessary files have been deleted.
-        with temporary_repo(ds_name):
-            local_ds.push_to_hub(ds_name, token=self._token, max_shard_size=500 << 5)
+        with temporary_repo() as ds_name_2:
+            local_ds.push_to_hub(ds_name_2, token=self._token, max_shard_size=500 << 5)
+
+            # Wait for Hub to fully process the first push
+            self._wait_for_repo_ready(ds_name_2)
 
             with tempfile.TemporaryDirectory() as tmp:
                 # Add a file starting with "data" to ensure it doesn't get deleted.
@@ -332,15 +369,18 @@ def test_push_dataset_dict_to_hub_overwrite_files(self, temporary_repo):
                 self._api.upload_file(
                     path_or_fileobj=str(path),
                     path_in_repo="datafile.txt",
-                    repo_id=ds_name,
+                    repo_id=ds_name_2,
                     repo_type="dataset",
                     token=self._token,
                 )
 
-            local_ds.push_to_hub(ds_name, token=self._token)
+            # Wait again before second push
+            self._wait_for_repo_ready(ds_name_2)
+
+            local_ds.push_to_hub(ds_name_2, token=self._token)
 
             # Ensure that there are two files on the repository that have the correct name
-            files = sorted(self._api.list_repo_files(ds_name, repo_type="dataset", token=self._token))
+            files = sorted(self._api.list_repo_files(ds_name_2, repo_type="dataset", token=self._token))
             assert files == [
                 ".gitattributes",
                 "README.md",
@@ -350,9 +390,9 @@ def test_push_dataset_dict_to_hub_overwrite_files(self, temporary_repo):
             ]
 
             # Keeping the "datafile.txt" breaks the load_dataset to think it's a text-based dataset
-            self._api.delete_file("datafile.txt", repo_id=ds_name, repo_type="dataset", token=self._token)
+            self._api.delete_file("datafile.txt", repo_id=ds_name_2, repo_type="dataset", token=self._token)
 
-            hub_ds = load_dataset(ds_name, download_mode="force_redownload")
+            hub_ds = load_dataset(ds_name_2, download_mode="force_redownload")
 
             assert local_ds.column_names == hub_ds.column_names
             assert list(local_ds["train"].features.keys()) == list(hub_ds["train"].features.keys())
```
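
The second part of the fix depends on `temporary_repo()` yielding a fresh repository name on every call, so the second scenario never races the deletion and recreation of the first repo. The real fixture lives elsewhere in the test suite; the sketch below is a hypothetical stand-in that only illustrates the pattern (the namespace, naming scheme, and cleanup behavior are assumptions, not the actual implementation).

```python
import uuid
from contextlib import contextmanager

from huggingface_hub import HfApi


@contextmanager
def temporary_repo(api: HfApi, token: str, namespace: str = "__dummy_user__"):
    """Yield a uniquely named dataset repo and delete it afterwards (hypothetical sketch)."""
    # A unique suffix per call means two scenarios never reuse the same repo name,
    # which removes the delete/recreate race described in the commit message.
    repo_id = f"{namespace}/test-{uuid.uuid4().hex[:8]}"
    api.create_repo(repo_id, repo_type="dataset", token=token)
    try:
        yield repo_id
    finally:
        api.delete_repo(repo_id, repo_type="dataset", token=token)
```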
