Skip to content

Commit b7e5eac

Browse files
Harmonize retry behavior for metadata fetch and HfFileSystem (#3583)
* harmonize retry behavior * nit * retry mechanism for 5xx errors as well * fix quality * nit * style * reuse constants
1 parent b2215d1 commit b7e5eac

File tree

3 files changed

+76
-50
lines changed

3 files changed

+76
-50
lines changed

src/huggingface_hub/file_download.py

Lines changed: 59 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,13 @@
3939
tqdm,
4040
validate_hf_hub_args,
4141
)
42-
from .utils._http import _adjust_range_header, http_backoff, http_stream_backoff
42+
from .utils._http import (
43+
_DEFAULT_RETRY_ON_EXCEPTIONS,
44+
_DEFAULT_RETRY_ON_STATUS_CODES,
45+
_adjust_range_header,
46+
http_backoff,
47+
http_stream_backoff,
48+
)
4349
from .utils._runtime import is_xet_available
4450
from .utils._typing import HTTP_METHOD_T
4551
from .utils.sha import sha_fileobj
@@ -267,30 +273,38 @@ def hf_hub_url(
267273
return url
268274

269275

270-
def _httpx_follow_relative_redirects(method: HTTP_METHOD_T, url: str, **httpx_kwargs) -> httpx.Response:
276+
def _httpx_follow_relative_redirects(
277+
method: HTTP_METHOD_T, url: str, *, retry_on_errors: bool = False, **httpx_kwargs
278+
) -> httpx.Response:
271279
"""Perform an HTTP request with backoff and follow relative redirects only.
272280
273281
This is useful to follow a redirection to a renamed repository without following redirection to a CDN.
274282
275-
A backoff mechanism retries the HTTP call on 5xx errors and network errors.
283+
When `retry_on_errors=True`, a backoff mechanism retries the HTTP call on errors (429, 5xx, timeout, network errors).
276284
277285
Args:
278286
method (`str`):
279287
HTTP method, such as 'GET' or 'HEAD'.
280288
url (`str`):
281289
The URL of the resource to fetch.
290+
retry_on_errors (`bool`, *optional*, defaults to `False`):
291+
Whether to retry on errors. If False, no retry is performed (fast fallback to local cache).
292+
If True, uses default retry behavior (429, 5xx, timeout, network errors).
282293
**httpx_kwargs (`dict`, *optional*):
283294
Params to pass to `httpx.request`.
284295
"""
296+
# if `retry_on_errors=False`, disable all retries for fast fallback to cache
297+
no_retry_kwargs: dict[str, Any] = (
298+
{} if retry_on_errors else {"retry_on_exceptions": (), "retry_on_status_codes": ()}
299+
)
300+
285301
while True:
286-
# Make the request
287302
response = http_backoff(
288303
method=method,
289304
url=url,
290305
**httpx_kwargs,
291306
follow_redirects=False,
292-
retry_on_exceptions=(),
293-
retry_on_status_codes=(429,),
307+
**no_retry_kwargs,
294308
)
295309
hf_raise_for_status(response)
296310

@@ -1134,9 +1148,11 @@ def _hf_hub_download_to_cache_dir(
11341148
if not force_download:
11351149
return pointer_path
11361150

1137-
# No local file found, retry with longer timeout if it was a timeout error
1138-
if isinstance(head_call_error, httpx.TimeoutException):
1139-
logger.info("Metadata fetch timed out and no local file found. Retrying with longer timeout..")
1151+
if isinstance(head_call_error, _DEFAULT_RETRY_ON_EXCEPTIONS) or (
1152+
isinstance(head_call_error, HfHubHTTPError)
1153+
and head_call_error.response.status_code in _DEFAULT_RETRY_ON_STATUS_CODES
1154+
):
1155+
logger.info("No local file found. Retrying..")
11401156
(url_to_download, etag, commit_hash, expected_size, xet_file_data, head_call_error) = (
11411157
_get_metadata_or_catch_error(
11421158
repo_id=repo_id,
@@ -1150,6 +1166,7 @@ def _hf_hub_download_to_cache_dir(
11501166
local_files_only=local_files_only,
11511167
storage_folder=storage_folder,
11521168
relative_filename=relative_filename,
1169+
retry_on_errors=True,
11531170
)
11541171
)
11551172

@@ -1323,22 +1340,26 @@ def _hf_hub_download_to_local_dir(
13231340
)
13241341
if not force_download:
13251342
return local_path
1326-
elif not force_download and isinstance(head_call_error, httpx.TimeoutException):
1327-
# No local file found, retry with longer timeout if it was a timeout error
1328-
logger.info("Metadata fetch timed out and no local file found. Retrying with longer timeout...")
1329-
(url_to_download, etag, commit_hash, expected_size, xet_file_data, head_call_error) = (
1330-
_get_metadata_or_catch_error(
1331-
repo_id=repo_id,
1332-
filename=filename,
1333-
repo_type=repo_type,
1334-
revision=revision,
1335-
endpoint=endpoint,
1336-
etag_timeout=_ETAG_RETRY_TIMEOUT,
1337-
headers=headers,
1338-
token=token,
1339-
local_files_only=local_files_only,
1343+
elif not force_download:
1344+
if isinstance(head_call_error, _DEFAULT_RETRY_ON_EXCEPTIONS) or (
1345+
isinstance(head_call_error, HfHubHTTPError)
1346+
and head_call_error.response.status_code in _DEFAULT_RETRY_ON_STATUS_CODES
1347+
):
1348+
logger.info("No local file found. Retrying..")
1349+
(url_to_download, etag, commit_hash, expected_size, xet_file_data, head_call_error) = (
1350+
_get_metadata_or_catch_error(
1351+
repo_id=repo_id,
1352+
filename=filename,
1353+
repo_type=repo_type,
1354+
revision=revision,
1355+
endpoint=endpoint,
1356+
etag_timeout=_ETAG_RETRY_TIMEOUT,
1357+
headers=headers,
1358+
token=token,
1359+
local_files_only=local_files_only,
1360+
retry_on_errors=True,
1361+
)
13401362
)
1341-
)
13421363

13431364
# If still error, raise
13441365
if head_call_error is not None:
@@ -1547,6 +1568,7 @@ def get_hf_file_metadata(
15471568
user_agent: Union[dict, str, None] = None,
15481569
headers: Optional[dict[str, str]] = None,
15491570
endpoint: Optional[str] = None,
1571+
retry_on_errors: bool = False,
15501572
) -> HfFileMetadata:
15511573
"""Fetch metadata of a file versioned on the Hub for a given url.
15521574
@@ -1571,6 +1593,9 @@ def get_hf_file_metadata(
15711593
Additional headers to be sent with the request.
15721594
endpoint (`str`, *optional*):
15731595
Endpoint of the Hub. Defaults to <https://huggingface.co>.
1596+
retry_on_errors (`bool`, *optional*, defaults to `False`):
1597+
Whether to retry on errors (429, 5xx, timeout, network errors).
1598+
If False, no retry for fast fallback to local cache.
15741599
15751600
Returns:
15761601
A [`HfFileMetadata`] object containing metadata such as location, etag, size and
@@ -1586,7 +1611,9 @@ def get_hf_file_metadata(
15861611
hf_headers["Accept-Encoding"] = "identity" # prevent any compression => we want to know the real size of the file
15871612

15881613
# Retrieve metadata
1589-
response = _httpx_follow_relative_redirects(method="HEAD", url=url, headers=hf_headers, timeout=timeout)
1614+
response = _httpx_follow_relative_redirects(
1615+
method="HEAD", url=url, headers=hf_headers, timeout=timeout, retry_on_errors=retry_on_errors
1616+
)
15901617
hf_raise_for_status(response)
15911618

15921619
# Return
@@ -1619,6 +1646,7 @@ def _get_metadata_or_catch_error(
16191646
local_files_only: bool,
16201647
relative_filename: Optional[str] = None, # only used to store `.no_exists` in cache
16211648
storage_folder: Optional[str] = None, # only used to store `.no_exists` in cache
1649+
retry_on_errors: bool = False,
16221650
) -> Union[
16231651
# Either an exception is caught and returned
16241652
tuple[None, None, None, None, None, Exception],
@@ -1661,7 +1689,12 @@ def _get_metadata_or_catch_error(
16611689
try:
16621690
try:
16631691
metadata = get_hf_file_metadata(
1664-
url=url, timeout=etag_timeout, headers=headers, token=token, endpoint=endpoint
1692+
url=url,
1693+
timeout=etag_timeout,
1694+
headers=headers,
1695+
token=token,
1696+
endpoint=endpoint,
1697+
retry_on_errors=retry_on_errors,
16651698
)
16661699
except RemoteEntryNotFoundError as http_error:
16671700
if storage_folder is not None and relative_filename is not None:

src/huggingface_hub/hf_file_system.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1205,7 +1205,6 @@ def _open_connection(self):
12051205
"GET",
12061206
url,
12071207
headers=headers,
1208-
retry_on_status_codes=(500, 502, 503, 504),
12091208
timeout=constants.HF_HUB_DOWNLOAD_TIMEOUT,
12101209
)
12111210
)

src/huggingface_hub/utils/_http.py

Lines changed: 17 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,6 @@
2424
import uuid
2525
from contextlib import contextmanager
2626
from dataclasses import dataclass
27-
from http import HTTPStatus
2827
from shlex import quote
2928
from typing import Any, Callable, Generator, Mapping, Optional, Union
3029

@@ -338,18 +337,19 @@ def close_session() -> None:
338337
os.register_at_fork(after_in_child=close_session)
339338

340339

340+
_DEFAULT_RETRY_ON_EXCEPTIONS: tuple[type[Exception], ...] = (httpx.TimeoutException, httpx.NetworkError)
341+
_DEFAULT_RETRY_ON_STATUS_CODES: tuple[int, ...] = (429, 500, 502, 503, 504)
342+
343+
341344
def _http_backoff_base(
342345
method: HTTP_METHOD_T,
343346
url: str,
344347
*,
345348
max_retries: int = 5,
346349
base_wait_time: float = 1,
347350
max_wait_time: float = 8,
348-
retry_on_exceptions: Union[type[Exception], tuple[type[Exception], ...]] = (
349-
httpx.TimeoutException,
350-
httpx.NetworkError,
351-
),
352-
retry_on_status_codes: Union[int, tuple[int, ...]] = HTTPStatus.SERVICE_UNAVAILABLE,
351+
retry_on_exceptions: Union[type[Exception], tuple[type[Exception], ...]] = _DEFAULT_RETRY_ON_EXCEPTIONS,
352+
retry_on_status_codes: Union[int, tuple[int, ...]] = _DEFAULT_RETRY_ON_STATUS_CODES,
353353
stream: bool = False,
354354
**kwargs,
355355
) -> Generator[httpx.Response, None, None]:
@@ -445,11 +445,8 @@ def http_backoff(
445445
max_retries: int = 5,
446446
base_wait_time: float = 1,
447447
max_wait_time: float = 8,
448-
retry_on_exceptions: Union[type[Exception], tuple[type[Exception], ...]] = (
449-
httpx.TimeoutException,
450-
httpx.NetworkError,
451-
),
452-
retry_on_status_codes: Union[int, tuple[int, ...]] = HTTPStatus.SERVICE_UNAVAILABLE,
448+
retry_on_exceptions: Union[type[Exception], tuple[type[Exception], ...]] = _DEFAULT_RETRY_ON_EXCEPTIONS,
449+
retry_on_status_codes: Union[int, tuple[int, ...]] = _DEFAULT_RETRY_ON_STATUS_CODES,
453450
**kwargs,
454451
) -> httpx.Response:
455452
"""Wrapper around httpx to retry calls on an endpoint, with exponential backoff.
@@ -478,9 +475,9 @@ def http_backoff(
478475
retry_on_exceptions (`type[Exception]` or `tuple[type[Exception]]`, *optional*):
479476
Define which exceptions must be caught to retry the request. Can be a single type or a tuple of types.
480477
By default, retry on `httpx.TimeoutException` and `httpx.NetworkError`.
481-
retry_on_status_codes (`int` or `tuple[int]`, *optional*, defaults to `503`):
482-
Define on which status codes the request must be retried. By default, only
483-
HTTP 503 Service Unavailable is retried.
478+
retry_on_status_codes (`int` or `tuple[int]`, *optional*, defaults to `(429, 500, 502, 503, 504)`):
479+
Define on which status codes the request must be retried. By default, retries
480+
on rate limit (429) and server errors (5xx).
484481
**kwargs (`dict`, *optional*):
485482
kwargs to pass to `httpx.request`.
486483
@@ -529,11 +526,8 @@ def http_stream_backoff(
529526
max_retries: int = 5,
530527
base_wait_time: float = 1,
531528
max_wait_time: float = 8,
532-
retry_on_exceptions: Union[type[Exception], tuple[type[Exception], ...]] = (
533-
httpx.TimeoutException,
534-
httpx.NetworkError,
535-
),
536-
retry_on_status_codes: Union[int, tuple[int, ...]] = HTTPStatus.SERVICE_UNAVAILABLE,
529+
retry_on_exceptions: Union[type[Exception], tuple[type[Exception], ...]] = _DEFAULT_RETRY_ON_EXCEPTIONS,
530+
retry_on_status_codes: Union[int, tuple[int, ...]] = _DEFAULT_RETRY_ON_STATUS_CODES,
537531
**kwargs,
538532
) -> Generator[httpx.Response, None, None]:
539533
"""Wrapper around httpx to retry calls on an endpoint, with exponential backoff.
@@ -561,10 +555,10 @@ def http_stream_backoff(
561555
Maximum duration (in seconds) to wait before retrying.
562556
retry_on_exceptions (`type[Exception]` or `tuple[type[Exception]]`, *optional*):
563557
Define which exceptions must be caught to retry the request. Can be a single type or a tuple of types.
564-
By default, retry on `httpx.Timeout` and `httpx.NetworkError`.
565-
retry_on_status_codes (`int` or `tuple[int]`, *optional*, defaults to `503`):
566-
Define on which status codes the request must be retried. By default, only
567-
HTTP 503 Service Unavailable is retried.
558+
By default, retry on `httpx.TimeoutException` and `httpx.NetworkError`.
559+
retry_on_status_codes (`int` or `tuple[int]`, *optional*, defaults to `(429, 500, 502, 503, 504)`):
560+
Define on which status codes the request must be retried. By default, retries
561+
on rate limit (429) and server errors (5xx).
568562
**kwargs (`dict`, *optional*):
569563
kwargs to pass to `httpx.request`.
570564

0 commit comments

Comments
 (0)