From ce6a72111078a0ee29a780a677350a37f6c92f62 Mon Sep 17 00:00:00 2001 From: xiaohanzhangcmu Date: Wed, 12 Nov 2025 11:03:45 -0800 Subject: [PATCH] increase dist timeout --- composer/utils/dist.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/composer/utils/dist.py b/composer/utils/dist.py index b5832740a6..9da65aee8a 100644 --- a/composer/utils/dist.py +++ b/composer/utils/dist.py @@ -498,7 +498,7 @@ def is_initialized(): return dist.is_initialized() -def initialize_dist(device: Optional[Union[str, Device]] = None, timeout: float = 300.0) -> None: +def initialize_dist(device: Optional[Union[str, Device]] = None, timeout: float = 1800.0) -> None: """Initialize the default PyTorch distributed process group. This function assumes that the following environment variables are set: @@ -521,7 +521,7 @@ def initialize_dist(device: Optional[Union[str, Device]] = None, timeout: float interpreted. Either a string corresponding to a device (one of ``'cpu'``, ``'gpu'``, ``'mps'``, or ``'tpu'``) or a :class:`.Device`. (default: ``None``) timeout (float, optional): The timeout for operations executed against the process - group, expressed in seconds. (default: ``300.0``). + group, expressed in seconds. (default: ``1800.0``). """ # If device is string, get corresponding composer.devices.Device object device_obj = get_device(device)