diff --git a/dlrover/python/elastic_agent/torch/training.py b/dlrover/python/elastic_agent/torch/training.py index 10743e0dd..ed5ae551f 100644 --- a/dlrover/python/elastic_agent/torch/training.py +++ b/dlrover/python/elastic_agent/torch/training.py @@ -20,6 +20,7 @@ import sys import tempfile import time +import traceback import uuid from collections import defaultdict from concurrent.futures import ThreadPoolExecutor @@ -864,6 +865,7 @@ def _initialize_workers(self, worker_group): f"training({self.__class__.__name__}) workers." ) start_pending = 0 + err_cnt = 0 pend_timeout = float( self._config.rdzv_configs.get("pend_timeout", "inf") ) @@ -885,6 +887,18 @@ def _initialize_workers(self, worker_group): time.sleep(JobConstant.TRAINING_AGENT_LOOP_DEFAULT_INTERVAL) if time.time() - start_pending > pend_timeout: raise TimeoutError("Timeout to wait for new nodes.") + except Exception as e: + err_cnt += 1 + if err_cnt < 3: + stack_trace = traceback.format_exc() + logger.error( + f"Unexpected exception in _initialize_workers: {e}" + ) + logger.error(f"Stack backtrace:\n {stack_trace}") + self._stop_workers(worker_group) + continue + else: + raise e else: logger.info("Finish initializing training workers.") break