Skip to content

Commit

Permalink
add exception catch in _initialize_workers to protect dlrover from exiting
Browse files (browse the repository at this point in the history)
  • Loading branch information
Ma Jie Yue committed Feb 5, 2025
1 parent 2ce0e02 commit 27074a1
Showing 1 changed file with 14 additions and 0 deletions.
14 changes: 14 additions & 0 deletions dlrover/python/elastic_agent/torch/training.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
import sys
import tempfile
import time
import traceback
import uuid
from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor
Expand Down Expand Up @@ -864,6 +865,7 @@ def _initialize_workers(self, worker_group):
f"training({self.__class__.__name__}) workers."
)
start_pending = 0
err_cnt = 0
pend_timeout = float(
self._config.rdzv_configs.get("pend_timeout", "inf")
)
Expand All @@ -885,6 +887,18 @@ def _initialize_workers(self, worker_group):
time.sleep(JobConstant.TRAINING_AGENT_LOOP_DEFAULT_INTERVAL)
if time.time() - start_pending > pend_timeout:
raise TimeoutError("Timeout to wait for new nodes.")
except Exception as e:
err_cnt += 1
if err_cnt < 3:
stack_trace = traceback.format_exc()
logger.error(

Check warning on line 894 in dlrover/python/elastic_agent/torch/training.py

View check run for this annotation

Codecov / codecov/patch

dlrover/python/elastic_agent/torch/training.py#L890-L894

Added lines #L890 - L894 were not covered by tests
f"Unexpected exception in _initialize_workers: {e}"
)
logger.error(f"Stack backtrace:\n {stack_trace}")
self._stop_workers(worker_group)
continue

Check warning on line 899 in dlrover/python/elastic_agent/torch/training.py

View check run for this annotation

Codecov / codecov/patch

dlrover/python/elastic_agent/torch/training.py#L897-L899

Added lines #L897 - L899 were not covered by tests
else:
raise e

Check warning on line 901 in dlrover/python/elastic_agent/torch/training.py

View check run for this annotation

Codecov / codecov/patch

dlrover/python/elastic_agent/torch/training.py#L901

Added line #L901 was not covered by tests
else:
logger.info("Finish initializing training workers.")
break
Expand Down

0 comments on commit 27074a1

Please sign in to comment.