diff --git a/CHANGELOG.md b/CHANGELOG.md index 63e0a69..26b7e04 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,8 @@ job arrays. Adding possiblity to force the initialization of distributed even when world size=1 by setting the `DORA_FORCE_DISTRIB=1` env variable. Always export LOCAL_RANK when running with `dora run`. +No longer store the XP in the _SubmitItTarget in order to avoid potential pickling errors. + ## [0.1.12] - 2023-05-23 Fixed bug with PL (Thanks @kingjr). diff --git a/dora/__init__.py b/dora/__init__.py index 37730e8..05f6f9d 100644 --- a/dora/__init__.py +++ b/dora/__init__.py @@ -60,7 +60,7 @@ __pdoc__ = {} __pdoc__['tests'] = False -__version__ = "0.1.13a4" +__version__ = "0.1.13a5" # flake8: noqa from .explore import Explorer, Launcher diff --git a/dora/shep.py b/dora/shep.py index aae5f57..abede17 100644 --- a/dora/shep.py +++ b/dora/shep.py @@ -24,7 +24,7 @@ from .conf import SlurmConfig, SubmitRules from .main import DecoratedMain from .utils import try_load -from .xp import XP, _get_sig +from .xp import XP, _get_sig, get_xp logger = logging.getLogger(__name__) @@ -41,7 +41,7 @@ def register_preemption_callaback(callback: PreemptionCallback): class _SubmitItTarget: def __call__(self, main: DecoratedMain, argv: tp.Sequence[str], requeue: bool = True): from .distrib import get_distrib_spec # this will import torch which can be quite slow. - self.xp = main.get_xp(argv) + xp = main.get_xp(argv) self.requeue = requeue spec = get_distrib_spec() # We export the RANK as it can be used to customize logging early on @@ -60,8 +60,9 @@ def checkpoint(self, *args, **kwargs): if get_distrib_spec().rank == 0: # cleanup rendezvous file on requeue, otherwise things will fail. - if self.xp.rendezvous_file.exists(): - self.xp.rendezvous_file.unlink() + xp = get_xp() + if xp.rendezvous_file.exists(): + xp.rendezvous_file.unlink() return submitit.helpers.DelayedSubmission(self, *args, **kwargs)