Skip to content

Commit

Permalink
no longer storing xp in submitit target to avoid pickling errors
Browse files Browse the repository at this point in the history
  • Loading branch information
adefossez committed Oct 5, 2023
1 parent 5bc96b6 commit 1d6be76
Show file tree
Hide file tree
Showing 3 changed files with 8 additions and 5 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@ job arrays.
Adding possibility to force the initialization of distributed even when world size=1 by setting
the `DORA_FORCE_DISTRIB=1` env variable. Always export LOCAL_RANK when running with `dora run`.

No longer store the XP in the _SubmitItTarget in order to avoid potential pickling errors.

## [0.1.12] - 2023-05-23

Fixed bug with PL (Thanks @kingjr).
Expand Down
2 changes: 1 addition & 1 deletion dora/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@
__pdoc__ = {}
__pdoc__['tests'] = False

__version__ = "0.1.13a4"
__version__ = "0.1.13a5"

# flake8: noqa
from .explore import Explorer, Launcher
Expand Down
9 changes: 5 additions & 4 deletions dora/shep.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
from .conf import SlurmConfig, SubmitRules
from .main import DecoratedMain
from .utils import try_load
from .xp import XP, _get_sig
from .xp import XP, _get_sig, get_xp


logger = logging.getLogger(__name__)
Expand All @@ -41,7 +41,7 @@ def register_preemption_callaback(callback: PreemptionCallback):
class _SubmitItTarget:
def __call__(self, main: DecoratedMain, argv: tp.Sequence[str], requeue: bool = True):
from .distrib import get_distrib_spec # this will import torch which can be quite slow.
self.xp = main.get_xp(argv)
xp = main.get_xp(argv)
self.requeue = requeue
spec = get_distrib_spec()
# We export the RANK as it can be used to customize logging early on
Expand All @@ -60,8 +60,9 @@ def checkpoint(self, *args, **kwargs):

if get_distrib_spec().rank == 0:
# cleanup rendezvous file on requeue, otherwise things will fail.
if self.xp.rendezvous_file.exists():
self.xp.rendezvous_file.unlink()
xp = get_xp()
if xp.rendezvous_file.exists():
xp.rendezvous_file.unlink()
return submitit.helpers.DelayedSubmission(self, *args, **kwargs)


Expand Down

0 comments on commit 1d6be76

Please sign in to comment.