File tree Expand file tree Collapse file tree 2 files changed +8
-3
lines changed Expand file tree Collapse file tree 2 files changed +8
-3
lines changed Original file line number Diff line number Diff line change @@ -103,3 +103,4 @@ HF_HUB_ETAG_TIMEOUT=500
103
103
| ` ZERO_BAND_GLOBAL_STORE_POLLING_INTERVAL_SECONDS ` | Number of seconds between polls to the store when waiting for values | ` 0.1 ` |
104
104
| ` ZERO_BAND_EDM_HEARTBEAT_INTERVAL_SECONDS ` | Interval in seconds between heartbeats | ` 2 ` |
105
105
| ` ZERO_BAND_EDM_HEARTBEAT_TIMEOUT_SECONDS ` | Time in seconds after which a node is considered dead if no heartbeat is received | ` 10 ` |
106
+ | ` ZERO_BAND_LIVE_RECO_PORT ` | Base port number for the live recovery server; each rank serves on this port plus its global rank | ` 8000 ` |
Original file line number Diff line number Diff line change 34
34
35
35
SHM_PATH = "/dev/shm/zeroband"
36
36
37
+ ZERO_BAND_LIVE_RECO_PORT = int (os .environ .get ("ZERO_BAND_LIVE_RECO_PORT" , "8000" ))
38
+
37
39
38
40
@dataclass
39
41
class TrainingProgress (Stateful ):
@@ -135,7 +137,9 @@ def __init__(
135
137
self .async_save_process : list [multiprocessing .Process ] = []
136
138
137
139
if live_ckpt_server :
138
- self .live_server = CkptLiveServer (port = 8000 + self .world_info .global_rank , ckpt_path = SHM_PATH )
140
+ self .live_server = CkptLiveServer (
141
+ port = ZERO_BAND_LIVE_RECO_PORT + self .world_info .global_rank , ckpt_path = SHM_PATH
142
+ )
139
143
140
144
def _init_state (self ):
141
145
# states can only be Stateful objects, hence we need to wrap Model and Optimizer
@@ -288,11 +292,11 @@ def download_and_load_ckpt_from_peers(self):
288
292
if self .world_info .local_rank == 0 :
289
293
# only local rank 0 downloads the ckpt
290
294
wget (
291
- source = f"http://localhost:{ 8000 + dest_rank } /latest/diloco_{ dest_rank } " ,
295
+ source = f"http://localhost:{ ZERO_BAND_LIVE_RECO_PORT + dest_rank } /latest/diloco_{ dest_rank } " ,
292
296
destination = path ,
293
297
)
294
298
wget (
295
- source = f"http://localhost:{ 8000 + dest_rank } /latest/diloco_{ dest_rank } /.metadata" ,
299
+ source = f"http://localhost:{ ZERO_BAND_LIVE_RECO_PORT + dest_rank } /latest/diloco_{ dest_rank } /.metadata" ,
296
300
destination = path ,
297
301
)
298
302
dist .barrier ()
You can’t perform that action at this time.
0 commit comments