File tree Expand file tree Collapse file tree 2 files changed +4
-1
lines changed Expand file tree Collapse file tree 2 files changed +4
-1
lines changed Original file line number Diff line number Diff line change @@ -104,3 +104,4 @@ HF_HUB_ETAG_TIMEOUT=500
104
104
| ` ZERO_BAND_EDM_HEARTBEAT_INTERVAL_SECONDS ` | Interval in seconds between heartbeats | ` 2 ` |
105
105
| ` ZERO_BAND_EDM_HEARTBEAT_TIMEOUT_SECONDS ` | Time in seconds after which a node is considered dead if no heartbeat is received | ` 10 ` |
106
106
| ` ZERO_BAND_LIVE_RECO_PORT ` | Base port number for the live recovery server; each node advertises this value plus its global rank | ` 8000 ` |
107
+ | ` ZERO_BAND_LIVE_RECO_ADDR ` | Address (IP or hostname) that each node advertises for its live recovery server | ` localhost ` |
Original file line number Diff line number Diff line change 22
22
23
23
LIVE_RECO_PORT = int (os .environ .get ("ZERO_BAND_LIVE_RECO_PORT" , "8000" ))
24
24
25
+ LIVE_RECO_ADDR = os .environ .get ("ZERO_BAND_LIVE_RECO_ADDR" , "localhost" )
26
+
25
27
26
28
class ElasticDeviceMesh :
27
29
"""A class to manage the process groups for elastic training without restarts.
@@ -395,7 +397,7 @@ def init_live_endpoint(self, store: dist.Store):
395
397
return
396
398
self .store = dist .PrefixStore ("live_reco_adress" , store )
397
399
port = LIVE_RECO_PORT + self .world_info .global_rank
398
- self .store .set (f"adress_{ self .world_info .global_unique_id } " , f"localhost :{ port } " )
400
+ self .store .set (f"adress_{ self .world_info .global_unique_id } " , f"{ LIVE_RECO_ADDR } :{ port } " )
399
401
400
402
def get_adress (self , rank : int ) -> str :
401
403
"""Get the live recovery adress for a given rank."""
You can’t perform that action at this time.
0 commit comments