Skip to content

Commit 38e833b

Browse files
committed
use deathrattle to fast-track leaving
1 parent ddf39cb commit 38e833b

File tree

1 file changed

+12
-4
lines changed

1 file changed

+12
-4
lines changed

src/zeroband/comms.py

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
MAX_JOINERS = 100 # Maximum number of nodes that can join in a single reinit
1616
MAX_LEAVERS = 100 # Maximum number of nodes that can leave in a single reinit
1717
HEARTBEAT_INTERVAL = 2 # Interval in seconds between heartbeats
18-
HEARTBEAT_TIMEOUT = 6 # Time in seconds after which a node is considered dead if no heartbeat is received
18+
HEARTBEAT_TIMEOUT = 10 # Time in seconds after which a node is considered dead if no heartbeat is received
1919

2020

2121
class ElasticDeviceMesh:
@@ -212,21 +212,29 @@ def _start_heartbeat(self):
212212

213213
def _stop_heartbeat(self):
214214
"""Stop the heartbeat process."""
215+
self._send_deathrattle()
215216
if hasattr(self, "_heartbeat_stop_event"):
216217
self._heartbeat_stop_event.set()
217218
self._heartbeat_process.join()
218219

219220
def _heartbeat_loop(self, stop_event):
220221
"""Continuously send heartbeats until stopped."""
221-
while not stop_event.is_set():
222-
self._send_heartbeat()
223-
time.sleep(HEARTBEAT_INTERVAL)
222+
try:
223+
while not stop_event.is_set():
224+
self._send_heartbeat()
225+
time.sleep(HEARTBEAT_INTERVAL)
226+
finally:
227+
self._send_deathrattle()
224228

225229
def _send_heartbeat(self):
226230
"""Send a heartbeat to the global store."""
227231
current_time = time.time()
228232
self.global_store.set(f"heartbeat_{self.world_info.global_rank}", str(current_time))
229233

234+
def _send_deathrattle(self):
235+
"""Send a deathrattle to the global store."""
236+
self.global_store.set(f"heartbeat_{self.world_info.global_rank}", "-100")
237+
230238
def _check_heartbeats(self) -> List[str]:
231239
"""Check heartbeats and return a list of nodes that have missed their heartbeats."""
232240
dead_nodes = []

0 commit comments

Comments
 (0)