Skip to content

Commit 719a6ee

Browse files
authored
Merge pull request #989 from leonidc/fix_rebalance_exeption_handling
fix exception handling in rebalane thread
2 parents 2539ac4 + e8d349c commit 719a6ee

File tree

2 files changed

+9
-2
lines changed

2 files changed

+9
-2
lines changed

control/rebalance.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,10 +27,11 @@ def __init__(self, gateway_service):
2727
self.ceph_utils = gateway_service.ceph_utils
2828
self.rebalance_period_sec = gateway_service.config.getint_with_default("gateway", "rebalance_period_sec", 7)
2929
self.rebalance_max_ns_to_change_lb_grp = gateway_service.config.getint_with_default("gateway", "max_ns_to_change_lb_grp", 8)
30-
self.auto_rebalance = threading.Thread(target=self.auto_rebalance_task, daemon=True)
30+
self.rebalance_event = threading.Event()
31+
self.auto_rebalance = threading.Thread(target=self.auto_rebalance_task, daemon=True, args=(self.rebalance_event,))
3132
self.auto_rebalance.start() #start the thread
3233

33-
def auto_rebalance_task(self ):
34+
def auto_rebalance_task(self, death_event):
3435
"""Periodically calls for auto rebalance."""
3536
while (self.rebalance_period_sec > 0):
3637
for i in range(self.rebalance_max_ns_to_change_lb_grp):
@@ -41,6 +42,8 @@ def auto_rebalance_task(self ):
4142
break
4243
except Exception:
4344
self.logger.exception(f"Exception in auto rebalance")
45+
if death_event:
46+
death_event.set()
4447
raise
4548
time.sleep(0.01) #release lock for 10ms after rebalancing each 1 NS
4649
time.sleep(self.rebalance_period_sec)

control/server.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -692,6 +692,10 @@ def keep_alive(self):
692692
spdk_ping_interval_in_seconds = 0.0
693693

694694
while True:
695+
if self.gateway_rpc:
696+
if self.gateway_rpc.rebalance.rebalance_event.is_set():
697+
self.logger.critical(f"Failure in rebalance, aborting")
698+
raise SystemExit(f"Failure in rebalance, quitting gateway")
695699
timedout = self.server.wait_for_termination(timeout=1)
696700
if not timedout:
697701
break

0 commit comments

Comments
 (0)