Skip to content

Commit 2effa5b

Browse files
committed
multiple repair processes on the Cassandra DB in Contrail AnalyticsDB nodes
We need to make sure that we don't run repair on all the nodes at the same time, because in that case there is a possibility that all the nodes try to repair the same sstable and the repair may hang. Hence we need to run repair on the nodes one after another, at an interval gap of 4hr. Closes-Jira-Bug: CEM-15348 Change-Id: I81b161c54a8fc57ff6fa1b1eb0f62b16041d5041
1 parent b98a061 commit 2effa5b

File tree

2 files changed

+43
-2
lines changed

2 files changed

+43
-2
lines changed

src/nodemgr/analytics_database_nodemgr/event_manager.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ def __init__(self, config, unit_names):
2828
config.db_port, config.db_jmx_port, config.db_use_ssl,
2929
config.db_user, config.db_password,
3030
self.process_info_manager, hostname=config.hostname)
31+
self.node_idx = None
3132

3233
def get_failbits_nodespecific_desc(self, fail_status_bits):
3334
return self.cassandra_mgr.get_failbits_nodespecific_desc(
@@ -36,6 +37,13 @@ def get_failbits_nodespecific_desc(self, fail_status_bits):
3637
def do_periodic_events(self):
    """Run the periodic database checks and the staggered nodetool repair.

    On every call this delegates to ``cassandra_mgr.database_periodic``,
    triggers ``cassandra_mgr.repair`` when the current tick falls on this
    node's repair slot, and finally runs the base-class periodic events.
    """
    self.cassandra_mgr.database_periodic(self)
    # Resolve this node's index once and cache it; a negative value means
    # the index could not be determined, in which case repair is skipped.
    if self.node_idx is None:
        self.node_idx = self.cassandra_mgr.get_cassandra_node_idx(self)
    if self.node_idx >= 0:
        # Perform nodetool repair every cassandra_repair_interval hours.
        # We must not run repair on all the nodes at the same time, because
        # in that case all the nodes may try to repair the same sstable and
        # the repair may hang. Hence each node starts its repair 4 hours
        # after the previous one.
        # NOTE(review): the factor 60 presumes one tick per minute -- confirm.
        repair_period = 60 * self.cassandra_repair_interval
        # Wrap the offset into the repair period: without the modulo, a node
        # whose offset is >= the period would never match and never repair.
        # Wrapped nodes may share a slot with an earlier node, which is
        # still preferable to not repairing at all.
        repair_offset = (self.node_idx * 60 * 4) % repair_period
        if self.tick_count % repair_period == repair_offset:
            self.cassandra_mgr.repair(self)
    super(AnalyticsDatabaseEventManager, self).do_periodic_events()

src/nodemgr/common/cassandra_manager.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
# Copyright (c) 2016 Juniper Networks, Inc. All rights reserved.#
22
from gevent import monkey
33

4+
import os
45
import socket
56
import yaml
67
from pysandesh.gen_py.sandesh.ttypes import SandeshLevel
@@ -42,7 +43,39 @@ def status(self):
4243
# and this is not allowed in micrioservices setup
4344
pass
4445

46+
def is_nodetool_repair_running(self, event_mgr):
    """Report whether a ``nodetool repair`` for this database is running.

    Runs ``ps auxww`` through ``self.exec_cmd`` and looks for the repair
    command line built from ``self.db_jmx_port`` in its output.

    :param event_mgr: object whose ``msg_log`` is used to report failures.
    :returns: True if the repair command appears in the process list, or
        if the process list could not be inspected; False otherwise.
    """
    repair_cmd = "nodetool -p {} repair".format(self.db_jmx_port)
    cmd = 'ps auxww'
    try:
        output = self.exec_cmd(cmd)
        # exec_cmd may hand back raw bytes on Python 3 (TODO confirm); a
        # ``str in bytes`` test raises TypeError, which would make every
        # call report "running" and block repair forever. On Python 2
        # bytes is str, so this branch is skipped and behaviour is unchanged.
        if isinstance(output, bytes) and not isinstance(output, str):
            output = output.decode('utf-8', 'replace')
        return repair_cmd in output
    except Exception as e:
        err_msg = "Failed to run cmd: {}.\nError: {}".format(cmd, e)
        event_mgr.msg_log(err_msg, level=SandeshLevel.SYS_ERR)
        # let's return True in case we can't detect nodetool status to
        # avoid a second, concurrent run
        return True
56+
57+
def get_cassandra_node_idx(self, event_mgr):
    """Return this node's index in the sorted list of database nodes.

    The node list is read from the ``ANALYTICSDB_NODES`` or
    ``CONFIGDB_NODES`` environment variable, chosen by ``self._db_owner``,
    as a comma-separated string. Entries are stripped of surrounding
    whitespace, empty entries are dropped, and the list is sorted as
    strings before ``self.hostip`` is looked up in it.

    :param event_mgr: object whose ``msg_log`` is used to report failures.
    :returns: the zero-based index of ``self.hostip``, or -1 if the index
        cannot be determined (unknown owner, unset variable, or this
        host missing from the list).
    """
    env_by_owner = {
        'analytics': 'ANALYTICSDB_NODES',
        'config': 'CONFIGDB_NODES',
    }
    try:
        env_name = env_by_owner.get(self._db_owner)
        if env_name is None:
            raise ValueError(
                "unsupported db owner: {}".format(self._db_owner))
        raw_nodes = os.getenv(env_name)
        if not raw_nodes:
            raise ValueError(
                "environment variable {} is not set".format(env_name))
        # Tolerate "a, b" style lists: stray whitespace would otherwise
        # prevent a match or change the sort order.
        db_nodes = sorted(
            node.strip() for node in raw_nodes.split(',') if node.strip())
        idx = db_nodes.index(self.hostip)
    except Exception as e:
        # Deliberately broad: this lookup is best-effort and must never
        # take down the periodic loop; callers treat -1 as "unknown".
        err_msg = "Failed to get node index. Error is {}".format(e)
        event_mgr.msg_log(err_msg, level=SandeshLevel.SYS_ERR)
        return -1
    else:
        return idx
73+
4574
def repair(self, event_mgr):
75+
if self.is_nodetool_repair_running(event_mgr):
76+
msg = "Can't run repair as nodetool is already running"
77+
event_mgr.msg_log(msg, level=SandeshLevel.SYS_ERR)
78+
return
4679
keyspaces = []
4780
if self._db_owner == 'analytics':
4881
keyspaces = AnalyticsRepairNeededKeyspaces

0 commit comments

Comments
 (0)