multiple repair processes on the Cassandra DB in Contrail AnalyticsDB nodes

msahu-jnpr · msahu-jnpr · commit 2effa5b9585d · 2020-09-10T12:52:11.000+05:30
We need to make sure that we don't run repair on all the nodes
at the same time, because in that case there is a possibility that
all nodes try to repair the same sstable and the repair may hang.
Hence we need to stagger the repair across the nodes, starting each node 4 hours after the previous one.

Closes-Jira-Bug: CEM-15348
Change-Id: I81b161c54a8fc57ff6fa1b1eb0f62b16041d5041
diff --git a/src/nodemgr/analytics_database_nodemgr/event_manager.py b/src/nodemgr/analytics_database_nodemgr/event_manager.py
@@ -28,6 +28,7 @@ def __init__(self, config, unit_names):
             config.db_port, config.db_jmx_port, config.db_use_ssl,
             config.db_user, config.db_password,
             self.process_info_manager, hostname=config.hostname)
+        self.node_idx = None
 
     def get_failbits_nodespecific_desc(self, fail_status_bits):
         return self.cassandra_mgr.get_failbits_nodespecific_desc(
@@ -36,6 +37,13 @@ def get_failbits_nodespecific_desc(self, fail_status_bits):
     def do_periodic_events(self):
         self.cassandra_mgr.database_periodic(self)
         # Perform nodetool repair every cassandra_repair_interval hours
-        if self.tick_count % (60 * self.cassandra_repair_interval) == 0:
-            self.cassandra_mgr.repair(self)
+        if self.node_idx is None:
+            self.node_idx = self.cassandra_mgr.get_cassandra_node_idx(self)
+        if self.node_idx >= 0:
+            # We need to make sure that we don't run repair on all the nodes
+            # at the same time, because in that case there is a possibility that
+            # all the nodes try to repair the same sstable and the repair may hang.
+            # Hence each node starts its repair 4 hours after the previous node (by index).
+            if self.tick_count % (60 * self.cassandra_repair_interval) == self.node_idx * 60 * 4:
+                self.cassandra_mgr.repair(self)
         super(AnalyticsDatabaseEventManager, self).do_periodic_events()
diff --git a/src/nodemgr/common/cassandra_manager.py b/src/nodemgr/common/cassandra_manager.py
@@ -1,6 +1,7 @@
 # Copyright (c) 2016 Juniper Networks, Inc. All rights reserved.#
 from gevent import monkey
 
+import os
 import socket
 import yaml
 from pysandesh.gen_py.sandesh.ttypes import SandeshLevel
@@ -42,7 +43,39 @@ def status(self):
         # and this is not allowed in micrioservices setup
         pass
 
+    def is_nodetool_repair_running(self, event_mgr):
+        repair_cmd = "nodetool -p {} repair".format(self.db_jmx_port)
+        cmd = 'ps auxww'
+        try:
+            return repair_cmd in self.exec_cmd(cmd)
+        except Exception as e:
+            err_msg = "Failed to run cmd: {}.\nError: {}".format(cmd, e)
+            event_mgr.msg_log(err_msg, level=SandeshLevel.SYS_ERR)
+        # let's return True in case we can't detect nodetool status to avoid second run
+        return True
+
+    def get_cassandra_node_idx(self, event_mgr):
+        db_nodes = None
+        try:
+            if self._db_owner == 'analytics':
+                db_nodes = os.getenv('ANALYTICSDB_NODES')
+            elif self._db_owner == 'config':
+                db_nodes = os.getenv('CONFIGDB_NODES')
+            db_nodes = db_nodes.split(',')
+            db_nodes.sort()
+            idx = db_nodes.index(self.hostip)
+        except Exception as e:
+            err_msg = "Failed to get node index. Error is {}".format(e)
+            event_mgr.msg_log(err_msg, level=SandeshLevel.SYS_ERR)
+            return -1
+        else:
+            return idx
+
     def repair(self, event_mgr):
+        if self.is_nodetool_repair_running(event_mgr):
+            msg = "Can't run repair as nodetool is already running"
+            event_mgr.msg_log(msg, level=SandeshLevel.SYS_ERR)
+            return
         keyspaces = []
         if self._db_owner == 'analytics':
             keyspaces = AnalyticsRepairNeededKeyspaces