Merge "multiple repair processes on the Cassandra DB in Contrail AnalyticsDB nodes"
Juniper · Sep 10, 2020 · 76554ed · 76554ed
2 parents 87f12c7 + 2effa5b
commit 76554ed
Show file tree

Hide file tree

Showing 2 changed files with 43 additions and 2 deletions.
diff --git a/src/nodemgr/analytics_database_nodemgr/event_manager.py b/src/nodemgr/analytics_database_nodemgr/event_manager.py
@@ -28,6 +28,7 @@ def __init__(self, config, unit_names):
             config.db_port, config.db_jmx_port, config.db_use_ssl,
             config.db_user, config.db_password,
             self.process_info_manager, hostname=config.hostname)
+        self.node_idx = None
 
     def get_failbits_nodespecific_desc(self, fail_status_bits):
         return self.cassandra_mgr.get_failbits_nodespecific_desc(
@@ -36,6 +37,13 @@ def get_failbits_nodespecific_desc(self, fail_status_bits):
     def do_periodic_events(self):
         self.cassandra_mgr.database_periodic(self)
         # Perform nodetool repair every cassandra_repair_interval hours
-        if self.tick_count % (60 * self.cassandra_repair_interval) == 0:
-            self.cassandra_mgr.repair(self)
+        if self.node_idx is None:
+            self.node_idx = self.cassandra_mgr.get_cassandra_node_idx(self)
+        if self.node_idx >= 0:
+            # We need to make sure that we don't run repair on all the nodes
+            # at the same time, because in that case there is a possibility that
+            # all the nodes try to repair the same sstable and the repair may hang.
+            # Hence we run repair on the nodes at staggered intervals of 4 hours.
+            if self.tick_count % (60 * self.cassandra_repair_interval) == self.node_idx * 60 * 4:
+                self.cassandra_mgr.repair(self)
         super(AnalyticsDatabaseEventManager, self).do_periodic_events()
diff --git a/src/nodemgr/common/cassandra_manager.py b/src/nodemgr/common/cassandra_manager.py
@@ -1,6 +1,7 @@
 # Copyright (c) 2016 Juniper Networks, Inc. All rights reserved.#
 from gevent import monkey
 
+import os
 import socket
 import yaml
 from pysandesh.gen_py.sandesh.ttypes import SandeshLevel
@@ -42,7 +43,39 @@ def status(self):
         # and this is not allowed in micrioservices setup
         pass
 
+    def is_nodetool_repair_running(self, event_mgr):
+        repair_cmd = "nodetool -p {} repair".format(self.db_jmx_port)
+        cmd = 'ps auxww'
+        try:
+            return repair_cmd in self.exec_cmd(cmd)
+        except Exception as e:
+            err_msg = "Failed to run cmd: {}.\nError: {}".format(cmd, e)
+            event_mgr.msg_log(err_msg, level=SandeshLevel.SYS_ERR)
+        # let's return True in case we can't detect nodetool status to avoid second run
+        return True
+
+    def get_cassandra_node_idx(self, event_mgr):
+        db_nodes = None
+        try:
+            if self._db_owner == 'analytics':
+                db_nodes = os.getenv('ANALYTICSDB_NODES')
+            elif self._db_owner == 'config':
+                db_nodes = os.getenv('CONFIGDB_NODES')
+            db_nodes = db_nodes.split(',')
+            db_nodes.sort()
+            idx = db_nodes.index(self.hostip)
+        except Exception as e:
+            err_msg = "Failed to get node index. Error is {}".format(e)
+            event_mgr.msg_log(err_msg, level=SandeshLevel.SYS_ERR)
+            return -1
+        else:
+            return idx
+
     def repair(self, event_mgr):
+        if self.is_nodetool_repair_running(event_mgr):
+            msg = "Can't run repair as nodetool is already running"
+            event_mgr.msg_log(msg, level=SandeshLevel.SYS_ERR)
+            return
         keyspaces = []
         if self._db_owner == 'analytics':
             keyspaces = AnalyticsRepairNeededKeyspaces