Skip to content

Commit

Permalink
Merge "multiple repair processes on the Cassandra DB in Contrail Anal…
Browse files Browse the repository at this point in the history
…yticsDB nodes"
  • Loading branch information
Jenkins CI authored and opencontrail-ci-admin committed Sep 10, 2020
2 parents 87f12c7 + 2effa5b commit 76554ed
Show file tree
Hide file tree
Showing 2 changed files with 43 additions and 2 deletions.
12 changes: 10 additions & 2 deletions src/nodemgr/analytics_database_nodemgr/event_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ def __init__(self, config, unit_names):
config.db_port, config.db_jmx_port, config.db_use_ssl,
config.db_user, config.db_password,
self.process_info_manager, hostname=config.hostname)
self.node_idx = None

def get_failbits_nodespecific_desc(self, fail_status_bits):
return self.cassandra_mgr.get_failbits_nodespecific_desc(
Expand All @@ -36,6 +37,13 @@ def get_failbits_nodespecific_desc(self, fail_status_bits):
def do_periodic_events(self):
self.cassandra_mgr.database_periodic(self)
# Perform nodetool repair every cassandra_repair_interval hours
if self.tick_count % (60 * self.cassandra_repair_interval) == 0:
self.cassandra_mgr.repair(self)
if self.node_idx is None:
self.node_idx = self.cassandra_mgr.get_cassandra_node_idx(self)
if self.node_idx >= 0:
# We need to make sure that we don't run repair on all the nodes
# at the same time, because in that case there is a possibility that
# all the nodes try to repair the same sstable and the repair may hang.
# Hence we run repair on each node at a staggered offset of 4 hours.
if self.tick_count % (60 * self.cassandra_repair_interval) == self.node_idx * 60 * 4:
self.cassandra_mgr.repair(self)
super(AnalyticsDatabaseEventManager, self).do_periodic_events()
33 changes: 33 additions & 0 deletions src/nodemgr/common/cassandra_manager.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# Copyright (c) 2016 Juniper Networks, Inc. All rights reserved.#
from gevent import monkey

import os
import socket
import yaml
from pysandesh.gen_py.sandesh.ttypes import SandeshLevel
Expand Down Expand Up @@ -42,7 +43,39 @@ def status(self):
# and this is not allowed in microservices setup
pass

def is_nodetool_repair_running(self, event_mgr):
    """Report whether a ``nodetool repair`` process is already running.

    Runs ``ps auxww`` through ``self.exec_cmd`` and looks for the repair
    command line (built from ``self.db_jmx_port``) in its output.

    :param event_mgr: object whose ``msg_log(msg, level=...)`` is used to
        report a failure to obtain or inspect the process listing.
    :returns: True if the repair command line appears in the listing, or
        if the listing could not be inspected; False otherwise.
    """
    repair_cmd = "nodetool -p {} repair".format(self.db_jmx_port)
    cmd = 'ps auxww'
    try:
        output = self.exec_cmd(cmd)
        # NOTE(review): exec_cmd's return type is not visible from here.
        # Accept bytes as well as text so that a str-in-bytes TypeError is
        # not swallowed below and misreported as "repair already running".
        if isinstance(output, bytes):
            output = output.decode('utf-8', 'replace')
        return repair_cmd in output
    except Exception as e:
        err_msg = "Failed to run cmd: {}.\nError: {}".format(cmd, e)
        event_mgr.msg_log(err_msg, level=SandeshLevel.SYS_ERR)
        # Be conservative: when the nodetool status can't be detected,
        # report True so the caller does not start a second repair.
        return True

def get_cassandra_node_idx(self, event_mgr):
    """Return this node's position in the sorted list of DB node addresses.

    The node list is read from the ``ANALYTICSDB_NODES`` or
    ``CONFIGDB_NODES`` environment variable (selected by
    ``self._db_owner``), split on commas and sorted as strings; the index
    of ``self.hostip`` in that sorted list is returned.

    :param event_mgr: object whose ``msg_log(msg, level=...)`` receives an
        error message when the index cannot be determined.
    :returns: zero-based index of ``self.hostip`` in the sorted node list,
        or -1 on any failure (unsupported owner, variable unset, or
        address not present in the list).
    """
    env_by_owner = {
        'analytics': 'ANALYTICSDB_NODES',
        'config': 'CONFIGDB_NODES',
    }
    try:
        env_name = env_by_owner.get(self._db_owner)
        if env_name is None:
            raise ValueError(
                "unsupported db owner '{}'".format(self._db_owner))
        raw_nodes = os.getenv(env_name)
        if raw_nodes is None:
            raise ValueError(
                "environment variable {} is not set".format(env_name))
        # Tolerate whitespace around separators and empty entries (e.g. a
        # trailing comma) so a cosmetically different list still matches.
        entries = (node.strip() for node in raw_nodes.split(','))
        db_nodes = sorted(node for node in entries if node)
        idx = db_nodes.index(self.hostip)
    except Exception as e:
        err_msg = "Failed to get node index. Error is {}".format(e)
        event_mgr.msg_log(err_msg, level=SandeshLevel.SYS_ERR)
        return -1
    else:
        return idx

def repair(self, event_mgr):
if self.is_nodetool_repair_running(event_mgr):
msg = "Can't run repair as nodetool is already running"
event_mgr.msg_log(msg, level=SandeshLevel.SYS_ERR)
return
keyspaces = []
if self._db_owner == 'analytics':
keyspaces = AnalyticsRepairNeededKeyspaces
Expand Down

0 comments on commit 76554ed

Please sign in to comment.