From 4b76309174e667e2a3efaf856bce52be44e6dd3f Mon Sep 17 00:00:00 2001 From: Ying Chen Date: Tue, 24 Sep 2024 08:28:13 -0700 Subject: [PATCH] [metrics] Fix lost connection when metrics query database --- apps/useradmin/src/useradmin/metrics.py | 30 ++++++++++++++--- desktop/core/src/desktop/metrics.py | 43 ++++++++++++++++++------- 2 files changed, 58 insertions(+), 15 deletions(-) diff --git a/apps/useradmin/src/useradmin/metrics.py b/apps/useradmin/src/useradmin/metrics.py index 8f08f558dd0..cae6f159864 100644 --- a/apps/useradmin/src/useradmin/metrics.py +++ b/apps/useradmin/src/useradmin/metrics.py @@ -15,8 +15,10 @@ # limitations under the License. import logging - from datetime import datetime, timedelta + +from django.db import connection +from django.db.utils import OperationalError from prometheus_client import Gauge from desktop.lib.metrics import global_registry @@ -24,6 +26,7 @@ LOG = logging.getLogger() + def active_users(): from useradmin.models import UserProfile try: @@ -32,11 +35,21 @@ def active_users(): first_login=False, hostname__isnull=False ).count() - except: + except OperationalError as oe: + LOG.debug('active_users recovering from %s' % str(oe)) + connection.close() + connection.connect() + count = UserProfile.objects.filter( + last_activity__gt=datetime.now() - timedelta(hours=1), + first_login=False, + hostname__isnull=False + ).count() + except Exception as e: LOG.exception('Could not get active_users') count = 0 return count + global_registry().gauge_callback( name='users.active.total', callback=active_users, @@ -48,15 +61,24 @@ def active_users(): prometheus_active_users = Gauge('hue_active_users', 'Hue Active Users in All Instances') prometheus_active_users.set_function(active_users) + def active_users_per_instance(): from useradmin.models import UserProfile try: - count = UserProfile.objects.filter(last_activity__gt=datetime.now() - timedelta(hours=1), hostname=get_localhost_name()).count() - except: + count = UserProfile.objects.filter(last_activity__gt=datetime.now() - timedelta(hours=1), + hostname=get_localhost_name()).count() + except OperationalError as oe: + LOG.debug('active_users_per_instance recovering from %s' % str(oe)) + connection.close() + connection.connect() + count = UserProfile.objects.filter(last_activity__gt=datetime.now() - timedelta(hours=1), + hostname=get_localhost_name()).count() + except Exception as e: LOG.exception('Could not get active_users per instance') count = 0 return count + global_registry().gauge_callback( name='users.active', callback=active_users_per_instance, diff --git a/desktop/core/src/desktop/metrics.py b/desktop/core/src/desktop/metrics.py index d7eb7be5b0e..63fa6555201 100644 --- a/desktop/core/src/desktop/metrics.py +++ b/desktop/core/src/desktop/metrics.py @@ -16,22 +16,23 @@ from __future__ import absolute_import -from future import standard_library -standard_library.install_aliases() -from builtins import range import gc import logging -import multiprocessing import threading - +import multiprocessing +from builtins import range from datetime import datetime, timedelta -from prometheus_client import Gauge, REGISTRY -from useradmin.models import User +from django.db import connection +from django.db.utils import OperationalError +from future import standard_library +from prometheus_client import REGISTRY, Gauge from desktop.conf import ENABLE_PROMETHEUS from desktop.lib.metrics import global_registry +from useradmin.models import User +standard_library.install_aliases() LOG = logging.getLogger() @@ -49,7 +50,9 @@ django_collectors = set() django_metrics_names = [ name - for name in REGISTRY._names_to_collectors.keys() if name.startswith('django_') and not name.startswith(ALLOWED_DJANGO_PROMETHEUS_METRICS) + for name in REGISTRY._names_to_collectors.keys() + if name.startswith('django_') + and not name.startswith(ALLOWED_DJANGO_PROMETHEUS_METRICS) ] for metric_name in django_metrics_names: @@ -141,14 +144,21 @@ # ------------------------------------------------------------------------------ + def user_count(): users = 0 try: users = User.objects.count() - except: + except OperationalError as oe: + LOG.debug('user_count recovering from %s' % str(oe)) + connection.close() + connection.connect() + users = User.objects.count() + except Exception as e: LOG.exception('Metrics: Failed to get number of user accounts') return users + user_count = global_registry().gauge_callback( name='users', callback=user_count, @@ -188,19 +198,30 @@ def user_count(): # ------------------------------------------------------------------------------ + def num_of_queries(): - from desktop.models import Document2 # Avoid circular dependency + from desktop.models import Document2 # Avoid circular dependency try: count = Document2.objects.filter( type__istartswith='query-', is_history=True, last_modified__gt=datetime.now() - timedelta(minutes=10) ).count() - except: + except OperationalError as oe: + LOG.debug('num_of_queries recovering from %s' % str(oe)) + connection.close() + connection.connect() + count = Document2.objects.filter( + type__istartswith='query-', + is_history=True, + last_modified__gt=datetime.now() - timedelta(minutes=10) + ).count() + except Exception as e: LOG.exception('Could not get num_of_queries') count = 0 return count + global_registry().gauge_callback( name='queries.number', callback=num_of_queries,