From b74bae42eccc11d012a6101d23d4d657929bcaa3 Mon Sep 17 00:00:00 2001 From: Pedro Crespo-Valero <32402063+pcrespov@users.noreply.github.com> Date: Mon, 9 Sep 2024 14:54:17 +0200 Subject: [PATCH] =?UTF-8?q?=20=F0=9F=8E=A8=20Healtcheck=20diagnostics=20se?= =?UTF-8?q?nsor=20is=20now=20optional=20(#6327)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .env-devel | 1 + services/docker-compose.yml | 1 + .../diagnostics/_healthcheck.py | 18 ++++++------- .../diagnostics/plugin.py | 14 +++++------ .../diagnostics/settings.py | 17 +++++++++---- .../rest/_handlers.py | 4 +-- .../rest/healthcheck.py | 4 +-- .../isolated/test_diagnostics_healthcheck.py | 25 +++++++++++++------ 8 files changed, 49 insertions(+), 35 deletions(-) diff --git a/.env-devel b/.env-devel index 22690e81871..0959e408f87 100644 --- a/.env-devel +++ b/.env-devel @@ -282,6 +282,7 @@ WB_DB_EL_WALLETS=0 # WEBSERVER ---- AIODEBUG_SLOW_DURATION_SECS=0 +DIAGNOSTICS_HEALTHCHECK_ENABLED=False DIAGNOSTICS_MAX_AVG_LATENCY=10 DIAGNOSTICS_MAX_TASK_DELAY=30 DIAGNOSTICS_SLOW_DURATION_SECS=1 diff --git a/services/docker-compose.yml b/services/docker-compose.yml index 994cb71c890..37fe44d03c5 100644 --- a/services/docker-compose.yml +++ b/services/docker-compose.yml @@ -601,6 +601,7 @@ services: # WEBSERVER_DIAGNOSTICS WEBSERVER_DIAGNOSTICS: ${WEBSERVER_DIAGNOSTICS} + DIAGNOSTICS_HEALTHCHECK_ENABLED: ${DIAGNOSTICS_HEALTHCHECK_ENABLED} DIAGNOSTICS_MAX_AVG_LATENCY: ${DIAGNOSTICS_MAX_AVG_LATENCY} DIAGNOSTICS_MAX_TASK_DELAY: ${DIAGNOSTICS_MAX_TASK_DELAY} DIAGNOSTICS_SLOW_DURATION_SECS: ${DIAGNOSTICS_SLOW_DURATION_SECS} diff --git a/services/web/server/src/simcore_service_webserver/diagnostics/_healthcheck.py b/services/web/server/src/simcore_service_webserver/diagnostics/_healthcheck.py index 66a839a98b1..2db7f5251c5 100644 --- a/services/web/server/src/simcore_service_webserver/diagnostics/_healthcheck.py +++ b/services/web/server/src/simcore_service_webserver/diagnostics/_healthcheck.py @@ -57,25 +57,25 @@ def value(self) -> float: return delay -logged_once = False +_logged_once = False def is_sensing_enabled(app: web.Application): """Diagnostics will not activate sensing immediatly but after some time since the app started """ - global logged_once # pylint: disable=global-statement + global _logged_once # pylint: disable=global-statement settings = get_plugin_settings(app) time_elapsed_since_setup = time.time() - app[HEALTH_PLUGIN_START_TIME] enabled = time_elapsed_since_setup > settings.DIAGNOSTICS_START_SENSING_DELAY - if enabled and not logged_once: + if enabled and not _logged_once: _logger.debug( "Diagnostics starts sensing after waiting %3.2f secs [> %3.2f secs] since submodule init", time_elapsed_since_setup, settings.DIAGNOSTICS_START_SENSING_DELAY, ) - logged_once = True + _logged_once = True return enabled @@ -106,10 +106,7 @@ def assert_healthy_app(app: web.Application) -> None: ) if max_delay > max_delay_allowed: - msg = "{:3.1f} secs delay [at most {:3.1f} secs allowed]".format( - max_delay, - max_delay_allowed, - ) + msg = f"{max_delay:3.1f} secs delay [at most {max_delay_allowed:3.1f} secs allowed]" raise HealthCheckError(msg) # CRITERIA 2: Mean latency of the last N request slower than 1 sec @@ -125,6 +122,5 @@ def assert_healthy_app(app: web.Application) -> None: ) if max_latency_allowed < latency: - raise HealthCheckError( - f"Last requests average latency is {latency} secs and surpasses {max_latency_allowed} secs" - ) + msg = f"Last requests average latency is {latency} secs and surpasses {max_latency_allowed} secs" + raise HealthCheckError(msg) diff --git a/services/web/server/src/simcore_service_webserver/diagnostics/plugin.py b/services/web/server/src/simcore_service_webserver/diagnostics/plugin.py index d457c92943e..a9bcb1306b8 100644 --- a/services/web/server/src/simcore_service_webserver/diagnostics/plugin.py +++ b/services/web/server/src/simcore_service_webserver/diagnostics/plugin.py @@ -23,6 +23,10 @@ _logger = logging.getLogger(__name__) +async def _on_healthcheck_async_adapter(app: web.Application) -> None: + assert_healthy_app(app) + + @app_module_setup( __name__, ModuleCategory.ADDON, @@ -46,13 +50,9 @@ def setup_diagnostics( # adds middleware and /metrics setup_monitoring(app) - # injects healthcheck - healthcheck: HealthCheck = app[HealthCheck.__name__] - - async def _on_healthcheck_async_adapter(app: web.Application) -> None: - assert_healthy_app(app) - - healthcheck.on_healthcheck.append(_on_healthcheck_async_adapter) + if settings.DIAGNOSTICS_HEALTHCHECK_ENABLED: + healthcheck: HealthCheck = app[HealthCheck.__name__] + healthcheck.on_healthcheck.append(_on_healthcheck_async_adapter) # adds other diagnostic routes: healthcheck, etc app.router.add_routes(_handlers.routes) diff --git a/services/web/server/src/simcore_service_webserver/diagnostics/settings.py b/services/web/server/src/simcore_service_webserver/diagnostics/settings.py index 777a1ac9457..5c82496ad34 100644 --- a/services/web/server/src/simcore_service_webserver/diagnostics/settings.py +++ b/services/web/server/src/simcore_service_webserver/diagnostics/settings.py @@ -6,7 +6,7 @@ class DiagnosticsSettings(BaseCustomSettings): DIAGNOSTICS_SLOW_DURATION_SECS: PositiveFloat = Field( - 1.0, + default=1.0, description=( "Any task blocked more than slow_duration_secs is logged as WARNING" "Aims to identify possible blocking calls" @@ -14,20 +14,27 @@ class DiagnosticsSettings(BaseCustomSettings): env=["DIAGNOSTICS_SLOW_DURATION_SECS", "AIODEBUG_SLOW_DURATION_SECS"], ) + DIAGNOSTICS_HEALTHCHECK_ENABLED: bool = Field( + default=False, + description="Enables/disables healthcheck callback hook based on diagnostic sensors", + ) + DIAGNOSTICS_MAX_TASK_DELAY: PositiveFloat = Field( - 0.0, - description="Sets an upper threshold for blocking functions, i.e. slow_duration_secs < max_task_delay", + default=0.0, + description="Sets an upper threshold for blocking functions, " + "i.e. slow_duration_secs < max_task_delay (healthcheck metric)", ) DIAGNOSTICS_MAX_AVG_LATENCY: PositiveFloat = Field( - 3.0, description="Maximum average response latency in seconds" + default=3.0, + description="Maximum average response latency in seconds (healthcheck metric)", ) DIAGNOSTICS_START_SENSING_DELAY: NonNegativeFloat = 60.0 @validator("DIAGNOSTICS_MAX_TASK_DELAY", pre=True) @classmethod - def validate_max_task_delay(cls, v, values): + def _validate_max_task_delay(cls, v, values): # Sets an upper threshold for blocking functions, i.e. # settings.DIAGNOSTICS_SLOW_DURATION_SECS < settings.DIAGNOSTICS_MAX_TASK_DELAY # diff --git a/services/web/server/src/simcore_service_webserver/rest/_handlers.py b/services/web/server/src/simcore_service_webserver/rest/_handlers.py index 1cc4a52b7d1..0cf11ad4810 100644 --- a/services/web/server/src/simcore_service_webserver/rest/_handlers.py +++ b/services/web/server/src/simcore_service_webserver/rest/_handlers.py @@ -57,9 +57,9 @@ async def healthcheck_readiness_probe(request: web.Request): """ healthcheck: HealthCheck = request.app[HealthCheck.__name__] - health_report = healthcheck.get_app_info(request.app) + app_info = healthcheck.get_app_info(request.app) # NOTE: do NOT run healthcheck here, just return info fast. - return envelope_json_response(health_report) + return envelope_json_response(app_info) @routes.get(f"/{API_VTAG}/config", name="get_config") diff --git a/services/web/server/src/simcore_service_webserver/rest/healthcheck.py b/services/web/server/src/simcore_service_webserver/rest/healthcheck.py index d0f50f5fb16..638a99cac96 100644 --- a/services/web/server/src/simcore_service_webserver/rest/healthcheck.py +++ b/services/web/server/src/simcore_service_webserver/rest/healthcheck.py @@ -81,9 +81,7 @@ def __init__(self, app: web.Application): self._timeout: int | None = app[APP_SETTINGS_KEY].SC_HEALTHCHECK_TIMEOUT def __repr__(self): - return "".format( - self._timeout, len(self._on_healthcheck) - ) + return f"" @property def on_healthcheck(self) -> _HealthCheckSignal: diff --git a/services/web/server/tests/unit/isolated/test_diagnostics_healthcheck.py b/services/web/server/tests/unit/isolated/test_diagnostics_healthcheck.py index e70f153e369..f35d7991539 100644 --- a/services/web/server/tests/unit/isolated/test_diagnostics_healthcheck.py +++ b/services/web/server/tests/unit/isolated/test_diagnostics_healthcheck.py @@ -14,6 +14,8 @@ from aiohttp import web from aiohttp.test_utils import TestClient from pytest_simcore.helpers.assert_checks import assert_status +from pytest_simcore.helpers.monkeypatch_envs import setenvs_from_dict +from pytest_simcore.helpers.typing_env import EnvVarsDict from servicelib.aiohttp import status from servicelib.aiohttp.application import create_safe_application from simcore_service_webserver._constants import APP_SETTINGS_KEY @@ -78,12 +80,21 @@ async def _check_entrypoint(): @pytest.fixture -def mock_environment(mock_env_devel_environment: dict[str, str], monkeypatch): - monkeypatch.setenv("AIODEBUG_SLOW_DURATION_SECS", f"{SLOW_HANDLER_DELAY_SECS / 10}") - monkeypatch.setenv("DIAGNOSTICS_MAX_TASK_DELAY", f"{SLOW_HANDLER_DELAY_SECS}") - monkeypatch.setenv("DIAGNOSTICS_MAX_AVG_LATENCY", f"{2.0}") - monkeypatch.setenv("DIAGNOSTICS_START_SENSING_DELAY", f"{0}") - monkeypatch.setenv("SC_HEALTHCHECK_TIMEOUT", "2m") +def mock_environment( + mock_env_devel_environment: EnvVarsDict, monkeypatch: pytest.MonkeyPatch +) -> EnvVarsDict: + return setenvs_from_dict( + monkeypatch, + { + **mock_env_devel_environment, + "AIODEBUG_SLOW_DURATION_SECS": f"{SLOW_HANDLER_DELAY_SECS / 10}", + "DIAGNOSTICS_MAX_TASK_DELAY": f"{SLOW_HANDLER_DELAY_SECS}", + "DIAGNOSTICS_MAX_AVG_LATENCY": f"{2.0}", + "DIAGNOSTICS_START_SENSING_DELAY": f"{0}", + "SC_HEALTHCHECK_TIMEOUT": "2m", + "DIAGNOSTICS_HEALTHCHECK_ENABLED": "1", + }, + ) @pytest.fixture @@ -92,7 +103,7 @@ def client( unused_tcp_port_factory: Callable, aiohttp_client: Callable, api_version_prefix: str, - mock_environment: None, + mock_environment: EnvVarsDict, ) -> TestClient: routes = web.RouteTableDef()