Skip to content

Commit

Permalink
🎨 Healthcheck diagnostics sensor is now optional (ITISFoundation#6327)
Browse files Browse the repository at this point in the history
  • Loading branch information
pcrespov authored Sep 9, 2024
1 parent 6d6f5c5 commit b74bae4
Show file tree
Hide file tree
Showing 8 changed files with 49 additions and 35 deletions.
1 change: 1 addition & 0 deletions .env-devel
Original file line number Diff line number Diff line change
Expand Up @@ -282,6 +282,7 @@ WB_DB_EL_WALLETS=0

# WEBSERVER ----
AIODEBUG_SLOW_DURATION_SECS=0
DIAGNOSTICS_HEALTHCHECK_ENABLED=False
DIAGNOSTICS_MAX_AVG_LATENCY=10
DIAGNOSTICS_MAX_TASK_DELAY=30
DIAGNOSTICS_SLOW_DURATION_SECS=1
Expand Down
1 change: 1 addition & 0 deletions services/docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -601,6 +601,7 @@ services:

# WEBSERVER_DIAGNOSTICS
WEBSERVER_DIAGNOSTICS: ${WEBSERVER_DIAGNOSTICS}
DIAGNOSTICS_HEALTHCHECK_ENABLED: ${DIAGNOSTICS_HEALTHCHECK_ENABLED}
DIAGNOSTICS_MAX_AVG_LATENCY: ${DIAGNOSTICS_MAX_AVG_LATENCY}
DIAGNOSTICS_MAX_TASK_DELAY: ${DIAGNOSTICS_MAX_TASK_DELAY}
DIAGNOSTICS_SLOW_DURATION_SECS: ${DIAGNOSTICS_SLOW_DURATION_SECS}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -57,25 +57,25 @@ def value(self) -> float:
return delay


logged_once = False
_logged_once = False


def is_sensing_enabled(app: web.Application):
"""Diagnostics will not activate sensing immediately but after some
time since the app started
"""
global logged_once # pylint: disable=global-statement
global _logged_once # pylint: disable=global-statement
settings = get_plugin_settings(app)

time_elapsed_since_setup = time.time() - app[HEALTH_PLUGIN_START_TIME]
enabled = time_elapsed_since_setup > settings.DIAGNOSTICS_START_SENSING_DELAY
if enabled and not logged_once:
if enabled and not _logged_once:
_logger.debug(
"Diagnostics starts sensing after waiting %3.2f secs [> %3.2f secs] since submodule init",
time_elapsed_since_setup,
settings.DIAGNOSTICS_START_SENSING_DELAY,
)
logged_once = True
_logged_once = True
return enabled


Expand Down Expand Up @@ -106,10 +106,7 @@ def assert_healthy_app(app: web.Application) -> None:
)

if max_delay > max_delay_allowed:
msg = "{:3.1f} secs delay [at most {:3.1f} secs allowed]".format(
max_delay,
max_delay_allowed,
)
msg = f"{max_delay:3.1f} secs delay [at most {max_delay_allowed:3.1f} secs allowed]"
raise HealthCheckError(msg)

# CRITERIA 2: Mean latency of the last N request slower than 1 sec
Expand All @@ -125,6 +122,5 @@ def assert_healthy_app(app: web.Application) -> None:
)

if max_latency_allowed < latency:
raise HealthCheckError(
f"Last requests average latency is {latency} secs and surpasses {max_latency_allowed} secs"
)
msg = f"Last requests average latency is {latency} secs and surpasses {max_latency_allowed} secs"
raise HealthCheckError(msg)
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,10 @@
_logger = logging.getLogger(__name__)


async def _on_healthcheck_async_adapter(app: web.Application) -> None:
assert_healthy_app(app)


@app_module_setup(
__name__,
ModuleCategory.ADDON,
Expand All @@ -46,13 +50,9 @@ def setup_diagnostics(
# adds middleware and /metrics
setup_monitoring(app)

# injects healthcheck
healthcheck: HealthCheck = app[HealthCheck.__name__]

async def _on_healthcheck_async_adapter(app: web.Application) -> None:
assert_healthy_app(app)

healthcheck.on_healthcheck.append(_on_healthcheck_async_adapter)
if settings.DIAGNOSTICS_HEALTHCHECK_ENABLED:
healthcheck: HealthCheck = app[HealthCheck.__name__]
healthcheck.on_healthcheck.append(_on_healthcheck_async_adapter)

# adds other diagnostic routes: healthcheck, etc
app.router.add_routes(_handlers.routes)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,28 +6,35 @@

class DiagnosticsSettings(BaseCustomSettings):
DIAGNOSTICS_SLOW_DURATION_SECS: PositiveFloat = Field(
1.0,
default=1.0,
description=(
"Any task blocked more than slow_duration_secs is logged as WARNING"
"Aims to identify possible blocking calls"
),
env=["DIAGNOSTICS_SLOW_DURATION_SECS", "AIODEBUG_SLOW_DURATION_SECS"],
)

DIAGNOSTICS_HEALTHCHECK_ENABLED: bool = Field(
default=False,
description="Enables/disables healthcheck callback hook based on diagnostic sensors",
)

DIAGNOSTICS_MAX_TASK_DELAY: PositiveFloat = Field(
0.0,
description="Sets an upper threshold for blocking functions, i.e. slow_duration_secs < max_task_delay",
default=0.0,
description="Sets an upper threshold for blocking functions, "
"i.e. slow_duration_secs < max_task_delay (healthcheck metric)",
)

DIAGNOSTICS_MAX_AVG_LATENCY: PositiveFloat = Field(
3.0, description="Maximum average response latency in seconds"
default=3.0,
description="Maximum average response latency in seconds (healthcheck metric)",
)

DIAGNOSTICS_START_SENSING_DELAY: NonNegativeFloat = 60.0

@validator("DIAGNOSTICS_MAX_TASK_DELAY", pre=True)
@classmethod
def validate_max_task_delay(cls, v, values):
def _validate_max_task_delay(cls, v, values):
# Sets an upper threshold for blocking functions, i.e.
# settings.DIAGNOSTICS_SLOW_DURATION_SECS < settings.DIAGNOSTICS_MAX_TASK_DELAY
#
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -57,9 +57,9 @@ async def healthcheck_readiness_probe(request: web.Request):
"""

healthcheck: HealthCheck = request.app[HealthCheck.__name__]
health_report = healthcheck.get_app_info(request.app)
app_info = healthcheck.get_app_info(request.app)
# NOTE: do NOT run healthcheck here, just return info fast.
return envelope_json_response(health_report)
return envelope_json_response(app_info)


@routes.get(f"/{API_VTAG}/config", name="get_config")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -81,9 +81,7 @@ def __init__(self, app: web.Application):
self._timeout: int | None = app[APP_SETTINGS_KEY].SC_HEALTHCHECK_TIMEOUT

def __repr__(self):
return "<HealthCheck timeout={}, #on_healthcheck-slots={}>".format(
self._timeout, len(self._on_healthcheck)
)
return f"<HealthCheck timeout={self._timeout}, #on_healthcheck-slots={len(self._on_healthcheck)}>"

@property
def on_healthcheck(self) -> _HealthCheckSignal:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@
from aiohttp import web
from aiohttp.test_utils import TestClient
from pytest_simcore.helpers.assert_checks import assert_status
from pytest_simcore.helpers.monkeypatch_envs import setenvs_from_dict
from pytest_simcore.helpers.typing_env import EnvVarsDict
from servicelib.aiohttp import status
from servicelib.aiohttp.application import create_safe_application
from simcore_service_webserver._constants import APP_SETTINGS_KEY
Expand Down Expand Up @@ -78,12 +80,21 @@ async def _check_entrypoint():


@pytest.fixture
def mock_environment(mock_env_devel_environment: dict[str, str], monkeypatch):
monkeypatch.setenv("AIODEBUG_SLOW_DURATION_SECS", f"{SLOW_HANDLER_DELAY_SECS / 10}")
monkeypatch.setenv("DIAGNOSTICS_MAX_TASK_DELAY", f"{SLOW_HANDLER_DELAY_SECS}")
monkeypatch.setenv("DIAGNOSTICS_MAX_AVG_LATENCY", f"{2.0}")
monkeypatch.setenv("DIAGNOSTICS_START_SENSING_DELAY", f"{0}")
monkeypatch.setenv("SC_HEALTHCHECK_TIMEOUT", "2m")
def mock_environment(
mock_env_devel_environment: EnvVarsDict, monkeypatch: pytest.MonkeyPatch
) -> EnvVarsDict:
return setenvs_from_dict(
monkeypatch,
{
**mock_env_devel_environment,
"AIODEBUG_SLOW_DURATION_SECS": f"{SLOW_HANDLER_DELAY_SECS / 10}",
"DIAGNOSTICS_MAX_TASK_DELAY": f"{SLOW_HANDLER_DELAY_SECS}",
"DIAGNOSTICS_MAX_AVG_LATENCY": f"{2.0}",
"DIAGNOSTICS_START_SENSING_DELAY": f"{0}",
"SC_HEALTHCHECK_TIMEOUT": "2m",
"DIAGNOSTICS_HEALTHCHECK_ENABLED": "1",
},
)


@pytest.fixture
Expand All @@ -92,7 +103,7 @@ def client(
unused_tcp_port_factory: Callable,
aiohttp_client: Callable,
api_version_prefix: str,
mock_environment: None,
mock_environment: EnvVarsDict,
) -> TestClient:

routes = web.RouteTableDef()
Expand Down

0 comments on commit b74bae4

Please sign in to comment.