Add Elasticsearch cluster health monitor DAGs (#3748)

sarayourfriend · web-flow · commit 30abcc42fe4b · 2024-02-13T09:11:46.000+11:00
* Extract shared Elasticsearch cluster connection utilities

* Add Elasticsearch healthcheck dag

* Refactor to make it easier to test message composition

* Add note to remind about --pdb for catalog tests

* Add healthcheck dag tests

* Update dag docs with new DAG names

* Use dynamic dags to generate pre-env dag

* Raise value error if message_type is unknown

* Fix message indentation

* Only alert for non-yellow statuses in prod when data refresh is running

* Fix typo in dag tag

* Fix missing context
diff --git a/catalog/dags/common/constants.py b/catalog/dags/common/constants.py
@@ -15,6 +15,9 @@
 STAGING = "staging"
 PRODUCTION = "production"
 
+# Type alias for the names of the deployment environments.
+# NOTE(review): the literal values repeat STAGING and PRODUCTION above; keep them in sync.
+Environment = Literal["staging", "production"]
+# Every environment, for code that needs to iterate over all of them.
+ENVIRONMENTS = [STAGING, PRODUCTION]
+
 CONTACT_EMAIL = os.getenv("CONTACT_EMAIL")
 
 DAG_DEFAULT_ARGS = {
diff --git a/catalog/dags/common/sensors/utils.py b/catalog/dags/common/sensors/utils.py
@@ -93,6 +93,24 @@ def prevent_concurrency_with_dag(external_dag_id: str, **context):
         raise ValueError(f"Concurrency check with {external_dag_id} failed.")
 
 
+@task(retries=0)
+def is_concurrent_with_any(external_dag_ids: list[str], **context):
+    """
+    Report whether any of the given external DAGs is currently running.
+
+    Each DAG ID is checked in order by calling the undecorated
+    `prevent_concurrency_with_dag` function with the task context. A
+    `ValueError` raised by that call is taken to mean the DAG is running.
+
+    Returns the ID of the first DAG found to be running, or None when
+    none of them are.
+    """
+    running_dag_id = None
+
+    for candidate_id in external_dag_ids:
+        try:
+            prevent_concurrency_with_dag.function(candidate_id, **context)
+        except ValueError:
+            # Stop at the first hit; the remaining DAGs are not checked.
+            running_dag_id = candidate_id
+            break
+
+    return running_dag_id
+
+
 @task_group(group_id="prevent_concurrency")
 def prevent_concurrency_with_dags(external_dag_ids: list[str]):
     """Fail immediately if any of the given external dags are in progress."""
diff --git a/catalog/dags/elasticsearch_cluster/create_new_es_index/create_new_es_index.py b/catalog/dags/elasticsearch_cluster/create_new_es_index/create_new_es_index.py
@@ -2,7 +2,6 @@
 from datetime import timedelta
 
 from airflow.decorators import task, task_group
-from airflow.models.connection import Connection
 from airflow.providers.elasticsearch.hooks.elasticsearch import ElasticsearchPythonHook
 from airflow.sensors.python import PythonSensor
 
@@ -21,12 +20,6 @@
 GET_CURRENT_INDEX_CONFIG_TASK_NAME = "get_current_index_configuration"
 
 
-@task
-def get_es_host(environment: str):
-    conn = Connection.get_connection_from_secrets(f"elasticsearch_http_{environment}")
-    return conn.host
-
-
 @task
 def get_index_name(media_type: str, index_suffix: str):
     return f"{media_type}-{index_suffix}".lower()
diff --git a/catalog/dags/elasticsearch_cluster/create_new_es_index/create_new_es_index_dag.py b/catalog/dags/elasticsearch_cluster/create_new_es_index/create_new_es_index_dag.py
@@ -109,6 +109,7 @@
     CREATE_NEW_INDEX_CONFIGS,
     CreateNewIndex,
 )
+from elasticsearch_cluster.shared import get_es_host
 
 
 logger = logging.getLogger(__name__)
@@ -188,7 +189,7 @@ def create_new_es_index_dag(config: CreateNewIndex):
     with dag:
         prevent_concurrency = prevent_concurrency_with_dags(config.blocking_dags)
 
-        es_host = es.get_es_host(environment=config.environment)
+        es_host = get_es_host(environment=config.environment)
 
         index_name = es.get_index_name(
             media_type="{{ params.media_type }}",
diff --git a/catalog/dags/elasticsearch_cluster/healthcheck_dag.py b/catalog/dags/elasticsearch_cluster/healthcheck_dag.py
@@ -0,0 +1,184 @@
+"""
+Monitor staging and production Elasticsearch cluster health endpoint.
+
+Requests the cluster health and alerts under the following conditions:
+
+- Red cluster health
+- Unexpected number of nodes
+- Unresponsive cluster
+
+Additionally, the DAG will notify (rather than alert) when the cluster health is yellow.
+Yellow cluster health may or may not be an issue, depending on whether it is expected,
+and occurs whenever shards and replicas are being relocated (e.g., during reindexes).
+It is worthwhile to notify in these cases, as an assurance. The one exception is the
+production cluster while a data refresh is running: yellow cluster health is expected
+then, so no notification is sent.
+"""
+
+import json
+import logging
+from datetime import datetime
+from textwrap import dedent, indent
+
+from airflow.decorators import dag, task
+from airflow.exceptions import AirflowSkipException
+from airflow.providers.elasticsearch.hooks.elasticsearch import ElasticsearchPythonHook
+from elasticsearch import Elasticsearch
+
+from common.constants import ENVIRONMENTS, PRODUCTION, Environment
+from common.sensors.utils import is_concurrent_with_any
+from common.slack import send_alert, send_message
+from data_refresh.data_refresh_types import DATA_REFRESH_CONFIGS
+from elasticsearch_cluster.shared import get_es_host
+
+
+logger = logging.getLogger(__name__)
+
+
+# Template for the per-environment DAG ID; `{env}` is replaced with the environment name.
+_DAG_ID = "{env}_elasticsearch_cluster_healthcheck"
+
+# Total node count that the healthcheck response's `number_of_nodes` is compared against.
+EXPECTED_NODE_COUNT = 6
+# Expected data and master node counts, reported in the unexpected node count alert.
+EXPECTED_DATA_NODE_COUNT = 3
+EXPECTED_MASTER_NODE_COUNT = 3
+
+
+def _format_response_body(response_body: dict) -> str:
+    """
+    Render the healthcheck response as an indented, fenced JSON block.
+
+    Every line of the result, including each line of the JSON, carries the
+    same four-space indentation as the message templates that interpolate
+    it. The JSON is indented explicitly with `indent` because interpolating
+    a multi-line string only indents its first line. With a uniform
+    indentation on every line, the whole message can be passed through
+    `dedent` later without issue.
+    """
+    template_indentation = " " * 4
+    pretty_json = json.dumps(response_body, indent=4)
+    indented_json = indent(pretty_json, prefix=template_indentation)
+
+    # The leading empty string and the trailing indentation-only entry
+    # reproduce the newline at the start of the message and the
+    # whitespace-only last line at its end.
+    message_lines = [
+        "",
+        "    Full healthcheck response body:",
+        "    ```",
+        indented_json,
+        "    ```",
+        "    ",
+    ]
+    return "\n".join(message_lines)
+
+
+def _compose_red_status(env: Environment, response_body: dict):
+    """
+    Build the alert text for a cluster whose status is red.
+
+    The text is still indented like this function's body; callers are
+    expected to dedent it before sending.
+    """
+    formatted_body = _format_response_body(response_body)
+    return f"""
+    Elasticsearch {env} cluster status is **red**.
+
+    This is a critical status change, **investigate ASAP**.
+
+    {formatted_body}
+    """
+
+
+def _compose_unexpected_node_count(env: Environment, response_body: dict):
+    """
+    Build the alert text for a cluster with an unexpected total node count.
+
+    The master node count is not read from the response; it is derived as
+    the total node count minus the data node count. The raw response body
+    is also logged at error level.
+    """
+    total_nodes = response_body["number_of_nodes"]
+    data_nodes = response_body["number_of_data_nodes"]
+    master_nodes = total_nodes - data_nodes
+    formatted_body = _format_response_body(response_body)
+
+    alert_text = f"""
+    Elasticsearch {env} cluster node count is **{total_nodes}**.
+    Expected {EXPECTED_NODE_COUNT} total nodes.
+
+    Master nodes: **{master_nodes}** of expected {EXPECTED_MASTER_NODE_COUNT}
+    Data nodes: **{data_nodes}** of expected {EXPECTED_DATA_NODE_COUNT}
+
+    This is a critical status change, **investigate ASAP**.
+    If this is expected (e.g., during controlled node or cluster changes), acknowledge immediately with explanation.
+
+    {formatted_body}
+    """
+
+    logger.error(f"Unexpected node count; {json.dumps(response_body)}")
+    return alert_text
+
+
+def _compose_yellow_cluster_health(env: Environment, response_body: dict):
+    """
+    Build the notification text for a cluster whose health is yellow.
+
+    The raw response body is also logged at info level.
+    """
+    formatted_body = _format_response_body(response_body)
+    notification_text = f"""
+    Elasticsearch {env} cluster health is **yellow**.
+
+    This does not mean something is necessarily wrong, but if this is not expected (e.g., data refresh) then investigate cluster health now.
+
+    {formatted_body}
+    """
+
+    logger.info(f"Cluster health was yellow; {json.dumps(response_body)}")
+    return notification_text
+
+
+@task
+def ping_healthcheck(env: str, es_host: str):
+    """
+    Request the cluster health from the Elasticsearch cluster at `es_host`.
+
+    `env` is accepted but not used in the body. Returns the body of the
+    cluster health response.
+    """
+    hook = ElasticsearchPythonHook(hosts=[es_host])
+    # `get_conn` is read as an attribute rather than called.
+    # NOTE(review): presumably the hook exposes it as a property; confirm.
+    es_client: Elasticsearch = hook.get_conn
+
+    return es_client.cluster.health().body
+
+
+@task
+def compose_notification(
+    env: Environment, response_body: dict, is_data_refresh_running: bool
+):
+    """
+    Decide what, if anything, should be sent for a healthcheck response.
+
+    Returns a `(message_type, message)` pair. Conditions are checked in
+    this order:
+
+    - red status: an "alert"
+    - total node count other than the expected one: an "alert"
+    - yellow status: a "notification", except in production while a data
+      refresh is running, in which case `AirflowSkipException` is raised
+    - anything else is logged as green and yields `(None, None)`
+
+    Only the truthiness of `is_data_refresh_running` is used.
+    """
+    status = response_body["status"]
+
+    # The node count is only read once a red status has been ruled out.
+    if status == "red":
+        outcome = ("alert", _compose_red_status(env, response_body))
+    elif response_body["number_of_nodes"] != EXPECTED_NODE_COUNT:
+        outcome = ("alert", _compose_unexpected_node_count(env, response_body))
+    elif status != "yellow":
+        logger.info(f"Cluster health was green; {json.dumps(response_body)}")
+        outcome = (None, None)
+    elif is_data_refresh_running and env == PRODUCTION:
+        raise AirflowSkipException(
+            "Production cluster health status is yellow during data refresh. "
+            "This is an expected state, so no alert is sent."
+        )
+    else:
+        outcome = (
+            "notification",
+            _compose_yellow_cluster_health(env, response_body),
+        )
+
+    return outcome
+
+
+@task
+def notify(env: str, message_type_and_string: tuple[str, str]):
+    """
+    Send the composed healthcheck message to Slack, if there is one.
+
+    `message_type_and_string` is the `(message_type, message)` pair
+    produced by `compose_notification`:
+
+    - "alert" is sent with `send_alert`
+    - "notification" is sent with `send_message`
+    - None means there is nothing to send, and the task returns without
+      sending anything
+
+    The message is dedented before it is sent. Raises `ValueError` for any
+    other message type.
+    """
+    message_type, message = message_type_and_string
+
+    if message_type is None:
+        # `compose_notification` returns `(None, None)` when the cluster is
+        # healthy. Treating that as an unknown message type would fail the
+        # run every time the cluster is green.
+        logger.info(f"Nothing to send for the {env} cluster healthcheck.")
+        return
+
+    dag_id = _DAG_ID.format(env=env)
+
+    if message_type == "alert":
+        send_alert(dedent(message), dag_id=dag_id)
+    elif message_type == "notification":
+        send_message(dedent(message), dag_id=dag_id)
+    else:
+        raise ValueError(
+            f"Invalid message_type. Expected 'alert' or 'notification', "
+            f"received {message_type}"
+        )
+
+
+# Keyword arguments passed to `@dag` for every generated healthcheck DAG.
+_SHARED_DAG_ARGS = {
+    # Every 15 minutes
+    "schedule": "*/15 * * * *",
+    "start_date": datetime(2024, 2, 4),
+    "catchup": False,
+    "max_active_runs": 1,
+    # The module docstring is reused as the DAG documentation.
+    "doc_md": __doc__,
+    "tags": ["elasticsearch", "monitoring"],
+}
+
+
+# IDs of every data refresh DAG and its filtered index DAG, in the order
+# the data refresh configurations are iterated.
+_DATA_REFRESH_DAG_IDS = []
+for config in DATA_REFRESH_CONFIGS.values():
+    _DATA_REFRESH_DAG_IDS.extend((config.dag_id, config.filtered_index_dag_id))
+
+
+# Generate one healthcheck DAG per environment. The decorated function is
+# called at the end of each iteration, while `env` still holds that
+# iteration's value.
+for env in ENVIRONMENTS:
+
+    @dag(dag_id=_DAG_ID.format(env=env), **_SHARED_DAG_ARGS)
+    def cluster_healthcheck_dag():
+        # Yields the ID of a running data refresh DAG, or None.
+        is_data_refresh_running = is_concurrent_with_any(_DATA_REFRESH_DAG_IDS)
+
+        es_host = get_es_host(env)
+        healthcheck_response = ping_healthcheck(env, es_host)
+        notification = compose_notification(
+            env, healthcheck_response, is_data_refresh_running
+        )
+        # `is_data_refresh_running` is not part of this chain; it reaches
+        # `compose_notification` only as an argument above.
+        es_host >> healthcheck_response >> notification >> notify(env, notification)
+
+    cluster_healthcheck_dag()
diff --git a/catalog/dags/elasticsearch_cluster/shared.py b/catalog/dags/elasticsearch_cluster/shared.py
@@ -0,0 +1,11 @@
+from airflow.decorators import task
+from airflow.models.connection import Connection
+from airflow.models.xcom_arg import XComArg
+
+from common.constants import Environment
+
+
+@task
+def get_es_host(environment: Environment) -> XComArg:
+    """
+    Look up the Elasticsearch host for the given environment.
+
+    Reads the `elasticsearch_http_{environment}` connection from secrets
+    and returns its host.
+    """
+    connection_id = f"elasticsearch_http_{environment}"
+    return Connection.get_connection_from_secrets(connection_id).host
diff --git a/catalog/justfile b/catalog/justfile
@@ -108,6 +108,7 @@ _mount-test command: up-deps
         {{ command }}
 
 # Launch a Bash shell in a test container under `SERVICE`
+# Run pytest with `--pdb` to work around xdist breaking pdb.set_trace()
 test-session:
     just _mount-test bash
 
diff --git a/catalog/tests/dags/elasticsearch_cluster/test_healthcheck_dag.py b/catalog/tests/dags/elasticsearch_cluster/test_healthcheck_dag.py
diff --git a/catalog/tests/dags/test_dag_parsing.py b/catalog/tests/dags/test_dag_parsing.py
diff --git a/documentation/catalog/reference/DAGs.md b/documentation/catalog/reference/DAGs.md