From 5d85da51ab60271791c33469a014ba9ef3e627ce Mon Sep 17 00:00:00 2001 From: Michal Hucko Date: Fri, 11 Oct 2024 12:53:35 +0200 Subject: [PATCH] Add alert rules to mlflow-server based on the KF093 spec (#262) (#284) * Add alert rules to mlflow-server based on the KF093 spec * Delete src/prometheus_alert_rules/unit_unavailable.rule * Use chisme to test cos integration Co-authored-by: Robert Gildein --- requirements-integration.in | 5 +- requirements-integration.txt | 106 ++++------------- .../KubeflowMlflowServerServices.rules | 24 ++++ .../unit_unavailable.rule | 10 -- tests/integration/test_charm.py | 107 ++++++------------ tests/integration/test_deploy_runners.py | 6 + 6 files changed, 92 insertions(+), 166 deletions(-) create mode 100644 src/prometheus_alert_rules/KubeflowMlflowServerServices.rules delete mode 100644 src/prometheus_alert_rules/unit_unavailable.rule diff --git a/requirements-integration.in b/requirements-integration.in index aaf40450..96e8afc4 100644 --- a/requirements-integration.in +++ b/requirements-integration.in @@ -6,4 +6,7 @@ minio mlflow pytest-operator requests --r requirements.txt +lightkube +lightkube-models>=1.25.4.4 +# This is required due to the abstraction of cos integration +charmed-kubeflow-chisme>=0.4.0 diff --git a/requirements-integration.txt b/requirements-integration.txt index 638874ed..17331090 100644 --- a/requirements-integration.txt +++ b/requirements-integration.txt @@ -11,18 +11,13 @@ aiosignal==1.3.1 alembic==1.12.0 # via mlflow anyio==3.7.1 - # via - # -r requirements.txt - # httpcore -appnope==0.1.4 - # via ipython + # via httpcore asttokens==2.4.0 # via stack-data async-timeout==4.0.3 # via aiohttp attrs==23.1.0 # via - # -r requirements.txt # aiohttp # jsonschema backcall==0.2.0 @@ -31,18 +26,10 @@ bcrypt==4.0.1 # via paramiko blinker==1.6.2 # via flask -boto3==1.28.25 - # via -r requirements.txt -botocore==1.31.25 - # via - # -r requirements.txt - # boto3 - # s3transfer cachetools==5.3.1 # via google-auth certifi==2023.7.22 # via - # -r requirements.txt # httpcore # httpx # kubernetes @@ -52,11 +39,10 @@ cffi==1.15.1 # via # cryptography # pynacl -charmed-kubeflow-chisme==0.2.0 - # via -r requirements.txt +charmed-kubeflow-chisme==0.4.3 + # via -r requirements-integration.in charset-normalizer==3.2.0 # via - # -r requirements.txt # aiohttp # requests click==8.1.7 @@ -79,16 +65,13 @@ decorator==5.1.1 # ipdb # ipython deepdiff==6.2.1 - # via - # -r requirements.txt - # charmed-kubeflow-chisme + # via charmed-kubeflow-chisme docker==6.1.3 # via mlflow entrypoints==0.4 # via mlflow exceptiongroup==1.1.2 # via - # -r requirements.txt # anyio # pytest executing==1.2.0 @@ -107,25 +90,20 @@ gitpython==3.1.35 # via mlflow google-auth==2.22.0 # via kubernetes +greenlet==3.0.3 + # via sqlalchemy gunicorn==21.2.0 # via mlflow h11==0.14.0 - # via - # -r requirements.txt - # httpcore + # via httpcore httpcore==0.17.3 - # via - # -r requirements.txt - # httpx + # via httpx httpx==0.24.1 - # via - # -r requirements.txt - # lightkube + # via lightkube hvac==1.2.0 # via juju idna==3.4 # via - # -r requirements.txt # anyio # httpx # requests @@ -138,7 +116,6 @@ importlib-metadata==6.8.0 # mlflow importlib-resources==6.0.1 # via - # -r requirements.txt # alembic # jsonschema # matplotlib @@ -155,37 +132,30 @@ jedi==0.19.0 jinja2==3.1.2 # via # -r requirements-integration.in - # -r requirements.txt # charmed-kubeflow-chisme # flask # mlflow # pytest-operator -jmespath==1.0.1 - # via - # -r requirements.txt - # boto3 - # botocore joblib==1.3.2 # via scikit-learn jsonschema==4.17.3 - # via - # -r requirements.txt - # serialized-data-interface + # via serialized-data-interface juju==3.2.2 # via # -r requirements-integration.in + # charmed-kubeflow-chisme # pytest-operator kiwisolver==1.4.5 # via matplotlib kubernetes==27.2.0 # via juju -lightkube==0.14.0 +lightkube==0.15.3 # via - # -r requirements.txt + # -r requirements-integration.in # charmed-kubeflow-chisme -lightkube-models==1.27.1.4 +lightkube-models==1.30.0.8 # via - # -r requirements.txt + # -r requirements-integration.in # lightkube macaroonbakery==1.3.1 # via juju @@ -195,7 +165,6 @@ markdown==3.4.4 # via mlflow markupsafe==2.1.3 # via - # -r requirements.txt # jinja2 # mako # werkzeug @@ -227,17 +196,12 @@ oauthlib==3.2.2 # databricks-cli # kubernetes # requests-oauthlib -oci-image==1.0.0 - # via -r requirements.txt ops==2.14.0 # via - # -r requirements.txt # charmed-kubeflow-chisme # serialized-data-interface ordered-set==4.1.0 - # via - # -r requirements.txt - # deepdiff + # via deepdiff packaging==23.1 # via # docker @@ -258,9 +222,7 @@ pickleshare==0.7.5 pillow==10.0.0 # via matplotlib pkgutil-resolve-name==1.3.10 - # via - # -r requirements.txt - # jsonschema + # via jsonschema pluggy==1.3.0 # via pytest prompt-toolkit==3.0.39 @@ -304,9 +266,7 @@ pyrfc3339==1.1 # juju # macaroonbakery pyrsistent==0.19.3 - # via - # -r requirements.txt - # jsonschema + # via jsonschema pytest==7.4.2 # via # pytest-asyncio @@ -317,8 +277,6 @@ pytest-operator==0.29.0 # via -r requirements-integration.in python-dateutil==2.8.2 # via - # -r requirements.txt - # botocore # kubernetes # matplotlib # pandas @@ -329,7 +287,6 @@ pytz==2023.3.post1 # pyrfc3339 pyyaml==6.0.1 # via - # -r requirements.txt # juju # kubernetes # lightkube @@ -342,7 +299,6 @@ querystring-parser==1.2.4 requests==2.31.0 # via # -r requirements-integration.in - # -r requirements.txt # databricks-cli # docker # hvac @@ -356,17 +312,9 @@ requests-oauthlib==1.3.1 rsa==4.9 # via google-auth ruamel-yaml==0.17.32 - # via - # -r requirements.txt - # charmed-kubeflow-chisme + # via charmed-kubeflow-chisme ruamel-yaml-clib==0.2.7 - # via - # -r requirements.txt - # ruamel-yaml -s3transfer==0.6.1 - # via - # -r requirements.txt - # boto3 + # via ruamel-yaml scikit-learn==1.3.0 # via mlflow scipy==1.10.1 @@ -374,12 +322,9 @@ scipy==1.10.1 # mlflow # scikit-learn serialized-data-interface==0.7.0 - # via - # -r requirements.txt - # charmed-kubeflow-chisme + # via charmed-kubeflow-chisme six==1.16.0 # via - # -r requirements.txt # asttokens # databricks-cli # google-auth @@ -393,7 +338,6 @@ smmap==5.0.0 # via gitdb sniffio==1.3.0 # via - # -r requirements.txt # anyio # httpcore # httpx @@ -408,9 +352,7 @@ stack-data==0.6.2 tabulate==0.9.0 # via databricks-cli tenacity==8.2.2 - # via - # -r requirements.txt - # charmed-kubeflow-chisme + # via charmed-kubeflow-chisme threadpoolctl==3.2.0 # via scikit-learn tomli==2.0.1 @@ -435,8 +377,6 @@ tzdata==2023.3 # via pandas urllib3==1.26.16 # via - # -r requirements.txt - # botocore # databricks-cli # docker # google-auth @@ -447,7 +387,6 @@ wcwidth==0.2.6 # via prompt-toolkit websocket-client==1.6.1 # via - # -r requirements.txt # docker # kubernetes # ops @@ -459,6 +398,5 @@ yarl==1.9.2 # via aiohttp zipp==3.16.2 # via - # -r requirements.txt # importlib-metadata # importlib-resources diff --git a/src/prometheus_alert_rules/KubeflowMlflowServerServices.rules b/src/prometheus_alert_rules/KubeflowMlflowServerServices.rules new file mode 100644 index 00000000..5641a3ca --- /dev/null +++ b/src/prometheus_alert_rules/KubeflowMlflowServerServices.rules @@ -0,0 +1,24 @@ +groups: +- name: KubeflowMlflowServerServices + rules: + - alert: KubeflowServiceDown + expr: up{} < 1 + for: 5m + labels: + severity: critical + annotations: + summary: "{{ $labels.juju_charm }} service is Down ({{ $labels.juju_model }}/{{ $labels.juju_unit }})" + description: | + One or more targets of {{ $labels.juju_charm }} charm are down on unit {{ $labels.juju_model }}/{{ $labels.juju_unit }}. + LABELS = {{ $labels }} + + - alert: KubeflowServiceIsNotStable + expr: avg_over_time(up{}[10m]) < 0.5 + for: 0m + labels: + severity: warning + annotations: + summary: "{{ $labels.juju_charm }} service is not stable ({{ $labels.juju_model }}/{{ $labels.juju_unit }})" + description: | + {{ $labels.juju_charm }} unit {{ $labels.juju_model }}/{{ $labels.juju_unit }} has been unreachable at least 50% of the time over the last 10 minutes. + LABELS = {{ $labels }} diff --git a/src/prometheus_alert_rules/unit_unavailable.rule b/src/prometheus_alert_rules/unit_unavailable.rule deleted file mode 100644 index 14bfe450..00000000 --- a/src/prometheus_alert_rules/unit_unavailable.rule +++ /dev/null @@ -1,10 +0,0 @@ -alert: MLFlowServerUnitIsUnavailable -expr: up < 1 -for: 5m -labels: - severity: critical -annotations: - summary: MLFlowServer unit {{ $labels.juju_model }}/{{ $labels.juju_unit }} unavailable - description: > - The MLFlowServer unit {{ $labels.juju_model }} {{ $labels.juju_unit }} is unavailable - LABELS = {{ $labels }} diff --git a/tests/integration/test_charm.py b/tests/integration/test_charm.py index 4be6a90e..6b0bf7a9 100644 --- a/tests/integration/test_charm.py +++ b/tests/integration/test_charm.py @@ -18,6 +18,13 @@ import requests import yaml from charmed_kubeflow_chisme.kubernetes import KubernetesResourceHandler +from charmed_kubeflow_chisme.testing import ( + assert_alert_rules, + assert_grafana_dashboards, + assert_metrics_endpoint, + get_alert_rules, + get_grafana_dashboards, +) from lightkube import codecs from lightkube.generic_resource import ( create_namespaced_resource, @@ -36,8 +43,6 @@ RELATIONAL_DB_CHARM_NAME = "mysql-k8s" RELATIONAL_DB_CHANNEL = "8.0/stable" OBJECT_STORAGE_CHARM_NAME = "minio" -PROMETHEUS_CHARM_NAME = "prometheus-k8s" -GRAFANA_CHARM_NAME = "grafana-k8s" RESOURCE_DISPATCHER_CHARM_NAME = "resource-dispatcher" ISTIO_GATEWAY_CHARM_NAME = "istio-ingressgateway" ISTIO_PILOT_CHARM_NAME = "istio-pilot" @@ -136,7 +141,7 @@ async def setup_istio(ops_test: OpsTest, istio_gateway: str, istio_pilot: str): config={"default-gateway": "test-gateway"}, trust=True, ) - await ops_test.model.add_relation(istio_pilot, istio_gateway) + await ops_test.model.integrate(istio_pilot, istio_gateway) await ops_test.model.wait_for_idle( apps=[istio_pilot, istio_gateway], @@ -197,8 +202,8 @@ async def test_add_relational_db_with_relation_expect_active(self, ops_test: Ops raise_on_error=False, timeout=600, ) - await ops_test.model.relate(OBJECT_STORAGE_CHARM_NAME, CHARM_NAME) - await ops_test.model.relate(RELATIONAL_DB_CHARM_NAME, CHARM_NAME) + await ops_test.model.integrate(OBJECT_STORAGE_CHARM_NAME, CHARM_NAME) + await ops_test.model.integrate(RELATIONAL_DB_CHARM_NAME, CHARM_NAME) await ops_test.model.wait_for_idle( apps=[CHARM_NAME], @@ -206,9 +211,35 @@ async def test_add_relational_db_with_relation_expect_active(self, ops_test: Ops raise_on_blocked=False, raise_on_error=False, timeout=600, + idle_period=60, ) assert ops_test.model.applications[CHARM_NAME].units[0].workload_status == "active" + async def test_alert_rules(self, ops_test: OpsTest): + """Test check charm alert rules and rules defined in relation data bag.""" + app = ops_test.model.applications[CHARM_NAME] + alert_rules = get_alert_rules() + logger.info("found alert_rules: %s", alert_rules) + await assert_alert_rules(app, alert_rules) + + async def test_grafana_dashboards(self, ops_test: OpsTest): + """Test Grafana dashboards are defined in relation data bag.""" + app = ops_test.model.applications[CHARM_NAME] + dashboards = get_grafana_dashboards() + logger.info("found dashboards: %s", dashboards) + await assert_grafana_dashboards(app, dashboards) + + async def test_metrics_enpoint(self, ops_test: OpsTest): + """Test metrics_endpoints are defined in relation data bag and their accessibility. + + This function gets all the metrics_endpoints from the relation data bag, checks if + they are available from the grafana-agent-k8s charm and finally compares them with the + ones provided to the function. + """ + app = ops_test.model.applications[CHARM_NAME] + await assert_metrics_endpoint(app, metrics_port=5000, metrics_path="/metrics") + await assert_metrics_endpoint(app, metrics_port=8000, metrics_path="/metrics") + @retry(stop=stop_after_delay(300), wait=wait_fixed(10)) @pytest.mark.abort_on_fail async def test_can_connect_exporter_and_get_metrics(self, ops_test: OpsTest): @@ -376,69 +407,3 @@ async def test_new_user_namespace_has_manifests( for name in poddefaults_names: pod_default = lightkube_client.get(PodDefault, name, namespace=namespace) assert pod_default is not None - - @pytest.mark.abort_on_fail - async def test_mlflow_alert_rules(self, ops_test: OpsTest): - await ops_test.model.deploy(PROMETHEUS_CHARM_NAME, channel="latest/stable", trust=True) - await ops_test.model.relate(PROMETHEUS_CHARM_NAME, CHARM_NAME) - await ops_test.model.wait_for_idle( - apps=[PROMETHEUS_CHARM_NAME], status="active", raise_on_blocked=True, timeout=60 * 10 - ) - - prometheus_subprocess = subprocess.Popen( - [ - "kubectl", - "-n", - f"{ops_test.model_name}", - "port-forward", - f"svc/{PROMETHEUS_CHARM_NAME}", - "9090:9090", - ] - ) - time.sleep(10) # Must wait for port-forward - - prometheus_url = "localhost" - - # obtain scrape targets from Prometheus - targets_result = await fetch_url(f"http://{prometheus_url}:9090/api/v1/targets") - - # verify that mlflow-server is in the target list - assert targets_result is not None - assert targets_result["status"] == "success" - discovered_labels = targets_result["data"]["activeTargets"][0]["discoveredLabels"] - assert discovered_labels["juju_application"] == CHARM_NAME - - # obtain alert rules from Prometheus - rules_url = f"http://{prometheus_url}:9090/api/v1/rules" - alert_rules_result = await fetch_url(rules_url) - - # verify alerts are available in Prometheus - assert alert_rules_result is not None - assert alert_rules_result["status"] == "success" - rules = alert_rules_result["data"]["groups"][0]["rules"] - - # load alert rules from the rules file - rules_file_alert_names = [] - with open("src/prometheus_alert_rules/mlflow-server.rule") as f: - mlflow_server = yaml.safe_load(f.read()) - alerts_list = mlflow_server["groups"][0]["rules"] - for alert in alerts_list: - rules_file_alert_names.append(alert["alert"]) - - # verify number of alerts is the same in Prometheus and in the rules file - assert len(rules) == len(rules_file_alert_names) - - # verify that all Mlflow alert rules are in the list and that alerts obtained - # from Prometheus match alerts in the rules file - for rule in rules: - assert rule["name"] in rules_file_alert_names - - prometheus_subprocess.terminate() - - @pytest.mark.abort_on_fail - async def test_grafana_integration(self, ops_test: OpsTest): - await ops_test.model.deploy(GRAFANA_CHARM_NAME, channel="latest/stable", trust=True) - await ops_test.model.relate(GRAFANA_CHARM_NAME, CHARM_NAME) - await ops_test.model.wait_for_idle( - apps=[GRAFANA_CHARM_NAME], status="active", raise_on_blocked=True, timeout=60 * 20 - ) diff --git a/tests/integration/test_deploy_runners.py b/tests/integration/test_deploy_runners.py index 15962928..5dfdfadc 100644 --- a/tests/integration/test_deploy_runners.py +++ b/tests/integration/test_deploy_runners.py @@ -2,6 +2,7 @@ import pytest import yaml +from charmed_kubeflow_chisme.testing import deploy_and_assert_grafana_agent from pytest_operator.plugin import OpsTest METADATA = yaml.safe_load(Path("./metadata.yaml").read_text()) @@ -32,3 +33,8 @@ async def test_build_and_deploy(self, ops_test: OpsTest): ops_test.model.applications[CHARM_NAME].units[0].workload_status_message == "Waiting for object-storage relation data" ) + + # Deploying grafana-agent-k8s and add all relations + await deploy_and_assert_grafana_agent( + ops_test.model, CHARM_NAME, metrics=True, dashboard=True, logging=False + )