diff --git a/charms/jupyter-controller/src/prometheus_alert_rules/KubeflowJupyterControllerServices.rules b/charms/jupyter-controller/src/prometheus_alert_rules/KubeflowJupyterControllerServices.rules new file mode 100644 index 00000000..d631cc83 --- /dev/null +++ b/charms/jupyter-controller/src/prometheus_alert_rules/KubeflowJupyterControllerServices.rules @@ -0,0 +1,24 @@ +groups: +- name: KubeflowJupyterControllerServices + rules: + - alert: KubeflowServiceDown + expr: up{} < 1 + for: 5m + labels: + severity: critical + annotations: + summary: "{{ $labels.juju_charm }} service is Down ({{ $labels.juju_model }}/{{ $labels.juju_unit }})" + description: | + One or more targets of {{ $labels.juju_charm }} charm are down on unit {{ $labels.juju_model }}/{{ $labels.juju_unit }}. + LABELS = {{ $labels }} + + - alert: KubeflowServiceIsNotStable + expr: avg_over_time(up{}[10m]) < 0.5 + for: 0m + labels: + severity: warning + annotations: + summary: "{{ $labels.juju_charm }} service is not stable ({{ $labels.juju_model }}/{{ $labels.juju_unit }})" + description: | + {{ $labels.juju_charm }} unit {{ $labels.juju_model }}/{{ $labels.juju_unit }} has been unreachable at least 50% of the time over the last 10 minutes. 
+ LABELS = {{ $labels }} diff --git a/charms/jupyter-controller/src/prometheus_alert_rules/unit_unavailable.rule b/charms/jupyter-controller/src/prometheus_alert_rules/unit_unavailable.rule deleted file mode 100644 index 40513504..00000000 --- a/charms/jupyter-controller/src/prometheus_alert_rules/unit_unavailable.rule +++ /dev/null @@ -1,10 +0,0 @@ -alert: JupyterControllerUnitIsUnavailable -expr: up < 1 -for: 5m -labels: - severity: critical -annotations: - summary: Jupyter controller unit {{ $labels.juju_model }}/{{ $labels.juju_unit }} unavailable - description: > - The Jupyter controller unit {{ $labels.juju_model }} {{ $labels.juju_unit }} is unavailable - LABELS = {{ $labels }} diff --git a/charms/jupyter-controller/tests/unit/test_operator.py b/charms/jupyter-controller/tests/unit/test_operator.py index 8cd98e61..7ebf2770 100644 --- a/charms/jupyter-controller/tests/unit/test_operator.py +++ b/charms/jupyter-controller/tests/unit/test_operator.py @@ -115,9 +115,11 @@ def test_prometheus_data_set(self, harness: Harness, mocker): with open("src/prometheus_alert_rules/model_errors.rule") as f: file_alert = yaml.safe_load(f.read()) test_alerts.append(file_alert["alert"]) - with open("src/prometheus_alert_rules/unit_unavailable.rule") as f: + with open("src/prometheus_alert_rules/KubeflowJupyterControllerServices.rules") as f: file_alert = yaml.safe_load(f.read()) - test_alerts.append(file_alert["alert"]) + # there are 2 alert rules in KubeflowJupyterControllerServices.rules + for rule in file_alert["groups"][0]["rules"]: + test_alerts.append(rule["alert"]) # alert rules alert_rules = json.loads(