diff --git a/.github/workflows/integrate.yaml b/.github/workflows/integrate.yaml
index f111e7c..7e20cd2 100644
--- a/.github/workflows/integrate.yaml
+++ b/.github/workflows/integrate.yaml
@@ -47,9 +47,9 @@ jobs:
         with:
           provider: microk8s
           channel: 1.24/stable
-          # Pinned until this bug is resolved: https://bugs.launchpad.net/juju/+bug/1992833
-          bootstrap-options: "--agent-version=2.9.34"
           microk8s-addons: "dns storage rbac metallb:10.64.140.43-10.64.140.49"
+          charmcraft-channel: latest/candidate
+          juju-channel: 2.9/stable
       - name: Test
         run: sg microk8s -c "tox -vve integration -- --model testing"
diff --git a/lib/charms/grafana_k8s/v0/grafana_dashboard.py b/lib/charms/grafana_k8s/v0/grafana_dashboard.py
index 1623517..c20ab2b 100644
--- a/lib/charms/grafana_k8s/v0/grafana_dashboard.py
+++ b/lib/charms/grafana_k8s/v0/grafana_dashboard.py
@@ -218,7 +218,8 @@ def __init__(self, *args):
 
 # Increment this PATCH version before using `charmcraft publish-lib` or reset
 # to 0 if you are raising the major API version
-LIBPATCH = 24
+
+LIBPATCH = 32
 
 logger = logging.getLogger(__name__)
@@ -229,7 +230,7 @@ def __init__(self, *args):
 
 TOPOLOGY_TEMPLATE_DROPDOWNS = [  # type: ignore
     {
-        "allValue": None,
+        "allValue": ".*",
         "datasource": "${prometheusds}",
         "definition": "label_values(up,juju_model)",
         "description": None,
@@ -254,9 +255,9 @@ def __init__(self, *args):
         "useTags": False,
     },
     {
-        "allValue": None,
+        "allValue": ".*",
         "datasource": "${prometheusds}",
-        "definition": 'label_values(up{juju_model="$juju_model"},juju_model_uuid)',
+        "definition": 'label_values(up{juju_model=~"$juju_model"},juju_model_uuid)',
         "description": None,
         "error": None,
         "hide": 0,
@@ -265,7 +266,7 @@ def __init__(self, *args):
         "multi": True,
         "name": "juju_model_uuid",
         "query": {
-            "query": 'label_values(up{juju_model="$juju_model"},juju_model_uuid)',
+            "query": 'label_values(up{juju_model=~"$juju_model"},juju_model_uuid)',
             "refId": "StandardVariableQuery",
         },
         "refresh": 1,
@@ -279,9 +280,9 @@ def __init__(self, *args):
         "useTags": False,
     },
     {
-        "allValue": None,
+        "allValue": ".*",
         "datasource": "${prometheusds}",
-        "definition": 'label_values(up{juju_model="$juju_model",juju_model_uuid="$juju_model_uuid"},juju_application)',
+        "definition": 'label_values(up{juju_model=~"$juju_model",juju_model_uuid=~"$juju_model_uuid"},juju_application)',
         "description": None,
         "error": None,
         "hide": 0,
@@ -290,7 +291,7 @@ def __init__(self, *args):
         "multi": True,
         "name": "juju_application",
         "query": {
-            "query": 'label_values(up{juju_model="$juju_model",juju_model_uuid="$juju_model_uuid"},juju_application)',
+            "query": 'label_values(up{juju_model=~"$juju_model",juju_model_uuid=~"$juju_model_uuid"},juju_application)',
             "refId": "StandardVariableQuery",
         },
         "refresh": 1,
@@ -304,9 +305,9 @@ def __init__(self, *args):
         "useTags": False,
     },
     {
-        "allValue": None,
+        "allValue": ".*",
         "datasource": "${prometheusds}",
-        "definition": 'label_values(up{juju_model="$juju_model",juju_model_uuid="$juju_model_uuid",juju_application="$juju_application"},juju_unit)',
+        "definition": 'label_values(up{juju_model=~"$juju_model",juju_model_uuid=~"$juju_model_uuid",juju_application=~"$juju_application"},juju_unit)',
         "description": None,
         "error": None,
         "hide": 0,
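
Why the dropdowns above move from `=` to `=~` (a hedged aside, not part of the patch): a multi-select Grafana variable expands to a pipe-separated value such as `model-a|model-b`, which only a regex matcher accepts, and `allValue: ".*"` makes the "All" option expand to a match-everything pattern instead of a literal. A minimal sketch of how these chained `label_values` queries are shaped; `chained_label_values` is an illustrative helper, not a function from the library:

```python
def chained_label_values(parent_filters: dict, label: str) -> str:
    """Build a label_values() query whose filters use regex (=~) matchers."""
    selector = ",".join('{}=~"{}"'.format(k, v) for k, v in parent_filters.items())
    return "label_values(up{{{}}},{})".format(selector, label)

print(chained_label_values({"juju_model": "$juju_model"}, "juju_model_uuid"))
# label_values(up{juju_model=~"$juju_model"},juju_model_uuid)
```
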
@@ -315,7 +316,7 @@ def __init__(self, *args):
         "multi": True,
         "name": "juju_unit",
         "query": {
-            "query": 'label_values(up{juju_model="$juju_model",juju_model_uuid="$juju_model_uuid",juju_application="$juju_application"},juju_unit)',
+            "query": 'label_values(up{juju_model=~"$juju_model",juju_model_uuid=~"$juju_model_uuid",juju_application=~"$juju_application"},juju_unit)',
             "refId": "StandardVariableQuery",
         },
         "refresh": 1,
@@ -336,7 +337,7 @@ def __init__(self, *args):
         "error": None,
         "hide": 0,
         "includeAll": True,
-        "label": None,
+        "label": "Prometheus datasource",
         "multi": True,
         "name": "prometheusds",
         "options": [],
@@ -351,7 +352,7 @@ def __init__(self, *args):
         "error": None,
         "hide": 0,
         "includeAll": True,
-        "label": None,
+        "label": "Loki datasource",
         "multi": True,
         "name": "lokids",
         "options": [],
@@ -366,7 +367,7 @@ def __init__(self, *args):
 REACTIVE_CONVERTER = {  # type: ignore
     "allValue": None,
     "datasource": "${prometheusds}",
-    "definition": 'label_values(up{juju_model="$juju_model",juju_model_uuid="$juju_model_uuid",juju_application="$juju_application"},host)',
+    "definition": 'label_values(up{juju_model=~"$juju_model",juju_model_uuid=~"$juju_model_uuid",juju_application=~"$juju_application"},host)',
     "description": None,
     "error": None,
     "hide": 0,
@@ -376,7 +377,7 @@ def __init__(self, *args):
     "name": "host",
     "options": [],
     "query": {
-        "query": 'label_values(up{juju_model="$juju_model",juju_model_uuid="$juju_model_uuid",juju_application="$juju_application"},host)',
+        "query": 'label_values(up{juju_model=~"$juju_model",juju_model_uuid=~"$juju_model_uuid",juju_application=~"$juju_application"},host)',
         "refId": "StandardVariableQuery",
     },
     "refresh": 1,
@@ -524,7 +525,7 @@ def _validate_relation_by_interface_and_direction(
     relation = charm.meta.relations[relation_name]
 
     actual_relation_interface = relation.interface_name
-    if actual_relation_interface != expected_relation_interface:
+    if actual_relation_interface and actual_relation_interface != expected_relation_interface:
         raise RelationInterfaceMismatchError(
             relation_name, expected_relation_interface, actual_relation_interface
         )
@@ -581,7 +582,7 @@ def _convert_dashboard_fields(content: str, inject_dropdowns: bool = True) -> str:
 
     # If no existing template variables exist, just insert our own
     if "templating" not in dict_content:
-        dict_content["templating"] = {"list": [d for d in template_dropdowns]}  # type: ignore
+        dict_content["templating"] = {"list": list(template_dropdowns)}  # type: ignore
     else:
         # Otherwise, set a flag so we can go back later
         existing_templates = True
@@ -829,18 +830,18 @@ def _modify_panel(panel: dict, topology: dict, transformer: "CosTool") -> dict:
         if "datasource" not in panel.keys():
             continue
-        else:
-            if type(panel["datasource"]) == str:
-                if panel["datasource"] not in known_datasources:
-                    continue
-                querytype = known_datasources[panel["datasource"]]
-            elif type(panel["datasource"]) == dict:
-                if panel["datasource"]["uid"] not in known_datasources:
-                    continue
-                querytype = known_datasources[panel["datasource"]["uid"]]
-            else:
-                logger.error("Unknown datasource format: skipping")
+
+        if type(panel["datasource"]) == str:
+            if panel["datasource"] not in known_datasources:
                 continue
+            querytype = known_datasources[panel["datasource"]]
+        elif type(panel["datasource"]) == dict:
+            if panel["datasource"]["uid"] not in known_datasources:
+                continue
+            querytype = known_datasources[panel["datasource"]["uid"]]
+        else:
+            logger.error("Unknown datasource format: skipping")
+            continue
 
         # Capture all values inside `[]` into a list which we'll iterate over later to
         # put them back in-order. Then apply the regex again and replace everything with
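
The restructured `_modify_panel` branch above dispatches on the shape of a panel's `datasource` field, which Grafana stores either as a plain string or as a `{"uid": ...}` mapping. A self-contained sketch of that dispatch, with `known_datasources` as a stand-in for the charm's datasource map:

```python
known_datasources = {"${prometheusds}": "promql", "${lokids}": "logql"}

def query_type(panel: dict):
    """Return the query language for a panel, or None to skip it."""
    ds = panel.get("datasource")
    if isinstance(ds, str):
        return known_datasources.get(ds)
    if isinstance(ds, dict):
        return known_datasources.get(ds.get("uid"))
    return None  # unknown format: the library logs an error and skips the panel

print(query_type({"datasource": "${prometheusds}"}))     # promql
print(query_type({"datasource": {"uid": "${lokids}"}}))  # logql
print(query_type({"datasource": 42}))                    # None
```
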
@@ -900,13 +901,12 @@ def _type_convert_stored(obj):
     """Convert Stored* to their appropriate types, recursively."""
     if isinstance(obj, StoredList):
         return list(map(_type_convert_stored, obj))
-    elif isinstance(obj, StoredDict):
+    if isinstance(obj, StoredDict):
         rdict = {}  # type: Dict[Any, Any]
         for k in obj.keys():
             rdict[k] = _type_convert_stored(obj[k])
         return rdict
-    else:
-        return obj
+    return obj
 
 
 class GrafanaDashboardsChanged(EventBase):
@@ -955,7 +955,7 @@ def restore(self, snapshot):
         """Restore grafana source information."""
         self.error_message = snapshot["error_message"]
         self.valid = snapshot["valid"]
-        self.errors = json.loads(snapshot["errors"])
+        self.errors = json.loads(str(snapshot["errors"]))
 
 
 class GrafanaProviderEvents(ObjectEvents):
@@ -968,7 +968,7 @@ class GrafanaDashboardProvider(Object):
     """An API to provide Grafana dashboards to a Grafana charm."""
 
     _stored = StoredState()
-    on = GrafanaProviderEvents()
+    on = GrafanaProviderEvents()  # pyright: ignore
 
     def __init__(
         self,
@@ -1072,7 +1072,7 @@ def add_dashboard(self, content: str, inject_dropdowns: bool = True) -> None:
         """
         # Update of storage must be done irrespective of leadership, so
         # that the stored state is there when this unit becomes leader.
-        stored_dashboard_templates = self._stored.dashboard_templates  # type: Any
+        stored_dashboard_templates: Any = self._stored.dashboard_templates  # pyright: ignore
 
         encoded_dashboard = _encode_dashboard_content(content)
@@ -1093,7 +1093,7 @@ def remove_non_builtin_dashboards(self) -> None:
         """Remove all dashboards to the relation added via :method:`add_dashboard`."""
         # Update of storage must be done irrespective of leadership, so
         # that the stored state is there when this unit becomes leader.
-        stored_dashboard_templates = self._stored.dashboard_templates  # type: Any
+        stored_dashboard_templates: Any = self._stored.dashboard_templates  # pyright: ignore
 
         for dashboard_id in list(stored_dashboard_templates.keys()):
             if dashboard_id.startswith("prog:"):
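
`_type_convert_stored`, simplified above, exists because ops wraps stored state in `StoredDict`/`StoredList` objects that `json.dumps` cannot serialize. A framework-free model of the recursion; the two subclasses below are plain stand-ins for the real ops types:

```python
class StoredDict(dict):  # stand-in for ops.framework.StoredDict
    pass

class StoredList(list):  # stand-in for ops.framework.StoredList
    pass

def type_convert_stored(obj):
    """Recursively convert Stored* wrappers into plain dicts and lists."""
    if isinstance(obj, StoredList):
        return list(map(type_convert_stored, obj))
    if isinstance(obj, StoredDict):
        return {k: type_convert_stored(obj[k]) for k in obj.keys()}
    return obj

nested = StoredDict(a=StoredList([StoredDict(b=1)]))
assert type_convert_stored(nested) == {"a": [{"b": 1}]}
```
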
@@ -1120,7 +1120,7 @@ def _update_all_dashboards_from_dir(
         # Ensure we do not leave outdated dashboards by removing from stored all
         # the encoded dashboards that start with "file/".
         if self._dashboards_path:
-            stored_dashboard_templates = self._stored.dashboard_templates  # type: Any
+            stored_dashboard_templates: Any = self._stored.dashboard_templates  # pyright: ignore
 
             for dashboard_id in list(stored_dashboard_templates.keys()):
                 if dashboard_id.startswith("file:"):
@@ -1174,7 +1174,7 @@ def _reinitialize_dashboard_data(self, inject_dropdowns: bool = True) -> None:
                     e.grafana_dashboards_absolute_path,
                     e.message,
                 )
-                stored_dashboard_templates = self._stored.dashboard_templates  # type: Any
+                stored_dashboard_templates: Any = self._stored.dashboard_templates  # pyright: ignore
 
                 for dashboard_id in list(stored_dashboard_templates.keys()):
                     if dashboard_id.startswith("file:"):
@@ -1212,16 +1212,18 @@ def _on_grafana_dashboard_relation_changed(self, event: RelationChangedEvent) -> None:
         valid = bool(data.get("valid", True))
         errors = data.get("errors", [])
         if valid and not errors:
-            self.on.dashboard_status_changed.emit(valid=valid)
+            self.on.dashboard_status_changed.emit(valid=valid)  # pyright: ignore
         else:
-            self.on.dashboard_status_changed.emit(valid=valid, errors=errors)
+            self.on.dashboard_status_changed.emit(  # pyright: ignore
+                valid=valid, errors=errors
+            )
 
     def _upset_dashboards_on_relation(self, relation: Relation) -> None:
         """Update the dashboards in the relation data bucket."""
         # It's completely ridiculous to add a UUID, but if we don't have some
         # pseudo-random value, this never makes it across 'juju set-state'
         stored_data = {
-            "templates": _type_convert_stored(self._stored.dashboard_templates),
+            "templates": _type_convert_stored(self._stored.dashboard_templates),  # pyright: ignore
             "uuid": str(uuid.uuid4()),
         }
@@ -1250,13 +1252,13 @@ def _juju_topology(self) -> Dict:
     @property
     def dashboard_templates(self) -> List:
         """Return a list of the known dashboard templates."""
-        return [v for v in self._stored.dashboard_templates.values()]  # type: ignore
+        return list(self._stored.dashboard_templates.values())  # type: ignore
 
 
 class GrafanaDashboardConsumer(Object):
     """A consumer object for working with Grafana Dashboards."""
 
-    on = GrafanaDashboardEvents()
+    on = GrafanaDashboardEvents()  # pyright: ignore
     _stored = StoredState()
 
     def __init__(
@@ -1304,7 +1306,7 @@ def __init__(
         self._relation_name = relation_name
         self._tranformer = CosTool(self._charm)
 
-        self._stored.set_default(dashboards=dict())  # type: ignore
+        self._stored.set_default(dashboards={})  # type: ignore
 
         self.framework.observe(
             self._charm.on[self._relation_name].relation_changed,
@@ -1348,13 +1350,13 @@ def _on_grafana_dashboard_relation_changed(self, event: RelationChangedEvent) -> None:
         changes = self._render_dashboards_and_signal_changed(event.relation)
 
         if changes:
-            self.on.dashboards_changed.emit()
+            self.on.dashboards_changed.emit()  # pyright: ignore
 
     def _on_grafana_peer_changed(self, _: RelationChangedEvent) -> None:
         """Emit dashboard events on peer events so secondary charm data updates."""
         if self._charm.unit.is_leader():
             return
-        self.on.dashboards_changed.emit()
+        self.on.dashboards_changed.emit()  # pyright: ignore
 
     def update_dashboards(self, relation: Optional[Relation] = None) -> None:
         """Re-establish dashboards on one or more relations.
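
The UUID trick in `_upset_dashboards_on_relation` above deserves a note: Juju only fires `relation-changed` when the databag content actually differs, so a fresh `uuid4` guarantees consumers are woken even when the templates themselves are unchanged. A runnable reduction of the payload construction:

```python
import json
import uuid

def build_dashboards_payload(templates: dict) -> str:
    """Serialize templates plus a nonce so the databag always changes."""
    return json.dumps({"templates": templates, "uuid": str(uuid.uuid4())})

# Identical templates still produce distinct payloads, forcing relation-changed.
assert build_dashboards_payload({}) != build_dashboards_payload({})
```
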
@@ -1401,7 +1403,7 @@ def _render_dashboards_and_signal_changed(self, relation: Relation) -> bool:  # noqa: C901
         """
         other_app = relation.app
 
-        raw_data = relation.data[other_app].get("dashboards", {})  # type: ignore
+        raw_data = relation.data[other_app].get("dashboards", "")  # pyright: ignore
 
         if not raw_data:
             logger.warning(
@@ -1416,11 +1418,6 @@ def _render_dashboards_and_signal_changed(self, relation: Relation) -> bool:  # noqa: C901
 
         # The only piece of data needed on this side of the relations is "templates"
         templates = data.pop("templates")
 
-        # Import only if a charmed operator uses the consumer, we don't impose these
-        # dependencies on the client
-        from jinja2 import Template
-        from jinja2.exceptions import TemplateSyntaxError
-
         # The dashboards are WAY too big since this ultimately calls out to Juju to
         # set the relation data, and it overflows the maximum argument length for
         # subprocess, so we have to use b64, annoyingly.
@@ -1433,14 +1430,12 @@ def _render_dashboards_and_signal_changed(self, relation: Relation) -> bool:  # noqa: C901
         relation_has_invalid_dashboards = False
 
         for _, (fname, template) in enumerate(templates.items()):
-            decoded_content = None
             content = None
             error = None
             topology = template.get("juju_topology", {})
 
             try:
-                decoded_content = _decode_dashboard_content(template["content"])
+                content = _decode_dashboard_content(template["content"])
                 inject_dropdowns = template.get("inject_dropdowns", True)
-                content = Template(decoded_content).render()
                 content = self._manage_dashboard_uid(content, template)
                 content = _convert_dashboard_fields(content, inject_dropdowns)
@@ -1455,9 +1450,6 @@ def _render_dashboards_and_signal_changed(self, relation: Relation) -> bool:  # noqa: C901
                 error = str(e.msg)
                 logger.warning("Invalid JSON in Grafana dashboard: {}".format(fname))
                 continue
-            except TemplateSyntaxError as e:
-                error = str(e)
-                relation_has_invalid_dashboards = True
 
             # Prepend the relation name and ID to the dashboard ID to avoid clashes with
             # multiple relations with apps from the same charm, or having dashboards with
@@ -1504,28 +1496,27 @@ def _render_dashboards_and_signal_changed(self, relation: Relation) -> bool:  # noqa: C901
 
             # Dropping dashboards for a relation needs to be signalled
             return True
-        else:
-            stored_data = rendered_dashboards
-            currently_stored_data = self._get_stored_dashboards(relation.id)
 
-            coerced_data = (
-                _type_convert_stored(currently_stored_data) if currently_stored_data else {}
-            )
+        stored_data = rendered_dashboards
+        currently_stored_data = self._get_stored_dashboards(relation.id)
 
-            if not coerced_data == stored_data:
-                stored_dashboards = self.get_peer_data("dashboards")
-                stored_dashboards[relation.id] = stored_data
-                self.set_peer_data("dashboards", stored_dashboards)
-                return True
+        coerced_data = _type_convert_stored(currently_stored_data) if currently_stored_data else {}
+
+        if not coerced_data == stored_data:
+            stored_dashboards = self.get_peer_data("dashboards")
+            stored_dashboards[relation.id] = stored_data
+            self.set_peer_data("dashboards", stored_dashboards)
+            return True
+
+        return None  # type: ignore
 
     def _manage_dashboard_uid(self, dashboard: str, template: dict) -> str:
         """Add an uid to the dashboard if it is not present."""
-        dashboard = json.loads(dashboard)
+        dashboard_dict = json.loads(dashboard)
 
-        if not dashboard.get("uid", None) and "dashboard_alt_uid" in template:
-            dashboard["uid"] = template["dashboard_alt_uid"]
+        if not dashboard_dict.get("uid", None) and "dashboard_alt_uid" in template:
+            dashboard_dict["uid"] = template["dashboard_alt_uid"]
 
-        return json.dumps(dashboard)
+        return json.dumps(dashboard_dict)
 
     def _remove_all_dashboards_for_relation(self, relation: Relation) -> None:
         """If an errored dashboard is in stored data, remove it and trigger a deletion."""
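
The comment retained above explains the base64 requirement: relation data is ultimately set via a subprocess call to Juju, and raw dashboard JSON overflows the argument-length limit. A sketch of the round-trip; pairing base64 with lzma matches my reading of the library's `_encode_dashboard_content`, but treat the exact scheme as an assumption:

```python
import base64
import lzma

def encode_dashboard_content(content: str) -> str:
    return base64.b64encode(lzma.compress(content.encode("utf-8"))).decode("utf-8")

def decode_dashboard_content(encoded: str) -> str:
    return lzma.decompress(base64.b64decode(encoded.encode("utf-8"))).decode("utf-8")

dash = '{"title": "Example dashboard"}'
assert decode_dashboard_content(encode_dashboard_content(dash)) == dash
```
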
@@ -1533,7 +1524,7 @@ def _remove_all_dashboards_for_relation(self, relation: Relation) -> None:
             stored_dashboards = self.get_peer_data("dashboards")
             stored_dashboards.pop(str(relation.id))
             self.set_peer_data("dashboards", stored_dashboards)
-            self.on.dashboards_changed.emit()
+            self.on.dashboards_changed.emit()  # pyright: ignore
 
     def _to_external_object(self, relation_id, dashboard):
         return {
@@ -1615,7 +1606,7 @@ class GrafanaDashboardAggregator(Object):
     """
 
     _stored = StoredState()
-    on = GrafanaProviderEvents()
+    on = GrafanaProviderEvents()  # pyright: ignore
 
     def __init__(
         self,
@@ -1680,31 +1671,37 @@ def _update_remote_grafana(self, _: Optional[RelationEvent] = None) -> None:
         """Push dashboards to the downstream Grafana relation."""
         # It's still ridiculous to add a UUID here, but needed
         stored_data = {
-            "templates": _type_convert_stored(self._stored.dashboard_templates),
+            "templates": _type_convert_stored(self._stored.dashboard_templates),  # pyright: ignore
             "uuid": str(uuid.uuid4()),
         }
 
-        for grafana_relation in self.model.relations[self._grafana_relation]:
-            grafana_relation.data[self._charm.app]["dashboards"] = json.dumps(stored_data)
+        if self._charm.unit.is_leader():
+            for grafana_relation in self.model.relations[self._grafana_relation]:
+                grafana_relation.data[self._charm.app]["dashboards"] = json.dumps(stored_data)
 
     def remove_dashboards(self, event: RelationBrokenEvent) -> None:
         """Remove a dashboard if the relation is broken."""
-        app_ids = _type_convert_stored(self._stored.id_mappings[event.app.name])  # type: ignore
+        app_ids = _type_convert_stored(self._stored.id_mappings.get(event.app.name, ""))  # type: ignore
+
+        if not app_ids:
+            logger.info("Could not look up stored dashboards for %s", event.app.name)  # type: ignore
+            return
 
         del self._stored.id_mappings[event.app.name]  # type: ignore
         for id in app_ids:
             del self._stored.dashboard_templates[id]  # type: ignore
 
         stored_data = {
-            "templates": _type_convert_stored(self._stored.dashboard_templates),
+            "templates": _type_convert_stored(self._stored.dashboard_templates),  # pyright: ignore
             "uuid": str(uuid.uuid4()),
         }
 
-        for grafana_relation in self.model.relations[self._grafana_relation]:
-            grafana_relation.data[self._charm.app]["dashboards"] = json.dumps(stored_data)
+        if self._charm.unit.is_leader():
+            for grafana_relation in self.model.relations[self._grafana_relation]:
+                grafana_relation.data[self._charm.app]["dashboards"] = json.dumps(stored_data)
 
     # Yes, this has a fair amount of branching. It's not that complex, though
-    def _strip_existing_datasources(self, template: dict) -> dict:  # noqa: C901
+    def _strip_existing_datasources(self, dash: dict) -> dict:  # noqa: C901
         """Remove existing reactive charm datasource templating out.
 
         This method iterates through *known* places where reactive charms may set
@@ -1713,7 +1710,7 @@ def _strip_existing_datasources(self, template: dict) -> dict:  # noqa: C901
         `dashboard["__inputs"]` is a property sometimes set when exporting dashboards from
         the Grafana UI. It is not present in earlier Grafana versions, and can be disabled
         in 5.3.4 and above (optionally). If set, any values present will be substituted on
-        import. Some reactive charms use this for Prometheus. LMA2 uses dropdown selectors
+        import. Some reactive charms use this for Prometheus. COS uses dropdown selectors
         for datasources, and leaving this present results in "default" datasource values
         which are broken.
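
The new `is_leader()` guards above matter because only the leader unit may write to an application databag; ops rejects the write for followers. A framework-free sketch of the pattern, with trivial stand-ins for ops' `Unit` and `Relation`:

```python
import json
from dataclasses import dataclass, field

@dataclass
class Unit:  # stand-in for ops.model.Unit
    leader: bool
    def is_leader(self) -> bool:
        return self.leader

@dataclass
class Relation:  # stand-in for ops.model.Relation
    data: dict = field(default_factory=dict)

def push_dashboards(unit: Unit, relations: list, payload: dict) -> int:
    """Write the payload to every relation, but only on the leader."""
    if not unit.is_leader():
        return 0  # followers skip; Juju would reject the app-databag write
    for relation in relations:
        relation.data["dashboards"] = json.dumps(payload)
    return len(relations)

assert push_dashboards(Unit(leader=False), [Relation()], {}) == 0
assert push_dashboards(Unit(leader=True), [Relation()], {}) == 1
```
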
@@ -1723,20 +1720,15 @@ def _strip_existing_datasources(self, template: dict) -> dict:  # noqa: C901
 
         Further properties may be discovered.
         """
-        dash = template["dashboard"]
         try:
             if "list" in dash["templating"]:
                 for i in range(len(dash["templating"]["list"])):
                     if (
                         "datasource" in dash["templating"]["list"][i]
-                        and "Juju" in dash["templating"]["list"][i]["datasource"]
+                        and dash["templating"]["list"][i]["datasource"] is not None
                     ):
-                        dash["templating"]["list"][i]["datasource"] = r"${prometheusds}"
-                    if (
-                        "name" in dash["templating"]["list"][i]
-                        and dash["templating"]["list"][i]["name"] == "host"
-                    ):
-                        dash["templating"]["list"][i] = REACTIVE_CONVERTER
+                        if "Juju" in dash["templating"]["list"][i].get("datasource", ""):
+                            dash["templating"]["list"][i]["datasource"] = r"${prometheusds}"
 
         # Strip out newly-added 'juju_application' template variables which
         # don't line up with our drop-downs
@@ -1744,7 +1736,7 @@ def _strip_existing_datasources(self, template: dict) -> dict:  # noqa: C901
             for i in range(len(dash["templating"]["list"])):
                 if (
                     "name" in dash["templating"]["list"][i]
-                    and dash["templating"]["list"][i]["name"] == "app"
+                    and dash["templating"]["list"][i].get("name", "") == "app"
                 ):
                     del dash_mutable["templating"]["list"][i]
@@ -1756,18 +1748,20 @@ def _strip_existing_datasources(self, template: dict) -> dict:  # noqa: C901
         if "__inputs" in dash:
             inputs = dash
             for i in range(len(dash["__inputs"])):
-                if dash["__inputs"][i]["pluginName"] == "Prometheus":
+                if dash["__inputs"][i].get("pluginName", "") == "Prometheus":
                     del inputs["__inputs"][i]
             if inputs:
                 dash["__inputs"] = inputs["__inputs"]
             else:
                 del dash["__inputs"]
 
-        template["dashboard"] = dash
-        return template
+        return dash
 
     def _handle_reactive_dashboards(self, event: RelationEvent) -> Optional[Dict]:
         """Look for a dashboard in relation data (during a reactive hook) or builtin by name."""
+        if not self._charm.unit.is_leader():
+            return {}
+
         templates = []
         id = ""
@@ -1790,20 +1784,28 @@ def _handle_reactive_dashboards(self, event: RelationEvent) -> Optional[Dict]:
         dashboards = {}
         for t in templates:
-            # Replace values with LMA-style templating
-            t = self._strip_existing_datasources(t)
-
             # This seems ridiculous, too, but to get it from a "dashboards" key in serialized JSON
             # in the bucket back out to the actual "dashboard" we _need_, this is the way
             # This is not a mistake -- there's a double nesting in reactive charms, and
             # Grafana won't load it. We have to unbox:
             # event.relation.data[event.<type>]["request_*"]["dashboard"]["dashboard"],
             # and the final unboxing is below.
-            dash = json.dumps(t["dashboard"])
+            #
+            # Apparently SOME newer dashboards (such as Ceph) do not have this double nesting, so
+            # now we get to account for both :toot:
+            dash = t.get("dashboard", {}) or t
+
+            # Replace values with LMA-style templating
+            dash = self._strip_existing_datasources(dash)
+            dash = json.dumps(dash)
 
             # Replace the old-style datasource templates
             dash = re.sub(r"<< datasource >>", r"${prometheusds}", dash)
             dash = re.sub(r'"datasource": "prom.*?"', r'"datasource": "${prometheusds}"', dash)
+            dash = re.sub(
+                r'"datasource": "\$datasource"', r'"datasource": "${prometheusds}"', dash
+            )
+            dash = re.sub(r'"uid": "\$datasource"', r'"uid": "${prometheusds}"', dash)
             dash = re.sub(
                 r'"datasource": "(!?\w)[\w|\s|-]+?Juju generated.*?"',
                 r'"datasource": "${prometheusds}"',
                 dash,
             )
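
The chain of `re.sub` calls above normalises legacy datasource references on the serialized dashboard string. A condensed, runnable version of those rewrites; the sample input is illustrative only:

```python
import re

def normalise_datasources(dash: str) -> str:
    """Rewrite old-style datasource references to the ${prometheusds} dropdown."""
    dash = re.sub(r"<< datasource >>", r"${prometheusds}", dash)
    dash = re.sub(r'"datasource": "prom.*?"', r'"datasource": "${prometheusds}"', dash)
    dash = re.sub(r'"datasource": "\$datasource"', r'"datasource": "${prometheusds}"', dash)
    dash = re.sub(r'"uid": "\$datasource"', r'"uid": "${prometheusds}"', dash)
    return dash

sample = '{"datasource": "$datasource", "uid": "$datasource"}'
print(normalise_datasources(sample))
# {"datasource": "${prometheusds}", "uid": "${prometheusds}"}
```
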
- dash = json.dumps(t["dashboard"]) + # + # Apparently SOME newer dashboards (such as Ceph) do not have this double nesting, so + # now we get to account for both :toot: + dash = t.get("dashboard", {}) or t + + # Replace values with LMA-style templating + dash = self._strip_existing_datasources(dash) + dash = json.dumps(dash) # Replace the old-style datasource templates dash = re.sub(r"<< datasource >>", r"${prometheusds}", dash) dash = re.sub(r'"datasource": "prom.*?"', r'"datasource": "${prometheusds}"', dash) + dash = re.sub( + r'"datasource": "\$datasource"', r'"datasource": "${prometheusds}"', dash + ) + dash = re.sub(r'"uid": "\$datasource"', r'"uid": "${prometheusds}"', dash) dash = re.sub( r'"datasource": "(!?\w)[\w|\s|-]+?Juju generated.*?"', r'"datasource": "${prometheusds}"', @@ -1811,12 +1813,17 @@ def _handle_reactive_dashboards(self, event: RelationEvent) -> Optional[Dict]: ) # Yank out "new"+old LMA topology - dash = re.sub(r'(,?juju_application=~)"\$app"', r'\1"\$juju_application"', dash) + dash = re.sub( + r'(,?\s?juju_application=~)\\"\$app\\"', r'\1\\"$juju_application\\"', dash + ) + + # Replace old piechart panels + dash = re.sub(r'"type": "grafana-piechart-panel"', '"type": "piechart"', dash) - from jinja2 import Template + from jinja2 import DebugUndefined, Template content = _encode_dashboard_content( - Template(dash).render(host=r"$host", datasource=r"${prometheusds}") # type: ignore + Template(dash, undefined=DebugUndefined).render(datasource=r"${prometheusds}") # type: ignore ) id = "prog:{}".format(content[-24:-16]) @@ -1980,7 +1987,7 @@ def inject_label_matchers(self, expression: str, topology: dict, type: str) -> s args.extend(["--", "{}".format(expression)]) # noinspection PyBroadException try: - return self._exec(args) + return re.sub(r'="\$juju', r'=~"$juju', self._exec(args)) except subprocess.CalledProcessError as e: logger.debug('Applying the expression failed: "%s", falling back to the original', e) return expression diff --git a/lib/charms/prometheus_k8s/v0/prometheus_scrape.py b/lib/charms/prometheus_k8s/v0/prometheus_scrape.py index 28ce507..cac364e 100644 --- a/lib/charms/prometheus_k8s/v0/prometheus_scrape.py +++ b/lib/charms/prometheus_k8s/v0/prometheus_scrape.py @@ -121,7 +121,7 @@ def __init__(self, *args): { "targets": ["10.1.32.215:7000", "*:8000"], "labels": { - "some-key": "some-value" + "some_key": "some-value" } } ] @@ -151,7 +151,7 @@ def __init__(self, *args): { "targets": ["*:7000"], "labels": { - "some-key": "some-value" + "some_key": "some-value" } } ] @@ -163,7 +163,7 @@ def __init__(self, *args): { "targets": ["*:8000"], "labels": { - "some-other-key": "some-other-value" + "some_other_key": "some-other-value" } } ] @@ -343,7 +343,9 @@ def _on_scrape_targets_changed(self, event): from collections import defaultdict from pathlib import Path from typing import Any, Callable, Dict, List, Optional, Tuple, Union +from urllib.error import HTTPError, URLError from urllib.parse import urlparse +from urllib.request import urlopen import yaml from charms.observability_libs.v0.juju_topology import JujuTopology @@ -368,7 +370,7 @@ def _on_scrape_targets_changed(self, event): # Increment this PATCH version before using `charmcraft publish-lib` or reset # to 0 if you are raising the major API version -LIBPATCH = 30 +LIBPATCH = 38 logger = logging.getLogger(__name__) @@ -389,6 +391,7 @@ def _on_scrape_targets_changed(self, event): "scheme", "basic_auth", "tls_config", + "authorization", } DEFAULT_JOB = { "metrics_path": "/metrics", @@ -599,15 
@@ -599,15 +602,22 @@ def render_alertmanager_static_configs(alertmanagers: List[str]):
     # Create a mapping from paths to netlocs
     # Group alertmanager targets into a dictionary of lists:
     # {path: [netloc1, netloc2]}
-    paths = defaultdict(list)  # type: Dict[str, List[str]]
+    paths = defaultdict(list)  # type: Dict[Tuple[str, str], List[str]]
     for parsed in map(urlparse, sanitized):
         path = parsed.path or "/"
-        paths[path].append(parsed.netloc)
+        paths[(parsed.scheme, path)].append(parsed.netloc)
 
     return {
         "alertmanagers": [
-            {"path_prefix": path_prefix, "static_configs": [{"targets": netlocs}]}
-            for path_prefix, netlocs in paths.items()
+            {
+                "scheme": scheme,
+                "path_prefix": path_prefix,
+                "static_configs": [{"targets": netlocs}],
+                # FIXME figure out how to get alertmanager's ca_file into here
+                # Without this, prom errors: "x509: certificate signed by unknown authority"
+                "tls_config": {"insecure_skip_verify": True},
+            }
+            for (scheme, path_prefix), netlocs in paths.items()
         ]
     }
@@ -686,23 +696,39 @@ def restore(self, snapshot):
         self.errors = snapshot["errors"]
 
 
+class InvalidScrapeJobEvent(EventBase):
+    """Event emitted when scrape jobs are not valid."""
+
+    def __init__(self, handle, errors: str = ""):
+        super().__init__(handle)
+        self.errors = errors
+
+    def snapshot(self) -> Dict:
+        """Save error information."""
+        return {"errors": self.errors}
+
+    def restore(self, snapshot):
+        """Restore error information."""
+        self.errors = snapshot["errors"]
+
+
 class MetricsEndpointProviderEvents(ObjectEvents):
     """Events raised by :class:`InvalidAlertRuleEvent`s."""
 
     alert_rule_status_changed = EventSource(InvalidAlertRuleEvent)
+    invalid_scrape_job = EventSource(InvalidScrapeJobEvent)
 
 
 def _type_convert_stored(obj):
     """Convert Stored* to their appropriate types, recursively."""
     if isinstance(obj, StoredList):
         return list(map(_type_convert_stored, obj))
-    elif isinstance(obj, StoredDict):
+    if isinstance(obj, StoredDict):
         rdict = {}  # type: Dict[Any, Any]
         for k in obj.keys():
             rdict[k] = _type_convert_stored(obj[k])
         return rdict
-    else:
-        return obj
+    return obj
 
 
 def _validate_relation_by_interface_and_direction(
@@ -1119,12 +1145,24 @@ def jobs(self) -> list:
         for relation in self._charm.model.relations[self._relation_name]:
             static_scrape_jobs = self._static_scrape_config(relation)
             if static_scrape_jobs:
-                scrape_jobs.extend(static_scrape_jobs)
+                # Duplicate job names will cause validate_scrape_jobs to fail.
+                # Therefore we need to dedupe here and after all jobs are collected.
+                static_scrape_jobs = _dedupe_job_names(static_scrape_jobs)
+                try:
+                    self._tool.validate_scrape_jobs(static_scrape_jobs)
+                except subprocess.CalledProcessError as e:
+                    if self._charm.unit.is_leader():
+                        data = json.loads(relation.data[self._charm.app].get("event", "{}"))
+                        data["scrape_job_errors"] = str(e)
+                        relation.data[self._charm.app]["event"] = json.dumps(data)
+                else:
+                    scrape_jobs.extend(static_scrape_jobs)
 
         scrape_jobs = _dedupe_job_names(scrape_jobs)
 
         return scrape_jobs
 
+    @property
     def alerts(self) -> dict:
         """Fetch alerts for all relations.
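
A runnable reduction of `render_alertmanager_static_configs` above: grouping by `(scheme, path_prefix)` instead of path alone keeps http and https alertmanagers in separate `static_configs`. The TLS FIXME from the patch is omitted here; the URLs are illustrative:

```python
from collections import defaultdict
from typing import Dict, List, Tuple
from urllib.parse import urlparse

def group_alertmanagers(urls: List[str]) -> dict:
    paths: Dict[Tuple[str, str], List[str]] = defaultdict(list)
    for parsed in map(urlparse, urls):
        paths[(parsed.scheme, parsed.path or "/")].append(parsed.netloc)
    return {
        "alertmanagers": [
            {"scheme": scheme, "path_prefix": prefix, "static_configs": [{"targets": netlocs}]}
            for (scheme, prefix), netlocs in paths.items()
        ]
    }

print(group_alertmanagers(["http://am0:9093/", "https://am1:9093/model-am"]))
```
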
@@ -1175,22 +1213,25 @@ def alerts(self) -> dict:
             if not alert_rules:
                 continue
 
-            try:
-                scrape_metadata = json.loads(relation.data[relation.app]["scrape_metadata"])
-                identifier = JujuTopology.from_dict(scrape_metadata).identifier
-                alerts[identifier] = self._tool.apply_label_matchers(alert_rules)
-
-            except KeyError as e:
-                logger.debug(
-                    "Relation %s has no 'scrape_metadata': %s",
-                    relation.id,
-                    e,
-                )
-                identifier = self._get_identifier_by_alert_rules(alert_rules)
+            alert_rules = self._inject_alert_expr_labels(alert_rules)
+
+            identifier, topology = self._get_identifier_by_alert_rules(alert_rules)
+            if not topology:
+                try:
+                    scrape_metadata = json.loads(relation.data[relation.app]["scrape_metadata"])
+                    identifier = JujuTopology.from_dict(scrape_metadata).identifier
+                    alerts[identifier] = self._tool.apply_label_matchers(alert_rules)  # type: ignore
+
+                except KeyError as e:
+                    logger.debug(
+                        "Relation %s has no 'scrape_metadata': %s",
+                        relation.id,
+                        e,
+                    )
 
             if not identifier:
                 logger.error(
-                    "Alert rules were found but no usable group or identifier was present"
+                    "Alert rules were found but no usable group or identifier was present."
                 )
                 continue
@@ -1200,12 +1241,17 @@ def alerts(self) -> dict:
             if errmsg:
                 if alerts[identifier]:
                     del alerts[identifier]
-                relation.data[self._charm.app]["event"] = json.dumps({"errors": errmsg})
+                if self._charm.unit.is_leader():
+                    data = json.loads(relation.data[self._charm.app].get("event", "{}"))
+                    data["errors"] = errmsg
+                    relation.data[self._charm.app]["event"] = json.dumps(data)
                 continue
 
         return alerts
 
-    def _get_identifier_by_alert_rules(self, rules: dict) -> Union[str, None]:
+    def _get_identifier_by_alert_rules(
+        self, rules: dict
+    ) -> Tuple[Union[str, None], Union[JujuTopology, None]]:
         """Determine an appropriate dict key for alert rules.
 
         The key is used as the filename when writing alerts to disk, so the structure
@@ -1213,21 +1259,28 @@ def _get_identifier_by_alert_rules(self, rules: dict) -> Union[str, None]:
         Args:
             rules: a dict of alert rules
+        Returns:
+            A tuple containing an identifier, if found, and a JujuTopology, if it could
+            be constructed.
         """
         if "groups" not in rules:
             logger.debug("No alert groups were found in relation data")
-            return None
+            return None, None
 
         # Construct an ID based on what's in the alert rules if they have labels
         for group in rules["groups"]:
             try:
                 labels = group["rules"][0]["labels"]
-                identifier = "{}_{}_{}".format(
-                    labels["juju_model"],
-                    labels["juju_model_uuid"],
-                    labels["juju_application"],
+                topology = JujuTopology(
+                    # Don't try to safely get required constructor fields. There's already
+                    # a handler for KeyErrors
+                    model_uuid=labels["juju_model_uuid"],
+                    model=labels["juju_model"],
+                    application=labels["juju_application"],
+                    unit=labels.get("juju_unit", ""),
+                    charm_name=labels.get("juju_charm", ""),
                 )
-                return identifier
+                return topology.identifier, topology
             except KeyError:
                 logger.debug("Alert rules were found but no usable labels were present")
                 continue
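
The reworked `alerts` flow above prefers topology reconstructed from the rules' own labels and only falls back to relation `scrape_metadata`. A sketch of the label-driven identifier; the `model_uuid_application` shape mirrors the string the removed code built and stands in for `JujuTopology.identifier`:

```python
def identifier_from_labels(labels: dict):
    """Derive a rule-group identifier from Juju topology labels, if present."""
    try:
        return "{}_{}_{}".format(
            labels["juju_model"], labels["juju_model_uuid"], labels["juju_application"]
        )
    except KeyError:
        return None  # caller falls back to scrape_metadata, as in the hunk above

labels = {"juju_model": "lma", "juju_model_uuid": "1234-abcd", "juju_application": "am"}
print(identifier_from_labels(labels))  # lma_1234-abcd_am
print(identifier_from_labels({}))      # None
```
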
@@ -1238,11 +1291,55 @@ def _get_identifier_by_alert_rules(self, rules: dict) -> Union[str, None]:
         )
         try:
             for group in rules["groups"]:
-                return group["name"]
+                return group["name"], None
         except KeyError:
             logger.debug("No group name was found to use as identifier")
 
-        return None
+        return None, None
+
+    def _inject_alert_expr_labels(self, rules: Dict[str, Any]) -> Dict[str, Any]:
+        """Iterate through alert rules and inject topology into expressions.
+
+        Args:
+            rules: a dict of alert rules
+        """
+        if "groups" not in rules:
+            return rules
+
+        modified_groups = []
+        for group in rules["groups"]:
+            # Copy off rules, so we don't modify an object we're iterating over
+            rules_copy = group["rules"]
+            for idx, rule in enumerate(rules_copy):
+                labels = rule.get("labels")
+
+                if labels:
+                    try:
+                        topology = JujuTopology(
+                            # Don't try to safely get required constructor fields. There's already
+                            # a handler for KeyErrors
+                            model_uuid=labels["juju_model_uuid"],
+                            model=labels["juju_model"],
+                            application=labels["juju_application"],
+                            unit=labels.get("juju_unit", ""),
+                            charm_name=labels.get("juju_charm", ""),
+                        )
+
+                        # Inject topology and put it back in the list
+                        rule["expr"] = self._tool.inject_label_matchers(
+                            re.sub(r"%%juju_topology%%,?", "", rule["expr"]),
+                            topology.label_matcher_dict,
+                        )
+                    except KeyError:
+                        # Some required JujuTopology key is missing. Just move on.
+                        pass
+
+                group["rules"][idx] = rule
+
+            modified_groups.append(group)
+
+        rules["groups"] = modified_groups
+        return rules
 
     def _static_scrape_config(self, relation) -> list:
         """Generate the static scrape configuration for a single relation.
@@ -1263,29 +1360,39 @@ def _static_scrape_config(self, relation) -> list:
         if not relation.units:
             return []
 
-        scrape_jobs = json.loads(relation.data[relation.app].get("scrape_jobs", "[]"))
+        scrape_configs = json.loads(relation.data[relation.app].get("scrape_jobs", "[]"))
 
-        if not scrape_jobs:
+        if not scrape_configs:
             return []
 
         scrape_metadata = json.loads(relation.data[relation.app].get("scrape_metadata", "{}"))
 
         if not scrape_metadata:
-            return scrape_jobs
+            return scrape_configs
 
         topology = JujuTopology.from_dict(scrape_metadata)
 
         job_name_prefix = "juju_{}_prometheus_scrape".format(topology.identifier)
-        scrape_jobs = PrometheusConfig.prefix_job_names(scrape_jobs, job_name_prefix)
-        scrape_jobs = PrometheusConfig.sanitize_scrape_configs(scrape_jobs)
+        scrape_configs = PrometheusConfig.prefix_job_names(scrape_configs, job_name_prefix)
+        scrape_configs = PrometheusConfig.sanitize_scrape_configs(scrape_configs)
 
         hosts = self._relation_hosts(relation)
 
-        scrape_jobs = PrometheusConfig.expand_wildcard_targets_into_individual_jobs(
-            scrape_jobs, hosts, topology
+        scrape_configs = PrometheusConfig.expand_wildcard_targets_into_individual_jobs(
+            scrape_configs, hosts, topology
         )
 
-        return scrape_jobs
+        # If scheme is https but no ca section present, then auto add "insecure_skip_verify",
+        # otherwise scraping errors out with "x509: certificate signed by unknown authority".
+        # https://prometheus.io/docs/prometheus/latest/configuration/configuration/#tls_config
+        for scrape_config in scrape_configs:
+            tls_config = scrape_config.get("tls_config", {})
+            ca_present = "ca" in tls_config or "ca_file" in tls_config
+            if scrape_config.get("scheme") == "https" and not ca_present:
+                tls_config["insecure_skip_verify"] = True
+                scrape_config["tls_config"] = tls_config
+
+        return scrape_configs
 
     def _relation_hosts(self, relation: Relation) -> Dict[str, Tuple[str, str]]:
         """Returns a mapping from unit names to (address, path) tuples, for the given relation."""
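
The https/CA handling added above, reduced to a runnable check: any job that scrapes over https without supplying a CA gets `insecure_skip_verify`, avoiding Prometheus' "x509: certificate signed by unknown authority" failure:

```python
def ensure_tls_config(scrape_configs: list) -> list:
    for scrape_config in scrape_configs:
        tls_config = scrape_config.get("tls_config", {})
        ca_present = "ca" in tls_config or "ca_file" in tls_config
        if scrape_config.get("scheme") == "https" and not ca_present:
            tls_config["insecure_skip_verify"] = True
            scrape_config["tls_config"] = tls_config
    return scrape_configs

jobs = [{"job_name": "example", "scheme": "https"}]
print(ensure_tls_config(jobs))
# [{'job_name': 'example', 'scheme': 'https', 'tls_config': {'insecure_skip_verify': True}}]
```
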
@@ -1350,7 +1457,7 @@ def _dedupe_job_names(jobs: List[dict]):
                 job["job_name"] = "{}_{}".format(job["job_name"], hashed)
     new_jobs = []
     for key in jobs_dict:
-        new_jobs.extend([i for i in jobs_dict[key]])
+        new_jobs.extend(list(jobs_dict[key]))
 
     # Deduplicate jobs which are equal
     # Again this in O(n^2) but it should be okay
@@ -1609,6 +1716,10 @@ def _on_relation_changed(self, event):
             else:
                 self.on.alert_rule_status_changed.emit(valid=valid, errors=errors)
 
+            scrape_errors = ev.get("scrape_job_errors", None)
+            if scrape_errors:
+                self.on.invalid_scrape_job.emit(errors=scrape_errors)
+
     def update_scrape_job_spec(self, jobs):
         """Update scrape job specification."""
         self._jobs = PrometheusConfig.sanitize_scrape_configs(jobs)
@@ -1699,11 +1810,10 @@ def _scrape_jobs(self) -> list:
             A list of dictionaries, where each dictionary specifies a single scrape job
             for Prometheus.
         """
-        jobs = self._jobs if self._jobs else [DEFAULT_JOB]
+        jobs = self._jobs or []
         if callable(self._lookaside_jobs):
-            return jobs + PrometheusConfig.sanitize_scrape_configs(self._lookaside_jobs())
-        else:
-            return jobs
+            jobs.extend(PrometheusConfig.sanitize_scrape_configs(self._lookaside_jobs()))
+        return jobs or [DEFAULT_JOB]
@@ -1936,6 +2046,9 @@ def _set_prometheus_data(self, event):
         `MetricsEndpointAggregator`, that Prometheus unit is provided with the complete
         set of existing scrape jobs and alert rules.
         """
+        if not self._charm.unit.is_leader():
+            return
+
         jobs = [] + _type_convert_stored(
             self._stored.jobs
         )  # list of scrape jobs, one per relation
@@ -1981,7 +2094,11 @@ def set_target_job_data(self, targets: dict, app_name: str, **kwargs) -> None:
         Args:
             targets: a `dict` containing target information
             app_name: a `str` identifying the application
+            kwargs: a `dict` of the extra arguments passed to the function
         """
+        if not self._charm.unit.is_leader():
+            return
+
         # new scrape job for the relation that has changed
         updated_job = self._static_scrape_job(targets, app_name, **kwargs)
@@ -2015,6 +2132,9 @@ def remove_prometheus_jobs(self, job_name: str, unit_name: Optional[str] = ""):
         For NRPE, the job name is calculated from an ID sent via the NRPE relation, and is
         sufficient to uniquely identify the target.
         """
+        if not self._charm.unit.is_leader():
+            return
+
         for relation in self.model.relations[self._prometheus_relation]:
             jobs = json.loads(relation.data[self._charm.app].get("scrape_jobs", "[]"))
             if not jobs:
@@ -2100,6 +2220,7 @@ def _static_scrape_job(self, targets, application_name, **kwargs) -> dict:
                 "port".
             application_name: a string name of the application for
                 which this static scrape job is being constructed.
+            kwargs: a `dict` of the extra arguments passed to the function
 
         Returns:
             A dictionary corresponding to a Prometheus static scrape
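
The `_scrape_jobs` rewrite above fixes an ordering bug: `DEFAULT_JOB` should only be the fallback when neither static nor lookaside jobs exist, so the default now applies after extending rather than before. A simplified sketch (sanitization dropped for brevity):

```python
from typing import Callable, List, Optional

DEFAULT_JOB = {"metrics_path": "/metrics"}

def scrape_jobs(static: Optional[list], lookaside: Optional[Callable[[], list]]) -> List[dict]:
    jobs = list(static or [])
    if callable(lookaside):
        jobs.extend(lookaside())
    return jobs or [DEFAULT_JOB]

assert scrape_jobs(None, None) == [DEFAULT_JOB]
assert scrape_jobs(None, lambda: [{"job_name": "side"}]) == [{"job_name": "side"}]
assert scrape_jobs([{"job_name": "static"}], None) == [{"job_name": "static"}]
```
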
@@ -2121,6 +2242,8 @@ def _static_scrape_job(self, targets, application_name, **kwargs) -> dict:
                     "juju_application": application_name,
                     "juju_unit": unit_name,
                     "host": target["hostname"],
+                    # Expanding this will merge the dicts and replace the
+                    # topology labels if any were present/found
                     **self._static_config_extra_labels(target),
                 },
             }
@@ -2143,7 +2266,16 @@ def _static_config_extra_labels(self, target: Dict[str, str]) -> Dict[str, str]:
             logger.debug("Could not perform DNS lookup for %s", target["hostname"])
             dns_name = target["hostname"]
         extra_info["dns_name"] = dns_name
+        label_re = re.compile(r'(?P