diff --git a/lib/charms/grafana_k8s/v0/grafana_dashboard.py b/lib/charms/grafana_k8s/v0/grafana_dashboard.py index 1623517..1f1bc4f 100644 --- a/lib/charms/grafana_k8s/v0/grafana_dashboard.py +++ b/lib/charms/grafana_k8s/v0/grafana_dashboard.py @@ -218,7 +218,8 @@ def __init__(self, *args): # Increment this PATCH version before using `charmcraft publish-lib` or reset # to 0 if you are raising the major API version -LIBPATCH = 24 + +LIBPATCH = 35 logger = logging.getLogger(__name__) @@ -229,7 +230,7 @@ def __init__(self, *args): TOPOLOGY_TEMPLATE_DROPDOWNS = [ # type: ignore { - "allValue": None, + "allValue": ".*", "datasource": "${prometheusds}", "definition": "label_values(up,juju_model)", "description": None, @@ -254,9 +255,9 @@ def __init__(self, *args): "useTags": False, }, { - "allValue": None, + "allValue": ".*", "datasource": "${prometheusds}", - "definition": 'label_values(up{juju_model="$juju_model"},juju_model_uuid)', + "definition": 'label_values(up{juju_model=~"$juju_model"},juju_model_uuid)', "description": None, "error": None, "hide": 0, @@ -265,7 +266,7 @@ def __init__(self, *args): "multi": True, "name": "juju_model_uuid", "query": { - "query": 'label_values(up{juju_model="$juju_model"},juju_model_uuid)', + "query": 'label_values(up{juju_model=~"$juju_model"},juju_model_uuid)', "refId": "StandardVariableQuery", }, "refresh": 1, @@ -279,9 +280,9 @@ def __init__(self, *args): "useTags": False, }, { - "allValue": None, + "allValue": ".*", "datasource": "${prometheusds}", - "definition": 'label_values(up{juju_model="$juju_model",juju_model_uuid="$juju_model_uuid"},juju_application)', + "definition": 'label_values(up{juju_model=~"$juju_model",juju_model_uuid=~"$juju_model_uuid"},juju_application)', "description": None, "error": None, "hide": 0, @@ -290,7 +291,7 @@ def __init__(self, *args): "multi": True, "name": "juju_application", "query": { - "query": 'label_values(up{juju_model="$juju_model",juju_model_uuid="$juju_model_uuid"},juju_application)', + "query": 'label_values(up{juju_model=~"$juju_model",juju_model_uuid=~"$juju_model_uuid"},juju_application)', "refId": "StandardVariableQuery", }, "refresh": 1, @@ -304,9 +305,9 @@ def __init__(self, *args): "useTags": False, }, { - "allValue": None, + "allValue": ".*", "datasource": "${prometheusds}", - "definition": 'label_values(up{juju_model="$juju_model",juju_model_uuid="$juju_model_uuid",juju_application="$juju_application"},juju_unit)', + "definition": 'label_values(up{juju_model=~"$juju_model",juju_model_uuid=~"$juju_model_uuid",juju_application=~"$juju_application"},juju_unit)', "description": None, "error": None, "hide": 0, @@ -315,7 +316,7 @@ def __init__(self, *args): "multi": True, "name": "juju_unit", "query": { - "query": 'label_values(up{juju_model="$juju_model",juju_model_uuid="$juju_model_uuid",juju_application="$juju_application"},juju_unit)', + "query": 'label_values(up{juju_model=~"$juju_model",juju_model_uuid=~"$juju_model_uuid",juju_application=~"$juju_application"},juju_unit)', "refId": "StandardVariableQuery", }, "refresh": 1, @@ -336,7 +337,7 @@ def __init__(self, *args): "error": None, "hide": 0, "includeAll": True, - "label": None, + "label": "Prometheus datasource", "multi": True, "name": "prometheusds", "options": [], @@ -351,7 +352,7 @@ def __init__(self, *args): "error": None, "hide": 0, "includeAll": True, - "label": None, + "label": "Loki datasource", "multi": True, "name": "lokids", "options": [], @@ -366,7 +367,7 @@ def __init__(self, *args): REACTIVE_CONVERTER = { # type: ignore "allValue": 
None, "datasource": "${prometheusds}", - "definition": 'label_values(up{juju_model="$juju_model",juju_model_uuid="$juju_model_uuid",juju_application="$juju_application"},host)', + "definition": 'label_values(up{juju_model=~"$juju_model",juju_model_uuid=~"$juju_model_uuid",juju_application=~"$juju_application"},host)', "description": None, "error": None, "hide": 0, @@ -376,7 +377,7 @@ def __init__(self, *args): "name": "host", "options": [], "query": { - "query": 'label_values(up{juju_model="$juju_model",juju_model_uuid="$juju_model_uuid",juju_application="$juju_application"},host)', + "query": 'label_values(up{juju_model=~"$juju_model",juju_model_uuid=~"$juju_model_uuid",juju_application=~"$juju_application"},host)', "refId": "StandardVariableQuery", }, "refresh": 1, @@ -524,7 +525,7 @@ def _validate_relation_by_interface_and_direction( relation = charm.meta.relations[relation_name] actual_relation_interface = relation.interface_name - if actual_relation_interface != expected_relation_interface: + if actual_relation_interface and actual_relation_interface != expected_relation_interface: raise RelationInterfaceMismatchError( relation_name, expected_relation_interface, actual_relation_interface ) @@ -581,7 +582,7 @@ def _convert_dashboard_fields(content: str, inject_dropdowns: bool = True) -> st # If no existing template variables exist, just insert our own if "templating" not in dict_content: - dict_content["templating"] = {"list": [d for d in template_dropdowns]} # type: ignore + dict_content["templating"] = {"list": list(template_dropdowns)} # type: ignore else: # Otherwise, set a flag so we can go back later existing_templates = True @@ -664,14 +665,14 @@ def _template_panels( continue if not existing_templates: datasource = panel.get("datasource") - if type(datasource) == str: + if isinstance(datasource, str): if "loki" in datasource: panel["datasource"] = "${lokids}" elif "grafana" in datasource: continue else: panel["datasource"] = "${prometheusds}" - elif type(datasource) == dict: + elif isinstance(datasource, dict): # In dashboards exported by Grafana 9, datasource type is dict dstype = datasource.get("type", "") if dstype == "loki": @@ -685,7 +686,7 @@ def _template_panels( logger.error("Unknown datasource format: skipping") continue else: - if type(panel["datasource"]) == str: + if isinstance(panel["datasource"], str): if panel["datasource"].lower() in replacements.values(): # Already a known template variable continue @@ -700,7 +701,7 @@ def _template_panels( if replacement: used_replacements.append(ds) panel["datasource"] = replacement or panel["datasource"] - elif type(panel["datasource"]) == dict: + elif isinstance(panel["datasource"], dict): dstype = panel["datasource"].get("type", "") if panel["datasource"].get("uid", "").lower() in replacements.values(): # Already a known template variable @@ -789,7 +790,7 @@ def _inject_labels(content: str, topology: dict, transformer: "CosTool") -> str: # We need to use an index so we can insert the changed element back later for panel_idx, panel in enumerate(panels): - if type(panel) is not dict: + if not isinstance(panel, dict): continue # Use the index to insert it back in the same location @@ -829,18 +830,18 @@ def _modify_panel(panel: dict, topology: dict, transformer: "CosTool") -> dict: if "datasource" not in panel.keys(): continue - else: - if type(panel["datasource"]) == str: - if panel["datasource"] not in known_datasources: - continue - querytype = known_datasources[panel["datasource"]] - elif type(panel["datasource"]) == dict: 
- if panel["datasource"]["uid"] not in known_datasources: - continue - querytype = known_datasources[panel["datasource"]["uid"]] - else: - logger.error("Unknown datasource format: skipping") + + if isinstance(panel["datasource"], str): + if panel["datasource"] not in known_datasources: + continue + querytype = known_datasources[panel["datasource"]] + elif isinstance(panel["datasource"], dict): + if panel["datasource"]["uid"] not in known_datasources: continue + querytype = known_datasources[panel["datasource"]["uid"]] + else: + logger.error("Unknown datasource format: skipping") + continue # Capture all values inside `[]` into a list which we'll iterate over later to # put them back in-order. Then apply the regex again and replace everything with @@ -900,13 +901,12 @@ def _type_convert_stored(obj): """Convert Stored* to their appropriate types, recursively.""" if isinstance(obj, StoredList): return list(map(_type_convert_stored, obj)) - elif isinstance(obj, StoredDict): + if isinstance(obj, StoredDict): rdict = {} # type: Dict[Any, Any] for k in obj.keys(): rdict[k] = _type_convert_stored(obj[k]) return rdict - else: - return obj + return obj class GrafanaDashboardsChanged(EventBase): @@ -955,7 +955,7 @@ def restore(self, snapshot): """Restore grafana source information.""" self.error_message = snapshot["error_message"] self.valid = snapshot["valid"] - self.errors = json.loads(snapshot["errors"]) + self.errors = json.loads(str(snapshot["errors"])) class GrafanaProviderEvents(ObjectEvents): @@ -968,7 +968,7 @@ class GrafanaDashboardProvider(Object): """An API to provide Grafana dashboards to a Grafana charm.""" _stored = StoredState() - on = GrafanaProviderEvents() + on = GrafanaProviderEvents() # pyright: ignore def __init__( self, @@ -1072,7 +1072,7 @@ def add_dashboard(self, content: str, inject_dropdowns: bool = True) -> None: """ # Update of storage must be done irrespective of leadership, so # that the stored state is there when this unit becomes leader. - stored_dashboard_templates = self._stored.dashboard_templates # type: Any + stored_dashboard_templates: Any = self._stored.dashboard_templates # pyright: ignore encoded_dashboard = _encode_dashboard_content(content) @@ -1093,7 +1093,7 @@ def remove_non_builtin_dashboards(self) -> None: """Remove all dashboards to the relation added via :method:`add_dashboard`.""" # Update of storage must be done irrespective of leadership, so # that the stored state is there when this unit becomes leader. - stored_dashboard_templates = self._stored.dashboard_templates # type: Any + stored_dashboard_templates: Any = self._stored.dashboard_templates # pyright: ignore for dashboard_id in list(stored_dashboard_templates.keys()): if dashboard_id.startswith("prog:"): @@ -1120,7 +1120,7 @@ def _update_all_dashboards_from_dir( # Ensure we do not leave outdated dashboards by removing from stored all # the encoded dashboards that start with "file/". 
if self._dashboards_path: - stored_dashboard_templates = self._stored.dashboard_templates # type: Any + stored_dashboard_templates: Any = self._stored.dashboard_templates # pyright: ignore for dashboard_id in list(stored_dashboard_templates.keys()): if dashboard_id.startswith("file:"): @@ -1174,7 +1174,7 @@ def _reinitialize_dashboard_data(self, inject_dropdowns: bool = True) -> None: e.grafana_dashboards_absolute_path, e.message, ) - stored_dashboard_templates = self._stored.dashboard_templates # type: Any + stored_dashboard_templates: Any = self._stored.dashboard_templates # pyright: ignore for dashboard_id in list(stored_dashboard_templates.keys()): if dashboard_id.startswith("file:"): @@ -1195,6 +1195,7 @@ def _on_grafana_dashboard_relation_created(self, event: RelationCreatedEvent) -> `grafana_dashboaard` relationship is joined """ if self._charm.unit.is_leader(): + self._update_all_dashboards_from_dir() self._upset_dashboards_on_relation(event.relation) def _on_grafana_dashboard_relation_changed(self, event: RelationChangedEvent) -> None: @@ -1212,16 +1213,18 @@ def _on_grafana_dashboard_relation_changed(self, event: RelationChangedEvent) -> valid = bool(data.get("valid", True)) errors = data.get("errors", []) if valid and not errors: - self.on.dashboard_status_changed.emit(valid=valid) + self.on.dashboard_status_changed.emit(valid=valid) # pyright: ignore else: - self.on.dashboard_status_changed.emit(valid=valid, errors=errors) + self.on.dashboard_status_changed.emit( # pyright: ignore + valid=valid, errors=errors + ) def _upset_dashboards_on_relation(self, relation: Relation) -> None: """Update the dashboards in the relation data bucket.""" # It's completely ridiculous to add a UUID, but if we don't have some # pseudo-random value, this never makes it across 'juju set-state' stored_data = { - "templates": _type_convert_stored(self._stored.dashboard_templates), + "templates": _type_convert_stored(self._stored.dashboard_templates), # pyright: ignore "uuid": str(uuid.uuid4()), } @@ -1250,13 +1253,13 @@ def _juju_topology(self) -> Dict: @property def dashboard_templates(self) -> List: """Return a list of the known dashboard templates.""" - return [v for v in self._stored.dashboard_templates.values()] # type: ignore + return list(self._stored.dashboard_templates.values()) # type: ignore class GrafanaDashboardConsumer(Object): """A consumer object for working with Grafana Dashboards.""" - on = GrafanaDashboardEvents() + on = GrafanaDashboardEvents() # pyright: ignore _stored = StoredState() def __init__( @@ -1304,7 +1307,7 @@ def __init__( self._relation_name = relation_name self._tranformer = CosTool(self._charm) - self._stored.set_default(dashboards=dict()) # type: ignore + self._stored.set_default(dashboards={}) # type: ignore self.framework.observe( self._charm.on[self._relation_name].relation_changed, @@ -1348,13 +1351,13 @@ def _on_grafana_dashboard_relation_changed(self, event: RelationChangedEvent) -> changes = self._render_dashboards_and_signal_changed(event.relation) if changes: - self.on.dashboards_changed.emit() + self.on.dashboards_changed.emit() # pyright: ignore def _on_grafana_peer_changed(self, _: RelationChangedEvent) -> None: """Emit dashboard events on peer events so secondary charm data updates.""" if self._charm.unit.is_leader(): return - self.on.dashboards_changed.emit() + self.on.dashboards_changed.emit() # pyright: ignore def update_dashboards(self, relation: Optional[Relation] = None) -> None: """Re-establish dashboards on one or more relations. 
@@ -1401,7 +1404,7 @@ def _render_dashboards_and_signal_changed(self, relation: Relation) -> bool: # """ other_app = relation.app - raw_data = relation.data[other_app].get("dashboards", {}) # type: ignore + raw_data = relation.data[other_app].get("dashboards", "") # pyright: ignore if not raw_data: logger.warning( @@ -1416,11 +1419,6 @@ def _render_dashboards_and_signal_changed(self, relation: Relation) -> bool: # # The only piece of data needed on this side of the relations is "templates" templates = data.pop("templates") - # Import only if a charmed operator uses the consumer, we don't impose these - # dependencies on the client - from jinja2 import Template - from jinja2.exceptions import TemplateSyntaxError - # The dashboards are WAY too big since this ultimately calls out to Juju to # set the relation data, and it overflows the maximum argument length for # subprocess, so we have to use b64, annoyingly. @@ -1433,14 +1431,12 @@ def _render_dashboards_and_signal_changed(self, relation: Relation) -> bool: # relation_has_invalid_dashboards = False for _, (fname, template) in enumerate(templates.items()): - decoded_content = None content = None error = None topology = template.get("juju_topology", {}) try: - decoded_content = _decode_dashboard_content(template["content"]) + content = _decode_dashboard_content(template["content"]) inject_dropdowns = template.get("inject_dropdowns", True) - content = Template(decoded_content).render() content = self._manage_dashboard_uid(content, template) content = _convert_dashboard_fields(content, inject_dropdowns) @@ -1455,9 +1451,6 @@ def _render_dashboards_and_signal_changed(self, relation: Relation) -> bool: # error = str(e.msg) logger.warning("Invalid JSON in Grafana dashboard: {}".format(fname)) continue - except TemplateSyntaxError as e: - error = str(e) - relation_has_invalid_dashboards = True # Prepend the relation name and ID to the dashboard ID to avoid clashes with # multiple relations with apps from the same charm, or having dashboards with @@ -1504,28 +1497,27 @@ def _render_dashboards_and_signal_changed(self, relation: Relation) -> bool: # # Dropping dashboards for a relation needs to be signalled return True - else: - stored_data = rendered_dashboards - currently_stored_data = self._get_stored_dashboards(relation.id) - coerced_data = ( - _type_convert_stored(currently_stored_data) if currently_stored_data else {} - ) + stored_data = rendered_dashboards + currently_stored_data = self._get_stored_dashboards(relation.id) + + coerced_data = _type_convert_stored(currently_stored_data) if currently_stored_data else {} - if not coerced_data == stored_data: - stored_dashboards = self.get_peer_data("dashboards") - stored_dashboards[relation.id] = stored_data - self.set_peer_data("dashboards", stored_dashboards) - return True + if not coerced_data == stored_data: + stored_dashboards = self.get_peer_data("dashboards") + stored_dashboards[relation.id] = stored_data + self.set_peer_data("dashboards", stored_dashboards) + return True + return None # type: ignore def _manage_dashboard_uid(self, dashboard: str, template: dict) -> str: """Add an uid to the dashboard if it is not present.""" - dashboard = json.loads(dashboard) + dashboard_dict = json.loads(dashboard) - if not dashboard.get("uid", None) and "dashboard_alt_uid" in template: - dashboard["uid"] = template["dashboard_alt_uid"] + if not dashboard_dict.get("uid", None) and "dashboard_alt_uid" in template: + dashboard_dict["uid"] = template["dashboard_alt_uid"] - return json.dumps(dashboard) + 
return json.dumps(dashboard_dict) def _remove_all_dashboards_for_relation(self, relation: Relation) -> None: """If an errored dashboard is in stored data, remove it and trigger a deletion.""" @@ -1533,7 +1525,7 @@ def _remove_all_dashboards_for_relation(self, relation: Relation) -> None: stored_dashboards = self.get_peer_data("dashboards") stored_dashboards.pop(str(relation.id)) self.set_peer_data("dashboards", stored_dashboards) - self.on.dashboards_changed.emit() + self.on.dashboards_changed.emit() # pyright: ignore def _to_external_object(self, relation_id, dashboard): return { @@ -1615,7 +1607,7 @@ class GrafanaDashboardAggregator(Object): """ _stored = StoredState() - on = GrafanaProviderEvents() + on = GrafanaProviderEvents() # pyright: ignore def __init__( self, @@ -1680,31 +1672,37 @@ def _update_remote_grafana(self, _: Optional[RelationEvent] = None) -> None: """Push dashboards to the downstream Grafana relation.""" # It's still ridiculous to add a UUID here, but needed stored_data = { - "templates": _type_convert_stored(self._stored.dashboard_templates), + "templates": _type_convert_stored(self._stored.dashboard_templates), # pyright: ignore "uuid": str(uuid.uuid4()), } - for grafana_relation in self.model.relations[self._grafana_relation]: - grafana_relation.data[self._charm.app]["dashboards"] = json.dumps(stored_data) + if self._charm.unit.is_leader(): + for grafana_relation in self.model.relations[self._grafana_relation]: + grafana_relation.data[self._charm.app]["dashboards"] = json.dumps(stored_data) def remove_dashboards(self, event: RelationBrokenEvent) -> None: """Remove a dashboard if the relation is broken.""" - app_ids = _type_convert_stored(self._stored.id_mappings[event.app.name]) # type: ignore + app_ids = _type_convert_stored(self._stored.id_mappings.get(event.app.name, "")) # type: ignore + + if not app_ids: + logger.info("Could not look up stored dashboards for %s", event.app.name) # type: ignore + return del self._stored.id_mappings[event.app.name] # type: ignore for id in app_ids: del self._stored.dashboard_templates[id] # type: ignore stored_data = { - "templates": _type_convert_stored(self._stored.dashboard_templates), + "templates": _type_convert_stored(self._stored.dashboard_templates), # pyright: ignore "uuid": str(uuid.uuid4()), } - for grafana_relation in self.model.relations[self._grafana_relation]: - grafana_relation.data[self._charm.app]["dashboards"] = json.dumps(stored_data) + if self._charm.unit.is_leader(): + for grafana_relation in self.model.relations[self._grafana_relation]: + grafana_relation.data[self._charm.app]["dashboards"] = json.dumps(stored_data) # Yes, this has a fair amount of branching. It's not that complex, though - def _strip_existing_datasources(self, template: dict) -> dict: # noqa: C901 + def _strip_existing_datasources(self, dash: dict) -> dict: # noqa: C901 """Remove existing reactive charm datasource templating out. This method iterates through *known* places where reactive charms may set @@ -1713,7 +1711,7 @@ def _strip_existing_datasources(self, template: dict) -> dict: # noqa: C901 `dashboard["__inputs"]` is a property sometimes set when exporting dashboards from the Grafana UI. It is not present in earlier Grafana versions, and can be disabled in 5.3.4 and above (optionally). If set, any values present will be substituted on - import. Some reactive charms use this for Prometheus. LMA2 uses dropdown selectors + import. Some reactive charms use this for Prometheus. 
COS uses dropdown selectors for datasources, and leaving this present results in "default" datasource values which are broken. @@ -1723,20 +1721,15 @@ def _strip_existing_datasources(self, template: dict) -> dict: # noqa: C901 Further properties may be discovered. """ - dash = template["dashboard"] try: if "list" in dash["templating"]: for i in range(len(dash["templating"]["list"])): if ( "datasource" in dash["templating"]["list"][i] - and "Juju" in dash["templating"]["list"][i]["datasource"] - ): - dash["templating"]["list"][i]["datasource"] = r"${prometheusds}" - if ( - "name" in dash["templating"]["list"][i] - and dash["templating"]["list"][i]["name"] == "host" + and dash["templating"]["list"][i]["datasource"] is not None ): - dash["templating"]["list"][i] = REACTIVE_CONVERTER + if "Juju" in dash["templating"]["list"][i].get("datasource", ""): + dash["templating"]["list"][i]["datasource"] = r"${prometheusds}" # Strip out newly-added 'juju_application' template variables which # don't line up with our drop-downs @@ -1744,7 +1737,7 @@ def _strip_existing_datasources(self, template: dict) -> dict: # noqa: C901 for i in range(len(dash["templating"]["list"])): if ( "name" in dash["templating"]["list"][i] - and dash["templating"]["list"][i]["name"] == "app" + and dash["templating"]["list"][i].get("name", "") == "app" ): del dash_mutable["templating"]["list"][i] @@ -1756,18 +1749,20 @@ def _strip_existing_datasources(self, template: dict) -> dict: # noqa: C901 if "__inputs" in dash: inputs = dash for i in range(len(dash["__inputs"])): - if dash["__inputs"][i]["pluginName"] == "Prometheus": + if dash["__inputs"][i].get("pluginName", "") == "Prometheus": del inputs["__inputs"][i] if inputs: dash["__inputs"] = inputs["__inputs"] else: del dash["__inputs"] - template["dashboard"] = dash - return template + return dash def _handle_reactive_dashboards(self, event: RelationEvent) -> Optional[Dict]: """Look for a dashboard in relation data (during a reactive hook) or builtin by name.""" + if not self._charm.unit.is_leader(): + return {} + templates = [] id = "" @@ -1790,20 +1785,28 @@ def _handle_reactive_dashboards(self, event: RelationEvent) -> Optional[Dict]: dashboards = {} for t in templates: - # Replace values with LMA-style templating - t = self._strip_existing_datasources(t) - # This seems ridiculous, too, but to get it from a "dashboards" key in serialized JSON # in the bucket back out to the actual "dashboard" we _need_, this is the way # This is not a mistake -- there's a double nesting in reactive charms, and # Grafana won't load it. We have to unbox: # event.relation.data[event.]["request_*"]["dashboard"]["dashboard"], # and the final unboxing is below. 
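# A runnable sketch (dashboard shape assumed, not taken from the diff) of
# what the reworked _strip_existing_datasources above now does: templating
# entries whose datasource mentions "Juju" are re-pointed at the
# ${prometheusds} dropdown variable, and entries whose datasource is None
# are left alone instead of failing the "in" membership check.
dash = {
    "templating": {
        "list": [
            {"name": "ds", "datasource": "Juju generated source"},
            {"name": "other", "datasource": None},
        ]
    }
}
for entry in dash["templating"]["list"]:
    if entry.get("datasource") is not None and "Juju" in entry["datasource"]:
        entry["datasource"] = r"${prometheusds}"
assert dash["templating"]["list"][0]["datasource"] == "${prometheusds}"
assert dash["templating"]["list"][1]["datasource"] is None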
- dash = json.dumps(t["dashboard"]) + # + # Apparently SOME newer dashboards (such as Ceph) do not have this double nesting, so + # now we get to account for both :toot: + dash = t.get("dashboard", {}) or t + + # Replace values with LMA-style templating + dash = self._strip_existing_datasources(dash) + dash = json.dumps(dash) # Replace the old-style datasource templates dash = re.sub(r"<< datasource >>", r"${prometheusds}", dash) dash = re.sub(r'"datasource": "prom.*?"', r'"datasource": "${prometheusds}"', dash) + dash = re.sub( + r'"datasource": "\$datasource"', r'"datasource": "${prometheusds}"', dash + ) + dash = re.sub(r'"uid": "\$datasource"', r'"uid": "${prometheusds}"', dash) dash = re.sub( r'"datasource": "(!?\w)[\w|\s|-]+?Juju generated.*?"', r'"datasource": "${prometheusds}"', @@ -1811,12 +1814,17 @@ def _handle_reactive_dashboards(self, event: RelationEvent) -> Optional[Dict]: ) # Yank out "new"+old LMA topology - dash = re.sub(r'(,?juju_application=~)"\$app"', r'\1"\$juju_application"', dash) + dash = re.sub( + r'(,?\s?juju_application=~)\\"\$app\\"', r'\1\\"$juju_application\\"', dash + ) + + # Replace old piechart panels + dash = re.sub(r'"type": "grafana-piechart-panel"', '"type": "piechart"', dash) - from jinja2 import Template + from jinja2 import DebugUndefined, Template content = _encode_dashboard_content( - Template(dash).render(host=r"$host", datasource=r"${prometheusds}") # type: ignore + Template(dash, undefined=DebugUndefined).render(datasource=r"${prometheusds}") # type: ignore ) id = "prog:{}".format(content[-24:-16]) @@ -1980,7 +1988,7 @@ def inject_label_matchers(self, expression: str, topology: dict, type: str) -> s args.extend(["--", "{}".format(expression)]) # noinspection PyBroadException try: - return self._exec(args) + return re.sub(r'="\$juju', r'=~"$juju', self._exec(args)) except subprocess.CalledProcessError as e: logger.debug('Applying the expression failed: "%s", falling back to the original', e) return expression diff --git a/lib/charms/observability_libs/v0/juju_topology.py b/lib/charms/observability_libs/v0/juju_topology.py deleted file mode 100644 index a79e5d4..0000000 --- a/lib/charms/observability_libs/v0/juju_topology.py +++ /dev/null @@ -1,301 +0,0 @@ -# Copyright 2022 Canonical Ltd. -# See LICENSE file for licensing details. -"""## Overview. - -This document explains how to use the `JujuTopology` class to -create and consume topology information from Juju in a consistent manner. - -The goal of the Juju topology is to uniquely identify a piece -of software running across any of your Juju-managed deployments. -This is achieved by combining the following four elements: - -- Model name -- Model UUID -- Application name -- Unit identifier - - -For a more in-depth description of the concept, as well as a -walk-through of it's use-case in observability, see -[this blog post](https://juju.is/blog/model-driven-observability-part-2-juju-topology-metrics) -on the Juju blog. - -## Library Usage - -This library may be used to create and consume `JujuTopology` objects. -The `JujuTopology` class provides three ways to create instances: - -### Using the `from_charm` method - -Enables instantiation by supplying the charm as an argument. When -creating topology objects for the current charm, this is the recommended -approach. - -```python -topology = JujuTopology.from_charm(self) -``` - -### Using the `from_dict` method - -Allows for instantion using a dictionary of relation data, like the -`scrape_metadata` from Prometheus or the labels of an alert rule. 
When -creating topology objects for remote charms, this is the recommended -approach. - -```python -scrape_metadata = json.loads(relation.data[relation.app].get("scrape_metadata", "{}")) -topology = JujuTopology.from_dict(scrape_metadata) -``` - -### Using the class constructor - -Enables instantiation using whatever values you want. While this -is useful in some very specific cases, this is almost certainly not -what you are looking for as setting these values manually may -result in observability metrics which do not uniquely identify a -charm in order to provide accurate usage reporting, alerting, -horizontal scaling, or other use cases. - -```python -topology = JujuTopology( - model="some-juju-model", - model_uuid="00000000-0000-0000-0000-000000000001", - application="fancy-juju-application", - unit="fancy-juju-application/0", - charm_name="fancy-juju-application-k8s", -) -``` - -""" -from collections import OrderedDict -from typing import Dict, List, Optional -from uuid import UUID - -# The unique Charmhub library identifier, never change it -LIBID = "bced1658f20f49d28b88f61f83c2d232" - -LIBAPI = 0 -LIBPATCH = 6 - - -class InvalidUUIDError(Exception): - """Invalid UUID was provided.""" - - def __init__(self, uuid: str): - self.message = "'{}' is not a valid UUID.".format(uuid) - super().__init__(self.message) - - -class JujuTopology: - """JujuTopology is used for storing, generating and formatting juju topology information. - - DEPRECATED: This class is deprecated. Use `pip install cosl` and - `from cosl.juju_topology import JujuTopology` instead. - """ - - def __init__( - self, - model: str, - model_uuid: str, - application: str, - unit: Optional[str] = None, - charm_name: Optional[str] = None, - ): - """Build a JujuTopology object. - - A `JujuTopology` object is used for storing and transforming - Juju topology information. This information is used to - annotate Prometheus scrape jobs and alert rules. Such - annotation when applied to scrape jobs helps in identifying - the source of the scrapped metrics. On the other hand when - applied to alert rules topology information ensures that - evaluation of alert expressions is restricted to the source - (charm) from which the alert rules were obtained. - - Args: - model: a string name of the Juju model - model_uuid: a globally unique string identifier for the Juju model - application: an application name as a string - unit: a unit name as a string - charm_name: name of charm as a string - """ - if not self.is_valid_uuid(model_uuid): - raise InvalidUUIDError(model_uuid) - - self._model = model - self._model_uuid = model_uuid - self._application = application - self._charm_name = charm_name - self._unit = unit - - def is_valid_uuid(self, uuid): - """Validate the supplied UUID against the Juju Model UUID pattern. - - Args: - uuid: string that needs to be checked if it is valid v4 UUID. - - Returns: - True if parameter is a valid v4 UUID, False otherwise. - """ - try: - return str(UUID(uuid, version=4)) == uuid - except (ValueError, TypeError): - return False - - @classmethod - def from_charm(cls, charm): - """Creates a JujuTopology instance by using the model data available on a charm object. - - Args: - charm: a `CharmBase` object for which the `JujuTopology` will be constructed - Returns: - a `JujuTopology` object. 
- """ - return cls( - model=charm.model.name, - model_uuid=charm.model.uuid, - application=charm.model.app.name, - unit=charm.model.unit.name, - charm_name=charm.meta.name, - ) - - @classmethod - def from_dict(cls, data: dict): - """Factory method for creating `JujuTopology` children from a dictionary. - - Args: - data: a dictionary with five keys providing topology information. The keys are - - "model" - - "model_uuid" - - "application" - - "unit" - - "charm_name" - `unit` and `charm_name` may be empty, but will result in more limited - labels. However, this allows us to support charms without workloads. - - Returns: - a `JujuTopology` object. - """ - return cls( - model=data["model"], - model_uuid=data["model_uuid"], - application=data["application"], - unit=data.get("unit", ""), - charm_name=data.get("charm_name", ""), - ) - - def as_dict( - self, - *, - remapped_keys: Optional[Dict[str, str]] = None, - excluded_keys: Optional[List[str]] = None, - ) -> OrderedDict: - """Format the topology information into an ordered dict. - - Keeping the dictionary ordered is important to be able to - compare dicts without having to resort to deep comparisons. - - Args: - remapped_keys: A dictionary mapping old key names to new key names, - which will be substituted when invoked. - excluded_keys: A list of key names to exclude from the returned dict. - uuid_length: The length to crop the UUID to. - """ - ret = OrderedDict( - [ - ("model", self.model), - ("model_uuid", self.model_uuid), - ("application", self.application), - ("unit", self.unit), - ("charm_name", self.charm_name), - ] - ) - if excluded_keys: - ret = OrderedDict({k: v for k, v in ret.items() if k not in excluded_keys}) - - if remapped_keys: - ret = OrderedDict( - (remapped_keys.get(k), v) if remapped_keys.get(k) else (k, v) for k, v in ret.items() # type: ignore - ) - - return ret - - @property - def identifier(self) -> str: - """Format the topology information into a terse string. - - This crops the model UUID, making it unsuitable for comparisons against - anything but other identifiers. Mainly to be used as a display name or file - name where long strings might become an issue. - - >>> JujuTopology( \ - model = "a-model", \ - model_uuid = "00000000-0000-4000-8000-000000000000", \ - application = "some-app", \ - unit = "some-app/1" \ - ).identifier - 'a-model_00000000_some-app' - """ - parts = self.as_dict( - excluded_keys=["unit", "charm_name"], - ) - - parts["model_uuid"] = self.model_uuid_short - values = parts.values() - - return "_".join([str(val) for val in values]).replace("/", "_") - - @property - def label_matcher_dict(self) -> Dict[str, str]: - """Format the topology information into a dict with keys having 'juju_' as prefix. - - Relabelled topology never includes the unit as it would then only match - the leader unit (ie. the unit that produced the dict). - """ - items = self.as_dict( - remapped_keys={"charm_name": "charm"}, - excluded_keys=["unit"], - ).items() - - return {"juju_{}".format(key): value for key, value in items if value} - - @property - def label_matchers(self) -> str: - """Format the topology information into a promql/logql label matcher string. - - Topology label matchers should never include the unit as it - would then only match the leader unit (ie. the unit that - produced the matchers). 
- """ - items = self.label_matcher_dict.items() - return ", ".join(['{}="{}"'.format(key, value) for key, value in items if value]) - - @property - def model(self) -> str: - """Getter for the juju model value.""" - return self._model - - @property - def model_uuid(self) -> str: - """Getter for the juju model uuid value.""" - return self._model_uuid - - @property - def model_uuid_short(self) -> str: - """Getter for the juju model value, truncated to the first eight letters.""" - return self._model_uuid[:8] - - @property - def application(self) -> str: - """Getter for the juju application value.""" - return self._application - - @property - def charm_name(self) -> Optional[str]: - """Getter for the juju charm name value.""" - return self._charm_name - - @property - def unit(self) -> Optional[str]: - """Getter for the juju unit value.""" - return self._unit diff --git a/lib/charms/prometheus_k8s/v0/prometheus_scrape.py b/lib/charms/prometheus_k8s/v0/prometheus_scrape.py index 28ce507..665af88 100644 --- a/lib/charms/prometheus_k8s/v0/prometheus_scrape.py +++ b/lib/charms/prometheus_k8s/v0/prometheus_scrape.py @@ -18,13 +18,6 @@ Source code can be found on GitHub at: https://github.com/canonical/prometheus-k8s-operator/tree/main/lib/charms/prometheus_k8s -## Dependencies - -Using this library requires you to fetch the juju_topology library from -[observability-libs](https://charmhub.io/observability-libs/libraries/juju_topology). - -`charmcraft fetch-lib charms.observability_libs.v0.juju_topology` - ## Provider Library Usage This Prometheus charm interacts with its scrape targets using its @@ -121,7 +114,7 @@ def __init__(self, *args): { "targets": ["10.1.32.215:7000", "*:8000"], "labels": { - "some-key": "some-value" + "some_key": "some-value" } } ] @@ -151,7 +144,7 @@ def __init__(self, *args): { "targets": ["*:7000"], "labels": { - "some-key": "some-value" + "some_key": "some-value" } } ] @@ -163,7 +156,7 @@ def __init__(self, *args): { "targets": ["*:8000"], "labels": { - "some-other-key": "some-other-value" + "some_other_key": "some-other-value" } } ] @@ -346,7 +339,8 @@ def _on_scrape_targets_changed(self, event): from urllib.parse import urlparse import yaml -from charms.observability_libs.v0.juju_topology import JujuTopology +from cosl import JujuTopology +from cosl.rules import AlertRules from ops.charm import CharmBase, RelationRole from ops.framework import ( BoundEvent, @@ -368,7 +362,9 @@ def _on_scrape_targets_changed(self, event): # Increment this PATCH version before using `charmcraft publish-lib` or reset # to 0 if you are raising the major API version -LIBPATCH = 30 +LIBPATCH = 44 + +PYDEPS = ["cosl"] logger = logging.getLogger(__name__) @@ -389,6 +385,8 @@ def _on_scrape_targets_changed(self, event): "scheme", "basic_auth", "tls_config", + "authorization", + "params", } DEFAULT_JOB = { "metrics_path": "/metrics", @@ -599,15 +597,22 @@ def render_alertmanager_static_configs(alertmanagers: List[str]): # Create a mapping from paths to netlocs # Group alertmanager targets into a dictionary of lists: # {path: [netloc1, netloc2]} - paths = defaultdict(list) # type: Dict[str, List[str]] + paths = defaultdict(list) # type: Dict[Tuple[str, str], List[str]] for parsed in map(urlparse, sanitized): path = parsed.path or "/" - paths[path].append(parsed.netloc) + paths[(parsed.scheme, path)].append(parsed.netloc) return { "alertmanagers": [ - {"path_prefix": path_prefix, "static_configs": [{"targets": netlocs}]} - for path_prefix, netlocs in paths.items() + { + # For https we still do not 
render a `tls_config` section because + # certs are expected to be made available by the charm via the + # `update-ca-certificates` mechanism. + "scheme": scheme, + "path_prefix": path_prefix, + "static_configs": [{"targets": netlocs}], + } + for (scheme, path_prefix), netlocs in paths.items() ] } @@ -686,23 +691,39 @@ def restore(self, snapshot): self.errors = snapshot["errors"] +class InvalidScrapeJobEvent(EventBase): + """Event emitted when alert rule files are not valid.""" + + def __init__(self, handle, errors: str = ""): + super().__init__(handle) + self.errors = errors + + def snapshot(self) -> Dict: + """Save error information.""" + return {"errors": self.errors} + + def restore(self, snapshot): + """Restore error information.""" + self.errors = snapshot["errors"] + + class MetricsEndpointProviderEvents(ObjectEvents): """Events raised by :class:`InvalidAlertRuleEvent`s.""" alert_rule_status_changed = EventSource(InvalidAlertRuleEvent) + invalid_scrape_job = EventSource(InvalidScrapeJobEvent) def _type_convert_stored(obj): """Convert Stored* to their appropriate types, recursively.""" if isinstance(obj, StoredList): return list(map(_type_convert_stored, obj)) - elif isinstance(obj, StoredDict): + if isinstance(obj, StoredDict): rdict = {} # type: Dict[Any, Any] for k in obj.keys(): rdict[k] = _type_convert_stored(obj[k]) return rdict - else: - return obj + return obj def _validate_relation_by_interface_and_direction( @@ -744,7 +765,7 @@ def _validate_relation_by_interface_and_direction( actual_relation_interface = relation.interface_name if actual_relation_interface != expected_relation_interface: raise RelationInterfaceMismatchError( - relation_name, expected_relation_interface, actual_relation_interface + relation_name, expected_relation_interface, actual_relation_interface or "None" ) if expected_relation_role == RelationRole.provides: @@ -812,206 +833,6 @@ def _is_single_alert_rule_format(rules_dict: dict) -> bool: return set(rules_dict) >= {"alert", "expr"} -class AlertRules: - """Utility class for amalgamating prometheus alert rule files and injecting juju topology. - - An `AlertRules` object supports aggregating alert rules from files and directories in both - official and single rule file formats using the `add_path()` method. All the alert rules - read are annotated with Juju topology labels and amalgamated into a single data structure - in the form of a Python dictionary using the `as_dict()` method. Such a dictionary can be - easily dumped into JSON format and exchanged over relation data. The dictionary can also - be dumped into YAML format and written directly into an alert rules file that is read by - Prometheus. Note that multiple `AlertRules` objects must not be written into the same file, - since Prometheus allows only a single list of alert rule groups per alert rules file. - - The official Prometheus format is a YAML file conforming to the Prometheus documentation - (https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/). - The custom single rule format is a subsection of the official YAML, having a single alert - rule, effectively "one alert per file". - """ - - # This class uses the following terminology for the various parts of a rule file: - # - alert rules file: the entire groups[] yaml, including the "groups:" key. - # - alert groups (plural): the list of groups[] (a list, i.e. no "groups:" key) - it is a list - # of dictionaries that have the "name" and "rules" keys. 
- # - alert group (singular): a single dictionary that has the "name" and "rules" keys. - # - alert rules (plural): all the alerts in a given alert group - a list of dictionaries with - # the "alert" and "expr" keys. - # - alert rule (singular): a single dictionary that has the "alert" and "expr" keys. - - def __init__(self, topology: Optional[JujuTopology] = None): - """Build and alert rule object. - - Args: - topology: an optional `JujuTopology` instance that is used to annotate all alert rules. - """ - self.topology = topology - self.tool = CosTool(None) - self.alert_groups = [] # type: List[dict] - - def _from_file(self, root_path: Path, file_path: Path) -> List[dict]: - """Read a rules file from path, injecting juju topology. - - Args: - root_path: full path to the root rules folder (used only for generating group name) - file_path: full path to a *.rule file. - - Returns: - A list of dictionaries representing the rules file, if file is valid (the structure is - formed by `yaml.safe_load` of the file); an empty list otherwise. - """ - with file_path.open() as rf: - # Load a list of rules from file then add labels and filters - try: - rule_file = yaml.safe_load(rf) - - except Exception as e: - logger.error("Failed to read alert rules from %s: %s", file_path.name, e) - return [] - - if not rule_file: - logger.warning("Empty rules file: %s", file_path.name) - return [] - if not isinstance(rule_file, dict): - logger.error("Invalid rules file (must be a dict): %s", file_path.name) - return [] - if _is_official_alert_rule_format(rule_file): - alert_groups = rule_file["groups"] - elif _is_single_alert_rule_format(rule_file): - # convert to list of alert groups - # group name is made up from the file name - alert_groups = [{"name": file_path.stem, "rules": [rule_file]}] - else: - # invalid/unsupported - logger.error("Invalid rules file: %s", file_path.name) - return [] - - # update rules with additional metadata - for alert_group in alert_groups: - # update group name with topology and sub-path - alert_group["name"] = self._group_name( - str(root_path), - str(file_path), - alert_group["name"], - ) - - # add "juju_" topology labels - for alert_rule in alert_group["rules"]: - if "labels" not in alert_rule: - alert_rule["labels"] = {} - - if self.topology: - alert_rule["labels"].update(self.topology.label_matcher_dict) - # insert juju topology filters into a prometheus alert rule - alert_rule["expr"] = self.tool.inject_label_matchers( - re.sub(r"%%juju_topology%%,?", "", alert_rule["expr"]), - self.topology.label_matcher_dict, - ) - - return alert_groups - - def _group_name(self, root_path: str, file_path: str, group_name: str) -> str: - """Generate group name from path and topology. - - The group name is made up of the relative path between the root dir_path, the file path, - and topology identifier. - - Args: - root_path: path to the root rules dir. - file_path: path to rule file. - group_name: original group name to keep as part of the new augmented group name - - Returns: - New group name, augmented by juju topology and relative path. - """ - rel_path = os.path.relpath(os.path.dirname(file_path), root_path) - rel_path = "" if rel_path == "." 
else rel_path.replace(os.path.sep, "_") - - # Generate group name: - # - name, from juju topology - # - suffix, from the relative path of the rule file; - group_name_parts = [self.topology.identifier] if self.topology else [] - group_name_parts.extend([rel_path, group_name, "alerts"]) - # filter to remove empty strings - return "_".join(filter(None, group_name_parts)) - - @classmethod - def _multi_suffix_glob( - cls, dir_path: Path, suffixes: List[str], recursive: bool = True - ) -> list: - """Helper function for getting all files in a directory that have a matching suffix. - - Args: - dir_path: path to the directory to glob from. - suffixes: list of suffixes to include in the glob (items should begin with a period). - recursive: a flag indicating whether a glob is recursive (nested) or not. - - Returns: - List of files in `dir_path` that have one of the suffixes specified in `suffixes`. - """ - all_files_in_dir = dir_path.glob("**/*" if recursive else "*") - return list(filter(lambda f: f.is_file() and f.suffix in suffixes, all_files_in_dir)) - - def _from_dir(self, dir_path: Path, recursive: bool) -> List[dict]: - """Read all rule files in a directory. - - All rules from files for the same directory are loaded into a single - group. The generated name of this group includes juju topology. - By default, only the top directory is scanned; for nested scanning, pass `recursive=True`. - - Args: - dir_path: directory containing *.rule files (alert rules without groups). - recursive: flag indicating whether to scan for rule files recursively. - - Returns: - a list of dictionaries representing prometheus alert rule groups, each dictionary - representing an alert group (structure determined by `yaml.safe_load`). - """ - alert_groups = [] # type: List[dict] - - # Gather all alerts into a list of groups - for file_path in self._multi_suffix_glob( - dir_path, [".rule", ".rules", ".yml", ".yaml"], recursive - ): - alert_groups_from_file = self._from_file(dir_path, file_path) - if alert_groups_from_file: - logger.debug("Reading alert rule from %s", file_path) - alert_groups.extend(alert_groups_from_file) - - return alert_groups - - def add_path(self, path: str, *, recursive: bool = False) -> None: - """Add rules from a dir path. - - All rules from files are aggregated into a data structure representing a single rule file. - All group names are augmented with juju topology. - - Args: - path: either a rules file or a dir of rules files. - recursive: whether to read files recursively or not (no impact if `path` is a file). - - Returns: - True if path was added else False. - """ - path = Path(path) # type: Path - if path.is_dir(): - self.alert_groups.extend(self._from_dir(path, recursive)) - elif path.is_file(): - self.alert_groups.extend(self._from_file(path.parent, path)) - else: - logger.debug("Alert rules path does not exist: %s", path) - - def as_dict(self) -> dict: - """Return standard alert rules file in dict representation. - - Returns: - a dictionary containing a single list of alert rule groups. - The list of alert rule groups is provided as value of the - "groups" dictionary key. 
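# The AlertRules implementation deleted here likewise moves to cosl
# ("from cosl.rules import AlertRules" below). A minimal sketch of the
# replacement usage, mirroring the calls the updated prometheus_scrape code
# makes; the rules directory below is hypothetical:
from cosl.rules import AlertRules

alert_rules = AlertRules(query_type="promql")  # a topology= kwarg may also be passed
alert_rules.add_path("./src/prometheus_alert_rules", recursive=True)
groups = alert_rules.as_dict()  # {"groups": [...]}, ready for relation data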
- """ - return {"groups": self.alert_groups} if self.alert_groups else {} - - class TargetsChangedEvent(EventBase): """Event emitted when Prometheus scrape targets change.""" @@ -1037,7 +858,7 @@ class MonitoringEvents(ObjectEvents): class MetricsEndpointConsumer(Object): """A Prometheus based Monitoring service.""" - on = MonitoringEvents() + on = MonitoringEvents() # pyright: ignore def __init__(self, charm: CharmBase, relation_name: str = DEFAULT_RELATION_NAME): """A Prometheus based Monitoring service. @@ -1119,12 +940,24 @@ def jobs(self) -> list: for relation in self._charm.model.relations[self._relation_name]: static_scrape_jobs = self._static_scrape_config(relation) if static_scrape_jobs: - scrape_jobs.extend(static_scrape_jobs) + # Duplicate job names will cause validate_scrape_jobs to fail. + # Therefore we need to dedupe here and after all jobs are collected. + static_scrape_jobs = _dedupe_job_names(static_scrape_jobs) + try: + self._tool.validate_scrape_jobs(static_scrape_jobs) + except subprocess.CalledProcessError as e: + if self._charm.unit.is_leader(): + data = json.loads(relation.data[self._charm.app].get("event", "{}")) + data["scrape_job_errors"] = str(e) + relation.data[self._charm.app]["event"] = json.dumps(data) + else: + scrape_jobs.extend(static_scrape_jobs) scrape_jobs = _dedupe_job_names(scrape_jobs) return scrape_jobs + @property def alerts(self) -> dict: """Fetch alerts for all relations. @@ -1175,37 +1008,48 @@ def alerts(self) -> dict: if not alert_rules: continue - try: - scrape_metadata = json.loads(relation.data[relation.app]["scrape_metadata"]) - identifier = JujuTopology.from_dict(scrape_metadata).identifier - alerts[identifier] = self._tool.apply_label_matchers(alert_rules) - - except KeyError as e: - logger.debug( - "Relation %s has no 'scrape_metadata': %s", - relation.id, - e, - ) - identifier = self._get_identifier_by_alert_rules(alert_rules) + alert_rules = self._inject_alert_expr_labels(alert_rules) + + identifier, topology = self._get_identifier_by_alert_rules(alert_rules) + if not topology: + try: + scrape_metadata = json.loads(relation.data[relation.app]["scrape_metadata"]) + identifier = JujuTopology.from_dict(scrape_metadata).identifier + + except KeyError as e: + logger.debug( + "Relation %s has no 'scrape_metadata': %s", + relation.id, + e, + ) if not identifier: logger.error( - "Alert rules were found but no usable group or identifier was present" + "Alert rules were found but no usable group or identifier was present." ) continue + # We need to append the relation info to the identifier. This is to allow for cases for there are two + # relations which eventually scrape the same application. Issue #551. + identifier = f"{identifier}_{relation.name}_{relation.id}" + alerts[identifier] = alert_rules _, errmsg = self._tool.validate_alert_rules(alert_rules) if errmsg: if alerts[identifier]: del alerts[identifier] - relation.data[self._charm.app]["event"] = json.dumps({"errors": errmsg}) + if self._charm.unit.is_leader(): + data = json.loads(relation.data[self._charm.app].get("event", "{}")) + data["errors"] = errmsg + relation.data[self._charm.app]["event"] = json.dumps(data) continue return alerts - def _get_identifier_by_alert_rules(self, rules: dict) -> Union[str, None]: + def _get_identifier_by_alert_rules( + self, rules: dict + ) -> Tuple[Union[str, None], Union[JujuTopology, None]]: """Determine an appropriate dict key for alert rules. 
The key is used as the filename when writing alerts to disk, so the structure @@ -1213,21 +1057,28 @@ def _get_identifier_by_alert_rules(self, rules: dict) -> Union[str, None]: Args: rules: a dict of alert rules + Returns: + A tuple containing an identifier, if found, and a JujuTopology, if it could + be constructed. """ if "groups" not in rules: logger.debug("No alert groups were found in relation data") - return None + return None, None # Construct an ID based on what's in the alert rules if they have labels for group in rules["groups"]: try: labels = group["rules"][0]["labels"] - identifier = "{}_{}_{}".format( - labels["juju_model"], - labels["juju_model_uuid"], - labels["juju_application"], + topology = JujuTopology( + # Don't try to safely get required constructor fields. There's already + # a handler for KeyErrors + model_uuid=labels["juju_model_uuid"], + model=labels["juju_model"], + application=labels["juju_application"], + unit=labels.get("juju_unit", ""), + charm_name=labels.get("juju_charm", ""), ) - return identifier + return topology.identifier, topology except KeyError: logger.debug("Alert rules were found but no usable labels were present") continue @@ -1238,11 +1089,55 @@ def _get_identifier_by_alert_rules(self, rules: dict) -> Union[str, None]: ) try: for group in rules["groups"]: - return group["name"] + return group["name"], None except KeyError: logger.debug("No group name was found to use as identifier") - return None + return None, None + + def _inject_alert_expr_labels(self, rules: Dict[str, Any]) -> Dict[str, Any]: + """Iterate through alert rules and inject topology into expressions. + + Args: + rules: a dict of alert rules + """ + if "groups" not in rules: + return rules + + modified_groups = [] + for group in rules["groups"]: + # Copy off rules, so we don't modify an object we're iterating over + rules_copy = group["rules"] + for idx, rule in enumerate(rules_copy): + labels = rule.get("labels") + + if labels: + try: + topology = JujuTopology( + # Don't try to safely get required constructor fields. There's already + # a handler for KeyErrors + model_uuid=labels["juju_model_uuid"], + model=labels["juju_model"], + application=labels["juju_application"], + unit=labels.get("juju_unit", ""), + charm_name=labels.get("juju_charm", ""), + ) + + # Inject topology and put it back in the list + rule["expr"] = self._tool.inject_label_matchers( + re.sub(r"%%juju_topology%%,?", "", rule["expr"]), + topology.alert_expression_dict, + ) + except KeyError: + # Some required JujuTopology key is missing. Just move on. + pass + + group["rules"][idx] = rule + + modified_groups.append(group) + + rules["groups"] = modified_groups + return rules def _static_scrape_config(self, relation) -> list: """Generate the static scrape configuration for a single relation. 
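# A self-contained sketch (labels invented for illustration; requires the
# cosl dependency this diff introduces) of the new identifier logic above:
# a JujuTopology is rebuilt from the first rule's labels, and the caller
# then appends the relation name and id so that two relations scraping the
# same application (Issue #551) no longer collide on a single key.
from cosl import JujuTopology

labels = {
    "juju_model": "lma",
    "juju_model_uuid": "00000000-0000-4000-8000-000000000000",
    "juju_application": "node-exporter",
}
topology = JujuTopology(
    model=labels["juju_model"],
    model_uuid=labels["juju_model_uuid"],
    application=labels["juju_application"],
    unit=labels.get("juju_unit", ""),
    charm_name=labels.get("juju_charm", ""),
)
identifier = "{}_{}_{}".format(topology.identifier, "metrics-endpoint", 7)
# 'lma_00000000_node-exporter_metrics-endpoint_7'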
@@ -1263,29 +1158,31 @@ def _static_scrape_config(self, relation) -> list: if not relation.units: return [] - scrape_jobs = json.loads(relation.data[relation.app].get("scrape_jobs", "[]")) + scrape_configs = json.loads(relation.data[relation.app].get("scrape_jobs", "[]")) - if not scrape_jobs: + if not scrape_configs: return [] scrape_metadata = json.loads(relation.data[relation.app].get("scrape_metadata", "{}")) if not scrape_metadata: - return scrape_jobs + return scrape_configs topology = JujuTopology.from_dict(scrape_metadata) job_name_prefix = "juju_{}_prometheus_scrape".format(topology.identifier) - scrape_jobs = PrometheusConfig.prefix_job_names(scrape_jobs, job_name_prefix) - scrape_jobs = PrometheusConfig.sanitize_scrape_configs(scrape_jobs) + scrape_configs = PrometheusConfig.prefix_job_names(scrape_configs, job_name_prefix) + scrape_configs = PrometheusConfig.sanitize_scrape_configs(scrape_configs) hosts = self._relation_hosts(relation) - scrape_jobs = PrometheusConfig.expand_wildcard_targets_into_individual_jobs( - scrape_jobs, hosts, topology + scrape_configs = PrometheusConfig.expand_wildcard_targets_into_individual_jobs( + scrape_configs, hosts, topology ) - return scrape_jobs + # For https scrape targets we still do not render a `tls_config` section because certs + # are expected to be made available by the charm via the `update-ca-certificates` mechanism. + return scrape_configs def _relation_hosts(self, relation: Relation) -> Dict[str, Tuple[str, str]]: """Returns a mapping from unit names to (address, path) tuples, for the given relation.""" @@ -1350,7 +1247,7 @@ def _dedupe_job_names(jobs: List[dict]): job["job_name"] = "{}_{}".format(job["job_name"], hashed) new_jobs = [] for key in jobs_dict: - new_jobs.extend([i for i in jobs_dict[key]]) + new_jobs.extend(list(jobs_dict[key])) # Deduplicate jobs which are equal # Again this in O(n^2) but it should be okay @@ -1401,7 +1298,7 @@ def _resolve_dir_against_charm_path(charm: CharmBase, *path_elements: str) -> st class MetricsEndpointProvider(Object): """A metrics endpoint for Prometheus.""" - on = MetricsEndpointProviderEvents() + on = MetricsEndpointProviderEvents() # pyright: ignore def __init__( self, @@ -1609,6 +1506,10 @@ def _on_relation_changed(self, event): else: self.on.alert_rule_status_changed.emit(valid=valid, errors=errors) + scrape_errors = ev.get("scrape_job_errors", None) + if scrape_errors: + self.on.invalid_scrape_job.emit(errors=scrape_errors) + def update_scrape_job_spec(self, jobs): """Update scrape job specification.""" self._jobs = PrometheusConfig.sanitize_scrape_configs(jobs) @@ -1628,7 +1529,7 @@ def set_scrape_job_spec(self, _=None): if not self._charm.unit.is_leader(): return - alert_rules = AlertRules(topology=self.topology) + alert_rules = AlertRules(query_type="promql", topology=self.topology) alert_rules.add_path(self._alert_rules_path, recursive=True) alert_rules_as_dict = alert_rules.as_dict() @@ -1699,11 +1600,10 @@ def _scrape_jobs(self) -> list: A list of dictionaries, where each dictionary specifies a single scrape job for Prometheus. 
""" - jobs = self._jobs if self._jobs else [DEFAULT_JOB] + jobs = self._jobs or [] if callable(self._lookaside_jobs): - return jobs + PrometheusConfig.sanitize_scrape_configs(self._lookaside_jobs()) - else: - return jobs + jobs.extend(PrometheusConfig.sanitize_scrape_configs(self._lookaside_jobs())) + return jobs or [DEFAULT_JOB] @property def _scrape_metadata(self) -> dict: @@ -1776,7 +1676,7 @@ def _update_relation_data(self, _): if not self._charm.unit.is_leader(): return - alert_rules = AlertRules() + alert_rules = AlertRules(query_type="promql") alert_rules.add_path(self.dir_path, recursive=self._recursive) alert_rules_as_dict = alert_rules.as_dict() @@ -1936,15 +1836,20 @@ def _set_prometheus_data(self, event): `MetricsEndpointAggregator`, that Prometheus unit is provided with the complete set of existing scrape jobs and alert rules. """ + if not self._charm.unit.is_leader(): + return + jobs = [] + _type_convert_stored( - self._stored.jobs + self._stored.jobs # pyright: ignore ) # list of scrape jobs, one per relation for relation in self.model.relations[self._target_relation]: targets = self._get_targets(relation) if targets and relation.app: jobs.append(self._static_scrape_job(targets, relation.app.name)) - groups = [] + _type_convert_stored(self._stored.alert_rules) # list of alert rule groups + groups = [] + _type_convert_stored( + self._stored.alert_rules # pyright: ignore + ) # list of alert rule groups for relation in self.model.relations[self._alert_rules_relation]: unit_rules = self._get_alert_rules(relation) if unit_rules and relation.app: @@ -1981,7 +1886,11 @@ def set_target_job_data(self, targets: dict, app_name: str, **kwargs) -> None: Args: targets: a `dict` containing target information app_name: a `str` identifying the application + kwargs: a `dict` of the extra arguments passed to the function """ + if not self._charm.unit.is_leader(): + return + # new scrape job for the relation that has changed updated_job = self._static_scrape_job(targets, app_name, **kwargs) @@ -1992,7 +1901,7 @@ def set_target_job_data(self, targets: dict, app_name: str, **kwargs) -> None: jobs.append(updated_job) relation.data[self._charm.app]["scrape_jobs"] = json.dumps(jobs) - if not _type_convert_stored(self._stored.jobs) == jobs: + if not _type_convert_stored(self._stored.jobs) == jobs: # pyright: ignore self._stored.jobs = jobs def _on_prometheus_targets_departed(self, event): @@ -2015,6 +1924,9 @@ def remove_prometheus_jobs(self, job_name: str, unit_name: Optional[str] = ""): For NRPE, the job name is calculated from an ID sent via the NRPE relation, and is sufficient to uniquely identify the target. """ + if not self._charm.unit.is_leader(): + return + for relation in self.model.relations[self._prometheus_relation]: jobs = json.loads(relation.data[self._charm.app].get("scrape_jobs", "[]")) if not jobs: @@ -2041,7 +1953,7 @@ def remove_prometheus_jobs(self, job_name: str, unit_name: Optional[str] = ""): relation.data[self._charm.app]["scrape_jobs"] = json.dumps(jobs) - if not _type_convert_stored(self._stored.jobs) == jobs: + if not _type_convert_stored(self._stored.jobs) == jobs: # pyright: ignore self._stored.jobs = jobs def _job_name(self, appname) -> str: @@ -2100,6 +2012,7 @@ def _static_scrape_job(self, targets, application_name, **kwargs) -> dict: "port". application_name: a string name of the application for which this static scrape job is being constructed. 
@@ -2100,6 +2012,7 @@ def _static_scrape_job(self, targets, application_name, **kwargs) -> dict:
                 "port".
             application_name: a string name of the application for which this static
                 scrape job is being constructed.
+            kwargs: a `dict` of the extra arguments passed to the function

         Returns:
             A dictionary corresponding to a Prometheus static scrape
@@ -2121,6 +2034,8 @@ def _static_scrape_job(self, targets, application_name, **kwargs) -> dict:
                     "juju_application": application_name,
                     "juju_unit": unit_name,
                     "host": target["hostname"],
+                    # Expanding this merges the dicts and overrides any
+                    # topology labels that were already present
                     **self._static_config_extra_labels(target),
                 },
             }
@@ -2195,6 +2110,9 @@ def set_alert_rule_data(self, name: str, unit_rules: dict, label_rules: bool = T
        The unit rules should be a dict, which has additional Juju topology labels added. For
        rules generated by the NRPE exporter, they are pre-labeled so lookups can be performed.
        """
+        if not self._charm.unit.is_leader():
+            return
+
        if label_rules:
            rules = self._label_alert_rules(unit_rules, name)
        else:
@@ -2214,7 +2132,7 @@ def set_alert_rule_data(self, name: str, unit_rules: dict, label_rules: bool = T
                groups.append(updated_group)
            relation.data[self._charm.app]["alert_rules"] = json.dumps({"groups": groups})

-        if not _type_convert_stored(self._stored.alert_rules) == groups:
+        if not _type_convert_stored(self._stored.alert_rules) == groups:  # pyright: ignore
            self._stored.alert_rules = groups

    def _on_alert_rules_departed(self, event):
@@ -2229,6 +2147,9 @@ def _on_alert_rules_departed(self, event):

    def remove_alert_rules(self, group_name: str, unit_name: str) -> None:
        """Remove an alert rule group from relation data."""
+        if not self._charm.unit.is_leader():
+            return
+
        for relation in self.model.relations[self._prometheus_relation]:
            alert_rules = json.loads(relation.data[self._charm.app].get("alert_rules", "{}"))
            if not alert_rules:
@@ -2261,7 +2182,7 @@ def remove_alert_rules(self, group_name: str, unit_name: str) -> None:
                json.dumps({"groups": groups}) if groups else "{}"
            )

-        if not _type_convert_stored(self._stored.alert_rules) == groups:
+        if not _type_convert_stored(self._stored.alert_rules) == groups:  # pyright: ignore
            self._stored.alert_rules = groups

    def _get_alert_rules(self, relation) -> dict:
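Several of the hunks above add the same `is_leader()` guard to every `MetricsEndpointAggregator` method that writes application relation data, since Juju only permits the leader unit to do so. If the guards keep multiplying, a decorator is one way to express the pattern once; this is a sketch of that alternative, not code from the patch (it assumes the decorated methods live on an object exposing `self._charm`):

```python
# Sketch: factoring the repeated leader guard into a decorator.
import functools


def leader_only(method):
    @functools.wraps(method)
    def wrapper(self, *args, **kwargs):
        # Only the leader unit may write app-level relation data; all other
        # units silently no-op, matching the guards added in this patch.
        if not self._charm.unit.is_leader():
            return None
        return method(self, *args, **kwargs)

    return wrapper
```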
@@ -2403,6 +2324,22 @@ def validate_alert_rules(self, rules: dict) -> Tuple[bool, str]:
             ]
         )

+    def validate_scrape_jobs(self, jobs: list) -> bool:
+        """Validate scrape jobs using cos-tool."""
+        if not self.path:
+            logger.debug("`cos-tool` unavailable. Not validating scrape jobs.")
+            return True
+        conf = {"scrape_configs": jobs}
+        with tempfile.NamedTemporaryFile() as tmpfile:
+            with open(tmpfile.name, "w") as f:
+                f.write(yaml.safe_dump(conf))
+            try:
+                self._exec([str(self.path), "validate-config", tmpfile.name])
+            except subprocess.CalledProcessError as e:
+                logger.error("Validating scrape jobs failed: {}".format(e.output))
+                raise
+            return True
+
     def inject_label_matchers(self, expression, topology) -> str:
         """Add label matchers to an expression."""
         if not topology:
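For context, a hypothetical call site for the new `validate_scrape_jobs` helper. The import path assumes this patch targets the standard `prometheus_scrape` charm library, and `checked_jobs` is invented for illustration:

```python
# Hypothetical usage sketch; CosTool is constructed from the charm instance,
# as elsewhere in the library, and the validator raises CalledProcessError
# when cos-tool rejects the config.
import subprocess

from charms.prometheus_k8s.v0.prometheus_scrape import CosTool


def checked_jobs(charm, jobs: list) -> list:
    tool = CosTool(charm)
    try:
        tool.validate_scrape_jobs(jobs)
    except subprocess.CalledProcessError:
        # A consumer could surface this via the new invalid_scrape_job event,
        # as _on_relation_changed now does.
        return []
    return jobs
```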
diff --git a/requirements-integration.txt b/requirements-integration.txt
index 6b1cc78..1f8f111 100644
--- a/requirements-integration.txt
+++ b/requirements-integration.txt
@@ -14,7 +14,7 @@ asttokens==2.4.1
     # via stack-data
 async-timeout==4.0.3
     # via aiohttp
-attrs==23.1.0
+attrs==23.2.0
     # via
     #   -r requirements.txt
     #   aiohttp
@@ -35,7 +35,7 @@ brotli==1.1.0
     # via selenium-wire
 cachetools==5.3.3
     # via google-auth
-certifi==2023.7.22
+certifi==2024.2.2
     # via
     #   -r requirements.txt
     #   httpcore
@@ -48,10 +48,12 @@ cffi==1.16.0
     # via
     #   cryptography
     #   pynacl
-charset-normalizer==3.3.0
+charset-normalizer==3.3.2
     # via
     #   -r requirements.txt
     #   requests
+cosl==0.0.10
+    # via -r requirements.txt
 cryptography==42.0.5
     # via
     #   paramiko
@@ -76,7 +78,7 @@ frozenlist==1.4.1
     #   aiosignal
 google-auth==2.28.2
     # via kubernetes
-grpclib==0.4.6
+grpclib==0.4.7
     # via
     #   -r requirements.txt
     #   betterproto
@@ -104,7 +106,7 @@ hyperframe==6.0.1
     #   -r requirements.txt
     #   h2
     #   selenium-wire
-idna==3.4
+idna==3.6
     # via
     #   -r requirements.txt
     #   anyio
@@ -112,7 +114,7 @@ idna==3.4
     #   requests
     #   trio
     #   yarl
-importlib-resources==6.1.0
+importlib-resources==6.3.1
     # via
     #   -r requirements.txt
     #   jsonschema
@@ -148,7 +150,7 @@ markupsafe==2.1.5
     # via jinja2
 matplotlib-inline==0.1.6
     # via ipython
-multidict==6.0.4
+multidict==6.0.5
     # via
     #   -r requirements.txt
     #   aiohttp
@@ -162,9 +164,10 @@ oauthlib==3.2.2
     #   requests-oauthlib
 oci-image==1.0.0
     # via -r requirements.txt
-ops==2.7.0
+ops==2.11.0
     # via
     #   -r requirements.txt
+    #   cosl
     #   serialized-data-interface
 outcome==1.3.0.post0
     # via trio
@@ -221,7 +224,7 @@ pyrfc3339==1.1
     # via
     #   juju
     #   macaroonbakery
-pyrsistent==0.19.3
+pyrsistent==0.20.0
     # via
     #   -r requirements.txt
     #   jsonschema
@@ -237,7 +240,7 @@ pytest-asyncio==0.21.1
     # via pytest-operator
 pytest-operator==0.34.0
     # via -r requirements-integration.in
-python-dateutil==2.8.2
+python-dateutil==2.9.0.post0
     # via
     #   -r requirements.txt
     #   betterproto
@@ -247,6 +250,7 @@ pytz==2024.1
 pyyaml==6.0.1
     # via
     #   -r requirements.txt
+    #   cosl
     #   juju
     #   kubernetes
     #   lightkube
@@ -312,13 +316,15 @@ trio-websocket==0.11.1
     # via selenium
 typing-extensions==4.10.0
     # via
+    #   -r requirements.txt
     #   anyio
+    #   cosl
     #   ipython
     #   selenium
     #   typing-inspect
 typing-inspect==0.9.0
     # via juju
-urllib3[socks]==2.0.6
+urllib3[socks]==2.2.1
     # via
     #   -r requirements.txt
     #   kubernetes
@@ -326,7 +332,7 @@ urllib3[socks]==2.0.6
     #   selenium
 wcwidth==0.2.13
     # via prompt-toolkit
-websocket-client==1.6.4
+websocket-client==1.7.0
     # via
     #   -r requirements.txt
     #   kubernetes
@@ -339,7 +345,7 @@ wsproto==1.2.0
     #   trio-websocket
 yarl==1.9.4
     # via aiohttp
-zipp==3.17.0
+zipp==3.18.1
     # via
     #   -r requirements.txt
     #   importlib-resources
diff --git a/requirements-unit.txt b/requirements-unit.txt
index 7fd1ac1..c08e127 100644
--- a/requirements-unit.txt
+++ b/requirements-unit.txt
@@ -4,7 +4,7 @@
 #
 # pip-compile requirements-unit.in
 #
-attrs==23.1.0
+attrs==23.2.0
     # via
     #   -r requirements.txt
     #   jsonschema
@@ -12,21 +12,23 @@ betterproto==2.0.0b6
     # via
     #   -r requirements.txt
     #   envoy-data-plane
-certifi==2023.7.22
+certifi==2024.2.2
     # via
     #   -r requirements.txt
     #   requests
-charset-normalizer==3.3.0
+charset-normalizer==3.3.2
     # via
     #   -r requirements.txt
     #   requests
+cosl==0.0.10
+    # via -r requirements.txt
 coverage==7.4.4
     # via -r requirements-unit.in
 envoy-data-plane==0.2.5
     # via -r requirements.txt
 exceptiongroup==1.2.0
     # via pytest
-grpclib==0.4.6
+grpclib==0.4.7
     # via
     #   -r requirements.txt
     #   betterproto
@@ -42,11 +44,11 @@ hyperframe==6.0.1
     # via
     #   -r requirements.txt
     #   h2
-idna==3.4
+idna==3.6
     # via
     #   -r requirements.txt
     #   requests
-importlib-resources==6.1.0
+importlib-resources==6.3.1
     # via
     #   -r requirements.txt
     #   jsonschema
@@ -56,15 +58,16 @@ jsonschema==4.17.3
     # via
     #   -r requirements.txt
     #   serialized-data-interface
-multidict==6.0.4
+multidict==6.0.5
     # via
     #   -r requirements.txt
     #   grpclib
 oci-image==1.0.0
     # via -r requirements.txt
-ops==2.7.0
+ops==2.11.0
     # via
     #   -r requirements.txt
+    #   cosl
     #   serialized-data-interface
 packaging==24.0
     # via pytest
@@ -74,7 +77,7 @@ pkgutil-resolve-name==1.3.10
     #   jsonschema
 pluggy==1.4.0
     # via pytest
-pyrsistent==0.19.3
+pyrsistent==0.20.0
     # via
     #   -r requirements.txt
     #   jsonschema
@@ -87,13 +90,14 @@ pytest-lazy-fixture==0.6.3
     # via -r requirements-unit.in
 pytest-mock==3.12.0
     # via -r requirements-unit.in
-python-dateutil==2.8.2
+python-dateutil==2.9.0.post0
     # via
     #   -r requirements.txt
     #   betterproto
 pyyaml==6.0.1
     # via
     #   -r requirements.txt
+    #   cosl
     #   ops
     #   serialized-data-interface
 requests==2.31.0
@@ -110,15 +114,19 @@ stringcase==1.2.0
     # via -r requirements.txt
 tomli==2.0.1
     # via pytest
-urllib3==2.0.6
+typing-extensions==4.10.0
+    # via
+    #   -r requirements.txt
+    #   cosl
+urllib3==2.2.1
     # via
     #   -r requirements.txt
     #   requests
-websocket-client==1.6.4
+websocket-client==1.7.0
     # via
     #   -r requirements.txt
     #   ops
-zipp==3.17.0
+zipp==3.18.1
     # via
     #   -r requirements.txt
     #   importlib-resources
diff --git a/requirements.in b/requirements.in
index 938c037..b344561 100644
--- a/requirements.in
+++ b/requirements.in
@@ -2,6 +2,8 @@
 # See LICENSE file for licensing details.

 envoy_data_plane==0.2.5
+# from observability_libs.v0.juju_topology.py
+cosl
 # if unpinned causes problems with installation resulting in module 'platform' has no attribute 'dist'
 oci-image
 ops
diff --git a/requirements.txt b/requirements.txt
index c533742..87c8742 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -12,6 +12,8 @@ certifi==2024.2.2
     # via requests
 charset-normalizer==3.3.2
     # via requests
+cosl==0.0.10
+    # via -r requirements.in
 envoy-data-plane==0.2.5
     # via -r requirements.in
 grpclib==0.4.7
@@ -35,6 +37,7 @@ oci-image==1.0.0
 ops==2.11.0
     # via
     #   -r requirements.in
+    #   cosl
     #   serialized-data-interface
 pkgutil-resolve-name==1.3.10
     # via jsonschema
@@ -44,6 +47,7 @@ python-dateutil==2.9.0.post0
     # via betterproto
 pyyaml==6.0.1
     # via
+    #   cosl
     #   ops
     #   serialized-data-interface
 requests==2.31.0
@@ -54,6 +58,8 @@ six==1.16.0
     # via python-dateutil
 stringcase==1.2.0
     # via -r requirements.in
+typing-extensions==4.10.0
+    # via cosl
 urllib3==2.2.1
     # via requests
 websocket-client==1.7.0
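The `cosl` pin above replaces the vendored `observability_libs.v0.juju_topology` module, per the comment in `requirements.in`. A quick sanity-check sketch of the replacement import; the constructor arguments and the `identifier` format are assumptions based on the `juju_*` labels used throughout this patch:

```python
# Minimal sketch, assuming cosl==0.0.10 as pinned above: JujuTopology now
# comes from cosl instead of observability_libs.v0.juju_topology.
from cosl import JujuTopology

topology = JujuTopology(
    model="mymodel",  # example values, not taken from this patch
    model_uuid="00000000-0000-4000-8000-000000000000",
    application="myapp",
    unit="myapp/0",
)
print(topology.identifier)  # assumed "<model>_<uuid>_<application>" form
```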