diff --git a/alerts/google-gce/metadata.yaml b/alerts/google-gce/metadata.yaml index d9cd459a7b..f494ba50d0 100644 --- a/alerts/google-gce/metadata.yaml +++ b/alerts/google-gce/metadata.yaml @@ -55,3 +55,17 @@ alert_policy_templates: related_integrations: - id: gce platform: GCP +- + id: reservation-utilization-too-high + description: "Monitors reservation utilization across all GCE Reservations in the current project and will notify you if the utilization rises above 90%. Reservation utilization is (in use count / reserved count)." + version: 1 + related_integrations: + - id: gce + platform: GCP +- + id: reservation-utilization-too-low + description: "Monitors reservation utilization across all GCE Reservations in the current project and will notify you if the utilization falls below 10% for 20 of the past 23 hours. Reservation utilization is (in use count / reserved count)." + version: 1 + related_integrations: + - id: gce + platform: GCP diff --git a/alerts/google-gce/reservation-utilization-too-high.v1.json b/alerts/google-gce/reservation-utilization-too-high.v1.json new file mode 100644 index 0000000000..7655ddbf05 --- /dev/null +++ b/alerts/google-gce/reservation-utilization-too-high.v1.json @@ -0,0 +1,21 @@ +{ + "displayName": "Reservation - High Utilization", + "userLabels": {}, + "conditions": [ + { + "displayName": "High Reservation Utilization", + "conditionMonitoringQueryLanguage": { + "duration": "0s", + "query": "fetch compute.googleapis.com/Reservation\n|\n{ metric 'compute.googleapis.com/reservation/used'\n| align next_older(5m) | every 5m ;\nmetric 'compute.googleapis.com/reservation/reserved'\n| align next_older(5m) | every 5m\n}\n| ratio\n| condition val() >= 0.9", + "trigger": { + "count": 1 + } + } + } + ], + "alertStrategy": { + "autoClose": "604800s" + }, + "combiner": "OR", + "enabled": true + } \ No newline at end of file diff --git a/alerts/google-gce/reservation-utilization-too-low.v1.json 
b/alerts/google-gce/reservation-utilization-too-low.v1.json new file mode 100644 index 0000000000..85bcb4a9aa --- /dev/null +++ b/alerts/google-gce/reservation-utilization-too-low.v1.json @@ -0,0 +1,21 @@ +{ + "displayName": "Reservation - Low Utilization", + "userLabels": {}, + "conditions": [ + { + "displayName": "Low Usage for 20 hours out of 23 hours", + "conditionMonitoringQueryLanguage": { + "duration": "0s", + "query": "fetch compute.googleapis.com/Reservation\n|\n{ metric 'compute.googleapis.com/reservation/used'\n| align next_older(5m) | every 5m ;\nmetric 'compute.googleapis.com/reservation/reserved'\n| align next_older(5m) | every 5m\n}\n| ratio\n| value val() <= 0.1\n| count_true_aligner(23h)\n| condition val() > 20 * 12 # 20 hours * (12 5 min intervals in hour)", + "trigger": { + "count": 1 + } + } + } + ], + "alertStrategy": { + "autoClose": "604800s" + }, + "combiner": "OR", + "enabled": true + } \ No newline at end of file diff --git a/dashboards/argo-workflows/README.md b/dashboards/argo-workflows/README.md index c1879aa1ff..73b2ad648e 100644 --- a/dashboards/argo-workflows/README.md +++ b/dashboards/argo-workflows/README.md @@ -7,4 +7,4 @@ |Argo Workflows Prometheus| |:------------------| |Filename: [argo-workflows-prometheus.json](argo-workflows-prometheus.json)| -|This dashboard has charts displaying: `Running Workflows`, `Pending Workflows`, `Skipped Workflows`, `Succeeded Workflows`, `Failed Workflows`, `Errors`, `Operation Duration (seconds)`, `Queue Adds`, `Queue Depth`, and `Queue Latency`| +|This dashboard has charts displaying: `Running Workflows`, `Pending Workflows`, `Skipped Workflows`, `Succeeded Workflows`, `Workflows With Pods Not Running`, `Failed Workflows`, `Errors`, `Operation Duration Seconds`, `Kubernetes Request Rates`, `Queue Adds`, `Queue Depth`, and `Queue Latency`| diff --git a/dashboards/argo-workflows/argo-workflows-prometheus.json b/dashboards/argo-workflows/argo-workflows-prometheus.json index c862c96714..7b6afc08d5 
100644 --- a/dashboards/argo-workflows/argo-workflows-prometheus.json +++ b/dashboards/argo-workflows/argo-workflows-prometheus.json @@ -1,5 +1,5 @@ { - "category": "CUSTOM", + "displayName": "Argo Workflows Prometheus Overview", "dashboardFilters": [ { "filterType": "RESOURCE_LABEL", @@ -17,115 +17,125 @@ "templateVariable": "Namespace" } ], - "displayName": "Argo Workflows Prometheus Overview", - "labels": {}, "mosaicLayout": { - "columns": 12, + "columns": 48, "tiles": [ { - "height": 2, + "xPos": 8, + "width": 8, + "height": 8, "widget": { + "title": "Pending", "scorecard": { "blankView": {}, + "thresholds": [], "timeSeriesQuery": { - "prometheusQuery": "argo_workflows_count{status=\"Pending\",${Cluster},${Location},${Namespace}}" + "prometheusQuery": "sum(argo_workflows_count{status=\"Pending\",${Cluster},${Location},${Namespace}})", + "unitOverride": "" } - }, - "title": "Pending" - }, - "width": 3, - "xPos": 3, - "yPos": 0 + } + } }, { - "height": 2, + "width": 8, + "height": 8, "widget": { + "title": "Running", "scorecard": { "blankView": {}, + "thresholds": [], "timeSeriesQuery": { - "prometheusQuery": "argo_workflows_count{status=\"Running\",${Cluster},${Location},${Namespace}}" + "prometheusQuery": "sum(argo_workflows_count{status=\"Running\",${Cluster},${Location},${Namespace}})", + "unitOverride": "" } - }, - "title": "Running" - }, - "width": 3, - "xPos": 0, - "yPos": 0 + } + } }, { - "height": 2, + "xPos": 40, + "width": 8, + "height": 8, "widget": { + "title": "Errors", "scorecard": { "blankView": {}, + "thresholds": [], "timeSeriesQuery": { - "prometheusQuery": "argo_workflows_count{status=\"Error\",${Cluster},${Location},${Namespace}}" + "prometheusQuery": "sum(argo_workflows_count{status=\"Error\",${Cluster},${Location},${Namespace}})", + "unitOverride": "" } - }, - "title": "Errors" - }, - "width": 2, - "xPos": 10, - "yPos": 0 + } + } }, { - "height": 2, + "xPos": 32, + "width": 8, + "height": 8, "widget": { + "title": "Failed", "scorecard": { 
"blankView": {}, + "thresholds": [], "timeSeriesQuery": { - "prometheusQuery": "argo_workflows_count{status=\"Failed\",${Cluster},${Location},${Namespace}}" + "prometheusQuery": "sum(argo_workflows_count{status=\"Failed\",${Cluster},${Location},${Namespace}})", + "unitOverride": "" } - }, - "title": "Failed" - }, - "width": 2, - "xPos": 8, - "yPos": 0 + } + } }, { - "height": 2, + "xPos": 16, + "width": 8, + "height": 8, "widget": { + "title": "Succeeded", "scorecard": { "blankView": {}, + "thresholds": [], "timeSeriesQuery": { - "prometheusQuery": "argo_workflows_count{status=\"Succeeded\",${Cluster},${Location},${Namespace}}" + "prometheusQuery": "sum(argo_workflows_count{status=\"Succeeded\",${Cluster},${Location},${Namespace}})", + "unitOverride": "" } - }, - "title": "Succeeded" - }, - "width": 2, - "xPos": 6, - "yPos": 0 + } + } }, { - "height": 4, + "yPos": 8, + "width": 24, + "height": 10, "widget": { - "title": "Operation Duration (seconds)", + "title": "Median Operation Duration Seconds", "xyChart": { "chartOptions": { "mode": "COLOR" }, "dataSets": [ { + "breakdowns": [], + "dimensions": [], + "measures": [], "plotType": "LINE", "targetAxis": "Y1", "timeSeriesQuery": { - "prometheusQuery": "argo_workflows_operation_duration_seconds_sum{${Cluster},${Location},${Namespace}}" + "prometheusQuery": "histogram_quantile(0.50,rate(argo_workflows_operation_duration_seconds_bucket{${Cluster},${Location},${Namespace}}[${__interval}]))", + "unitOverride": "" } } ], + "thresholds": [], "timeshiftDuration": "0s", "yAxis": { + "label": "", "scale": "LINEAR" } } - }, - "width": 12, - "xPos": 0, - "yPos": 2 + } }, { - "height": 4, + "xPos": 24, + "yPos": 34, + "width": 24, + "height": 12, "widget": { "title": "Queue Adds", "xyChart": { @@ -134,25 +144,30 @@ }, "dataSets": [ { + "breakdowns": [], + "dimensions": [], + "measures": [], "plotType": "LINE", "targetAxis": "Y1", "timeSeriesQuery": { - "prometheusQuery": 
"delta(argo_workflows_queue_adds_count{${Cluster},${Location},${Namespace}}[2m])" + "prometheusQuery": "delta(argo_workflows_queue_adds_count{${Cluster},${Location},${Namespace}}[2m])", + "unitOverride": "" } } ], + "thresholds": [], "timeshiftDuration": "0s", "yAxis": { + "label": "", "scale": "LINEAR" } } - }, - "width": 4, - "xPos": 0, - "yPos": 6 + } }, { - "height": 4, + "yPos": 34, + "width": 24, + "height": 12, "widget": { "title": "Queue Depth", "xyChart": { @@ -161,50 +176,129 @@ }, "dataSets": [ { + "breakdowns": [], + "dimensions": [], + "measures": [], "plotType": "LINE", "targetAxis": "Y1", "timeSeriesQuery": { - "prometheusQuery": "argo_workflows_queue_depth_count{${Cluster},${Location},${Namespace}}" + "prometheusQuery": "argo_workflows_queue_depth_count{${Cluster},${Location},${Namespace}}", + "unitOverride": "" } } ], + "thresholds": [], "timeshiftDuration": "0s", "yAxis": { + "label": "", "scale": "LINEAR" } } - }, - "width": 4, - "xPos": 4, - "yPos": 6 + } }, { - "height": 4, + "yPos": 22, + "width": 48, + "height": 12, "widget": { - "title": "Queue Latency", + "title": "Median Queue Latency", "xyChart": { "chartOptions": { "mode": "COLOR" }, "dataSets": [ { + "breakdowns": [], + "dimensions": [], + "measures": [], "plotType": "LINE", "targetAxis": "Y1", "timeSeriesQuery": { - "prometheusQuery": "rate(argo_workflows_queue_latency_sum{${Cluster},${Location},${Namespace}}[1m])" + "prometheusQuery": "histogram_quantile(0.50, sum by (queue_name, le)(rate(argo_workflows_queue_latency_bucket{${Cluster},${Location},${Namespace}}[${__interval}])))", + "unitOverride": "" } } ], + "thresholds": [], "timeshiftDuration": "0s", "yAxis": { + "label": "", "scale": "LINEAR" } } - }, - "width": 4, - "xPos": 8, - "yPos": 6 + } + }, + { + "xPos": 24, + "width": 8, + "height": 8, + "widget": { + "title": "Pods Not Running", + "scorecard": { + "blankView": {}, + "thresholds": [], + "timeSeriesQuery": { + "prometheusQuery": 
"argo_workflows_workflow_condition{status=\"False\",type=\"PodRunning\",${Cluster},${Location},${Namespace}}", + "unitOverride": "" + } + } + } + }, + { + "xPos": 24, + "yPos": 8, + "width": 24, + "height": 10, + "widget": { + "title": "Kubernetes Requests Rates", + "xyChart": { + "chartOptions": { + "mode": "COLOR" + }, + "dataSets": [ + { + "breakdowns": [], + "dimensions": [], + "measures": [], + "plotType": "LINE", + "targetAxis": "Y1", + "timeSeriesQuery": { + "prometheusQuery": "rate(argo_workflows_k8s_request_total{${Cluster},${Location},${Namespace}}[1m])", + "unitOverride": "" + } + } + ], + "thresholds": [], + "timeshiftDuration": "0s", + "yAxis": { + "label": "", + "scale": "LINEAR" + } + } + } + }, + { + "yPos": 18, + "width": 48, + "height": 4, + "widget": { + "title": "Queue", + "text": { + "content": "", + "format": "MARKDOWN", + "style": { + "backgroundColor": "#E7EFFE", + "fontSize": "FS_LARGE", + "horizontalAlignment": "H_LEFT", + "padding": "P_EXTRA_SMALL", + "pointerLocation": "POINTER_LOCATION_UNSPECIFIED", + "textColor": "#212121", + "verticalAlignment": "V_TOP" + } + } + } } ] - } + }, + "labels": {} } diff --git a/dashboards/argo-workflows/argo-workflows-prometheus.png b/dashboards/argo-workflows/argo-workflows-prometheus.png index 0bdd14e58d..4be3c62cc1 100644 Binary files a/dashboards/argo-workflows/argo-workflows-prometheus.png and b/dashboards/argo-workflows/argo-workflows-prometheus.png differ diff --git a/dashboards/argo-workflows/metadata.yaml b/dashboards/argo-workflows/metadata.yaml index 9ddb1e2b11..823f8f7791 100644 --- a/dashboards/argo-workflows/metadata.yaml +++ b/dashboards/argo-workflows/metadata.yaml @@ -3,7 +3,7 @@ sample_dashboards: category: Argo Workflows id: argo-workflows-prometheus display_name: Argo Workflows Prometheus Overview - description: "This dashboard has charts displaying: Running Workflows, Pending Workflows, Skipped Workflows, Succeeded Workflows, Failed Workflows, Errors, Operation Duration (seconds), 
Queue Adds, Queue Depth, and Queue Latency" + description: "This dashboard has charts displaying: Running Workflows, Pending Workflows, Skipped Workflows, Succeeded Workflows, Workflows With Pods Not Running, Failed Workflows, Errors, Operation Duration Seconds, Kubernetes Request Rates, Queue Adds, Queue Depth, and Queue Latency" related_integrations: - id: argo platform: GKE diff --git a/dashboards/consul/README.md b/dashboards/consul/README.md index 461710f010..725cd67b7a 100644 --- a/dashboards/consul/README.md +++ b/dashboards/consul/README.md @@ -7,4 +7,4 @@ |Consul Prometheus| |:------------------| |Filename: [consul-prometheus.json](consul-prometheus.json)| -|This dashboard has charts displaying: `Autopilot Health`, `System Memory Utilized in Bytes`, `GC Pause (milliseconds)`, `Raft Apply Rate`, `KV Store Apply Time`, `Raft Commit Time`, `Client RPC Request Rate`, `Rate Limited RPC Request Rate`, `Failed RPC Request Rate`, `Elections`, and `Leaders Elected`| +|This dashboard has charts displaying: `Autopilot Health`, `System Memory Utilized in Bytes`, `GC Pause Time`, `KV Store Apply Time`, `Raft Commit Time`, `Raft Transaction Rate`, `Transaction Apply Time`, `Client RPC Request Rate`, `Rate Limit Exceeded RPC Requests`, `Failed RPC Request Rate`, `Elections`, `Leaders Elected`, and `Time To Contact Followers`| diff --git a/dashboards/consul/consul-prometheus.01.png b/dashboards/consul/consul-prometheus.01.png deleted file mode 100644 index 4e8c4ccc9c..0000000000 Binary files a/dashboards/consul/consul-prometheus.01.png and /dev/null differ diff --git a/dashboards/consul/consul-prometheus.02.png b/dashboards/consul/consul-prometheus.02.png deleted file mode 100644 index 94f29a7180..0000000000 Binary files a/dashboards/consul/consul-prometheus.02.png and /dev/null differ diff --git a/dashboards/consul/consul-prometheus.03.png b/dashboards/consul/consul-prometheus.03.png deleted file mode 100644 index 48a7ddfef7..0000000000 Binary
files a/dashboards/consul/consul-prometheus.03.png and /dev/null differ diff --git a/dashboards/consul/consul-prometheus.json b/dashboards/consul/consul-prometheus.json index a6f560a9f9..01c8d19df1 100644 --- a/dashboards/consul/consul-prometheus.json +++ b/dashboards/consul/consul-prometheus.json @@ -1,5 +1,5 @@ { - "category": "CUSTOM", + "displayName": "Consul Prometheus Overview", "dashboardFilters": [ { "filterType": "RESOURCE_LABEL", @@ -17,13 +17,13 @@ "templateVariable": "Namespace" } ], - "displayName": "Consul Prometheus Overview", - "labels": {}, "mosaicLayout": { - "columns": 12, + "columns": 48, "tiles": [ { - "height": 3, + "yPos": 24, + "width": 20, + "height": 8, "widget": { "title": "KV Store Apply Time ", "xyChart": { @@ -35,22 +35,24 @@ "plotType": "LINE", "targetAxis": "Y1", "timeSeriesQuery": { - "prometheusQuery": "consul_kvs_apply{${Cluster},${Location},${Namespace}}" + "prometheusQuery": "consul_kvs_apply{${Cluster},${Location},${Namespace}}", + "unitOverride": "" } } ], + "thresholds": [], "timeshiftDuration": "0s", "yAxis": { + "label": "", "scale": "LINEAR" } } - }, - "width": 6, - "xPos": 0, - "yPos": 6 + } }, { - "height": 3, + "yPos": 32, + "width": 24, + "height": 8, "widget": { "title": "Raft Commit Time", "xyChart": { @@ -62,73 +64,58 @@ "plotType": "LINE", "targetAxis": "Y1", "timeSeriesQuery": { - "prometheusQuery": "consul_raft_commitTime{${Cluster},${Location},${Namespace}}" + "prometheusQuery": "consul_raft_commitTime{${Cluster},${Location},${Namespace}}", + "unitOverride": "" } } ], + "thresholds": [], "timeshiftDuration": "0s", "yAxis": { + "label": "", "scale": "LINEAR" } } - }, - "width": 6, - "xPos": 6, - "yPos": 6 - }, - { - "height": 6, - "widget": { - "collapsibleGroup": { - "collapsed": false - }, - "title": "Transaction Timing" - }, - "width": 12, - "xPos": 0, - "yPos": 3 + } }, { - "height": 3, + "xPos": 24, + "yPos": 52, + "width": 24, + "height": 8, "widget": { - "collapsibleGroup": { - "collapsed": false - }, - 
"title": "Leadership" - }, - "width": 12, - "xPos": 0, - "yPos": 15 - }, - { - "height": 3, - "widget": { - "title": "Leaders Elected", + "title": "Leadership Changes", "xyChart": { "chartOptions": { "mode": "COLOR" }, "dataSets": [ { + "breakdowns": [], + "dimensions": [], + "measures": [], "plotType": "LINE", "targetAxis": "Y1", "timeSeriesQuery": { - "prometheusQuery": "consul_raft_state_leader{${Cluster},${Location},${Namespace}}" + "prometheusQuery": "rate(consul_raft_state_leader{${Cluster},${Location},${Namespace}}[1m])", + "unitOverride": "" } } ], + "thresholds": [], "timeshiftDuration": "0s", "yAxis": { + "label": "", "scale": "LINEAR" } } - }, - "width": 6, - "xPos": 6, - "yPos": 15 + } }, { - "height": 3, + "xPos": 24, + "yPos": 44, + "width": 24, + "height": 8, "widget": { "title": "Elections", "xyChart": { @@ -137,25 +124,29 @@ }, "dataSets": [ { + "breakdowns": [], + "dimensions": [], + "measures": [], "plotType": "LINE", "targetAxis": "Y1", "timeSeriesQuery": { - "prometheusQuery": "consul_raft_state_candidate{${Cluster},${Location},${Namespace}}" + "prometheusQuery": "consul_raft_state_candidate{${Cluster},${Location},${Namespace}}", + "unitOverride": "" } } ], + "thresholds": [], "timeshiftDuration": "0s", "yAxis": { + "label": "", "scale": "LINEAR" } } - }, - "width": 6, - "xPos": 0, - "yPos": 15 + } }, { - "height": 3, + "width": 12, + "height": 12, "widget": { "title": "Autopilot Health", "xyChart": { @@ -164,25 +155,30 @@ }, "dataSets": [ { + "breakdowns": [], + "dimensions": [], + "measures": [], "plotType": "LINE", "targetAxis": "Y1", "timeSeriesQuery": { - "prometheusQuery": "consul_autopilot_healthy{${Cluster},${Location},${Namespace}}" + "prometheusQuery": "consul_autopilot_healthy{${Cluster},${Location},${Namespace}}", + "unitOverride": "" } } ], + "thresholds": [], "timeshiftDuration": "0s", "yAxis": { + "label": "", "scale": "LINEAR" } } - }, - "width": 3, - "xPos": 0, - "yPos": 0 + } }, { - "height": 3, + "xPos": 12, + "width": 20, + 
"height": 12, "widget": { "title": "System Memory Utilized in Bytes", "xyChart": { @@ -191,120 +187,169 @@ }, "dataSets": [ { + "breakdowns": [], + "dimensions": [], + "measures": [], "plotType": "LINE", "targetAxis": "Y1", "timeSeriesQuery": { - "prometheusQuery": "consul_runtime_sys_bytes{${Cluster},${Location},${Namespace}}" + "prometheusQuery": "consul_runtime_sys_bytes{${Cluster},${Location},${Namespace}}", + "unitOverride": "" } } ], + "thresholds": [], "timeshiftDuration": "0s", "yAxis": { + "label": "", "scale": "LINEAR" } } - }, - "width": 5, - "xPos": 3, - "yPos": 0 + } }, { - "height": 3, + "xPos": 32, + "width": 16, + "height": 12, "widget": { - "title": "GC Pause (nanoseconds)", + "title": "GC Pause Time", "xyChart": { "chartOptions": { "mode": "COLOR" }, "dataSets": [ { + "breakdowns": [], + "dimensions": [], + "measures": [], "plotType": "LINE", "targetAxis": "Y1", "timeSeriesQuery": { - "prometheusQuery": "rate(consul_runtime_total_gc_pause_ns{${Cluster},${Location},${Namespace}}[5m])" + "prometheusQuery": "rate(consul_runtime_total_gc_pause_ns{${Cluster},${Location},${Namespace}}[5m]) / (1000000000)", + "unitOverride": "" } } ], + "thresholds": [ + { + "label": "", + "targetAxis": "Y1", + "value": 2 + } + ], "timeshiftDuration": "0s", "yAxis": { + "label": "", "scale": "LINEAR" } } - }, - "width": 4, - "xPos": 8, - "yPos": 0 + } }, { - "height": 6, + "xPos": 24, + "yPos": 16, + "width": 24, + "height": 8, "widget": { - "collapsibleGroup": { - "collapsed": false - }, - "title": "Network activity" - }, - "width": 12, - "xPos": 0, - "yPos": 9 + "title": "Client RPC Request Rate", + "xyChart": { + "chartOptions": { + "mode": "COLOR" + }, + "dataSets": [ + { + "breakdowns": [], + "dimensions": [], + "measures": [], + "plotType": "LINE", + "targetAxis": "Y1", + "timeSeriesQuery": { + "prometheusQuery": "rate(consul_client_rpc{${Cluster},${Location},${Namespace}}[1m])", + "unitOverride": "" + } + } + ], + "thresholds": [], + "timeshiftDuration": "0s", + 
"yAxis": { + "label": "", + "scale": "LINEAR" + } + } + } }, { - "height": 3, + "xPos": 24, + "yPos": 24, + "width": 24, + "height": 8, "widget": { - "title": "Client RPC Request Rate", + "title": "Rate Limit Exceeded RPC Requests", "xyChart": { "chartOptions": { "mode": "COLOR" }, "dataSets": [ { + "breakdowns": [], + "dimensions": [], + "measures": [], "plotType": "LINE", "targetAxis": "Y1", "timeSeriesQuery": { - "prometheusQuery": "rate(consul_client_rpc{${Cluster},${Location},${Namespace}}[1m])" + "prometheusQuery": "rate(consul_client_rpc_exceeded{${Cluster},${Location},${Namespace}}[1m])", + "unitOverride": "" } } ], + "thresholds": [], "timeshiftDuration": "0s", "yAxis": { + "label": "", "scale": "LINEAR" } } - }, - "width": 6, - "xPos": 0, - "yPos": 9 + } }, { - "height": 3, + "xPos": 24, + "yPos": 32, + "width": 24, + "height": 8, "widget": { - "title": "Rate Limited RPC Request Rate", + "title": "Failed RPC Request Rate", "xyChart": { "chartOptions": { "mode": "COLOR" }, "dataSets": [ { + "breakdowns": [], + "dimensions": [], + "measures": [], "plotType": "LINE", "targetAxis": "Y1", "timeSeriesQuery": { - "prometheusQuery": "rate(consul_client_rpc_exceeded{${Cluster},${Location},${Namespace}}[1m])" + "prometheusQuery": "rate(consul_client_rpc_failed{${Cluster},${Location},${Namespace}}[1m])", + "unitOverride": "" } } ], + "thresholds": [], "timeshiftDuration": "0s", "yAxis": { + "label": "", "scale": "LINEAR" } } - }, - "width": 6, - "xPos": 6, - "yPos": 9 + } }, { - "height": 3, + "yPos": 16, + "width": 24, + "height": 8, "widget": { - "title": "Failed RPC Request Rate", + "title": "Raft Transaction Rate", "xyChart": { "chartOptions": { "mode": "COLOR" @@ -314,24 +359,47 @@ "plotType": "LINE", "targetAxis": "Y1", "timeSeriesQuery": { - "prometheusQuery": "rate(consul_client_rpc_failed{${Cluster},${Location},${Namespace}}[1m])" + "prometheusQuery": "rate(consul_raft_apply{${Cluster},${Location},${Namespace}}[5m])", + "unitOverride": "" } } ], + 
"thresholds": [], "timeshiftDuration": "0s", "yAxis": { + "label": "", "scale": "LINEAR" } } - }, - "width": 12, - "xPos": 0, - "yPos": 12 + } }, { - "height": 3, + "yPos": 12, + "width": 24, + "height": 4, "widget": { - "title": "Raft Apply Rate", + "title": "Transactions", + "text": { + "content": "", + "format": "MARKDOWN", + "style": { + "backgroundColor": "#C6DAFC", + "fontSize": "FS_LARGE", + "horizontalAlignment": "H_LEFT", + "padding": "P_EXTRA_SMALL", + "pointerLocation": "POINTER_LOCATION_UNSPECIFIED", + "textColor": "#212121", + "verticalAlignment": "V_TOP" + } + } + } + }, + { + "yPos": 40, + "width": 24, + "height": 8, + "widget": { + "title": "Transaction Apply Time", "xyChart": { "chartOptions": { "mode": "COLOR" @@ -341,20 +409,116 @@ "plotType": "LINE", "targetAxis": "Y1", "timeSeriesQuery": { - "prometheusQuery": "rate(consul_raft_apply{${Cluster},${Location},${Namespace}}[5m])" + "prometheusQuery": "consul_txn_apply{${Cluster},${Location},${Namespace}}", + "unitOverride": "" } } ], + "thresholds": [], "timeshiftDuration": "0s", "yAxis": { + "label": "", "scale": "LINEAR" } } - }, - "width": 12, - "xPos": 0, - "yPos": 3 + } + }, + { + "xPos": 24, + "yPos": 12, + "width": 24, + "height": 4, + "widget": { + "title": "Network Activity", + "text": { + "content": "", + "format": "MARKDOWN", + "style": { + "backgroundColor": "#E1BEE7", + "fontSize": "FS_LARGE", + "horizontalAlignment": "H_LEFT", + "padding": "P_EXTRA_SMALL", + "pointerLocation": "POINTER_LOCATION_UNSPECIFIED", + "textColor": "#212121", + "verticalAlignment": "V_TOP" + } + } + } + }, + { + "xPos": 24, + "yPos": 40, + "width": 24, + "height": 4, + "widget": { + "title": "Leadership Health", + "text": { + "content": "", + "format": "MARKDOWN", + "style": { + "backgroundColor": "#C8E6C9", + "fontSize": "FS_LARGE", + "horizontalAlignment": "H_LEFT", + "padding": "P_EXTRA_SMALL", + "pointerLocation": "POINTER_LOCATION_UNSPECIFIED", + "textColor": "#212121", + "verticalAlignment": "V_TOP" + } 
+ } + } + }, + { + "xPos": 24, + "yPos": 60, + "width": 24, + "height": 8, + "widget": { + "title": "Time To Contact Followers", + "xyChart": { + "chartOptions": { + "mode": "COLOR" + }, + "dataSets": [ + { + "plotType": "LINE", + "targetAxis": "Y1", + "timeSeriesQuery": { + "prometheusQuery": "consul_raft_leader_lastContact{${Cluster},${Location},${Namespace}}", + "unitOverride": "" + } + } + ], + "thresholds": [], + "timeshiftDuration": "0s", + "yAxis": { + "label": "", + "scale": "LINEAR" + } + } + } + }, + { + "xPos": 20, + "yPos": 24, + "width": 4, + "height": 8, + "widget": { + "text": { + "content": "Several charts have a key that represents percentiles", + "format": "MARKDOWN", + "style": { + "backgroundColor": "#FFFFFF", + "fontSize": "FS_LARGE", + "horizontalAlignment": "H_LEFT", + "padding": "P_EXTRA_SMALL", + "pointerLocation": "POINTER_LOCATION_UNSPECIFIED", + "textColor": "#212121", + "verticalAlignment": "V_TOP" + } + } + } } ] - } + }, + "labels": {} } diff --git a/dashboards/consul/consul-prometheus.png b/dashboards/consul/consul-prometheus.png new file mode 100644 index 0000000000..a05a92269c Binary files /dev/null and b/dashboards/consul/consul-prometheus.png differ diff --git a/dashboards/consul/metadata.yaml b/dashboards/consul/metadata.yaml index 9da0411d49..83952829a5 100644 --- a/dashboards/consul/metadata.yaml +++ b/dashboards/consul/metadata.yaml @@ -4,7 +4,7 @@ sample_dashboards: id: consul-prometheus display_name: Consul Prometheus Overview description: |- - This dashboard has charts displaying: Autopilot Health, System Memory Utilized in Bytes, GC Pause (milliseconds), Raft Apply Rate, KV Store Apply Time, Raft Commit Time, Client RPC Request Rate, Rate Limited RPC Request Rate, Failed RPC Request Rate, Elections, and Leaders Elected + This dashboard has charts displaying: Autopilot Health, System Memory Utilized in Bytes, GC Pause Time, KV Store Apply Time, Raft Commit Time, Raft Transaction Rate, Transaction Apply Time, Client RPC 
Request Rate, Rate Limit Exceeded RPC Requests, Failed RPC Request Rate, Elections, Leaders Elected, and Time To Contact Followers. related_integrations: - id: consul platform: GKE diff --git a/dashboards/google-cloud-agent-for-sap/agent-for-sap-hana-availability-monitoring.01.png b/dashboards/google-cloud-agent-for-sap/agent-for-sap-hana-availability-monitoring.01.png new file mode 100644 index 0000000000..81713b296d Binary files /dev/null and b/dashboards/google-cloud-agent-for-sap/agent-for-sap-hana-availability-monitoring.01.png differ diff --git a/dashboards/google-cloud-agent-for-sap/agent-for-sap-hana-availability-monitoring.json b/dashboards/google-cloud-agent-for-sap/agent-for-sap-hana-availability-monitoring.json new file mode 100644 index 0000000000..292a397c5c --- /dev/null +++ b/dashboards/google-cloud-agent-for-sap/agent-for-sap-hana-availability-monitoring.json @@ -0,0 +1,184 @@ +{ + "displayName": "SAP HANA Availability Monitoring", + "dashboardFilters": [ + { + "filterType": "SYSTEM_METADATA_LABEL", + "labelKey": "name", + "stringValue": "", + "templateVariable": "" + }, + { + "filterType": "METRIC_LABEL", + "labelKey": "sid", + "stringValue": "", + "templateVariable": "" + } + ], + "mosaicLayout": { + "columns": 48, + "tiles": [ + { + "width": 24, + "height": 16, + "widget": { + "title": "HANA System Availability", + "xyChart": { + "chartOptions": { + "mode": "COLOR" + }, + "dataSets": [ + { + "breakdowns": [], + "dimensions": [], + "legendTemplate": "${labels.metadata\\.system\\.name} ${labels.metric\\.sid}:${labels.metric\\.instance_nr} ${labels.status}", + "measures": [], + "plotType": "LINE", + "targetAxis": "Y1", + "timeSeriesQuery": { + "timeSeriesQueryLanguage": "fetch gce_instance\n| metric 'workload.googleapis.com/sap/hana/availability'\n| group_by [metadata.system_labels.name, metric.sid,metric.instance_nr], \n| map add[status:\n if(val() == 1, 'All processes are active','One or more processes are not active')]", + "unitOverride": "" + } 
+ } + ], + "thresholds": [], + "timeshiftDuration": "0s", + "yAxis": { + "label": "", + "scale": "LINEAR" + } + } + } + }, + { + "xPos": 24, + "yPos": 16, + "width": 24, + "height": 16, + "widget": { + "title": "HANA High Availability Status", + "xyChart": { + "chartOptions": { + "mode": "COLOR" + }, + "dataSets": [ + { + "breakdowns": [], + "dimensions": [], + "legendTemplate": "${labels.metadata\\.system\\.name} ${labels.metric\\.sid}:${labels.metric\\.instance_nr} ${labels.status}", + "measures": [], + "plotType": "LINE", + "targetAxis": "Y1", + "timeSeriesQuery": { + "timeSeriesQueryLanguage": "fetch gce_instance\n| metric 'workload.googleapis.com/sap/hana/ha/availability'\n| group_by[metadata.system_labels.name, metric.sid,metric.instance_nr]\n| every 1m\n| map add[status:\n if(val() == 4, ' is primary, in sync',\n if(val() == 3, ' is primary, not in sync',\n if(val() == 2, ' is primary but has sync error',\n if(val() == 1, ' is secondary',' Unknown'))))]\n", + "unitOverride": "" + } + } + ], + "thresholds": [], + "timeshiftDuration": "0s", + "yAxis": { + "label": "", + "scale": "LINEAR" + } + } + } + }, + { + "yPos": 16, + "width": 24, + "height": 16, + "widget": { + "title": "HANA Service Availability", + "xyChart": { + "chartOptions": { + "mode": "COLOR" + }, + "dataSets": [ + { + "breakdowns": [], + "dimensions": [], + "legendTemplate": "${labels.metadata\\.system\\.name} ${labels.metric\\.sid}:${labels.metric\\.instance_nr} ${labels.metric\\.service_name} ${labels.status}", + "measures": [], + "plotType": "LINE", + "targetAxis": "Y1", + "timeSeriesQuery": { + "timeSeriesQueryLanguage": "fetch gce_instance\n| metric 'workload.googleapis.com/sap/hana/service'\n| group_by [ metadata.system_labels.name, metric.sid,metric.instance_nr, metric.service_name]\n| map add[status:\n if(val() == 1, 'Running','Not running')]", + "unitOverride": "" + } + } + ], + "thresholds": [], + "timeshiftDuration": "0s", + "yAxis": { + "label": "", + "scale": "LINEAR" + } + } + } + 
}, + { + "xPos": 24, + "width": 24, + "height": 16, + "widget": { + "title": "HANA Replication Status", + "xyChart": { + "chartOptions": { + "mode": "COLOR" + }, + "dataSets": [ + { + "breakdowns": [], + "dimensions": [], + "legendTemplate": "${labels.metadata\\.system\\.name} ${labels.metric\\.sid}:${labels.metric\\.instance_nr} ${labels.status}", + "measures": [], + "plotType": "LINE", + "targetAxis": "Y1", + "timeSeriesQuery": { + "timeSeriesQueryLanguage": "fetch gce_instance\n| metric 'workload.googleapis.com/sap/hana/ha/replication'\n| group_by [metadata.system_labels.name, metric.sid]\n| map add[status:\n if(val() == 15, 'is Primary',\n if(val() == 12, 'is Secondary',\n if(val() == 10, 'has replication disabled','in status ERROR')))]", + "unitOverride": "" + } + } + ], + "thresholds": [], + "timeshiftDuration": "0s", + "yAxis": { + "label": "", + "scale": "LINEAR" + } + } + } + }, + { + "xPos": 24, + "yPos": 32, + "width": 24, + "height": 16, + "widget": { + "title": "Pacemaker Resource Agent Status", + "xyChart": { + "chartOptions": { + "mode": "COLOR" + }, + "dataSets": [ + { + "plotType": "LINE", + "targetAxis": "Y1", + "timeSeriesQuery": { + "timeSeriesQueryLanguage": "fetch gce_instance\n| metric 'workload.googleapis.com/sap/cluster/resources'\n| map add[status:\n if(val() == 3, ' Running',\n if(val() == 2, ' Starting',\n if(val() == 1, ' Stopped',\n if(val() == 0, ' Failed',' Unknown'))))]", + "unitOverride": "" + } + } + ], + "thresholds": [], + "yAxis": { + "label": "", + "scale": "LOG10" + } + } + } + } + ] + }, + "labels": {} +} \ No newline at end of file diff --git a/dashboards/google-cloud-agent-for-sap/agent-for-sap-netweaver-availability-monitoring.01.png b/dashboards/google-cloud-agent-for-sap/agent-for-sap-netweaver-availability-monitoring.01.png new file mode 100644 index 0000000000..187a6ae8ef Binary files /dev/null and b/dashboards/google-cloud-agent-for-sap/agent-for-sap-netweaver-availability-monitoring.01.png differ diff --git 
a/dashboards/google-cloud-agent-for-sap/agent-for-sap-netweaver-availability-monitoring.json b/dashboards/google-cloud-agent-for-sap/agent-for-sap-netweaver-availability-monitoring.json new file mode 100644 index 0000000000..3a55b34e03 --- /dev/null +++ b/dashboards/google-cloud-agent-for-sap/agent-for-sap-netweaver-availability-monitoring.json @@ -0,0 +1,120 @@ +{ + "displayName": "SAP Netweaver Availability Monitoring", + "dashboardFilters": [ + { + "filterType": "SYSTEM_METADATA_LABEL", + "labelKey": "name", + "stringValue": "", + "templateVariable": "" + }, + { + "filterType": "METRIC_LABEL", + "labelKey": "sid", + "stringValue": "", + "templateVariable": "" + } + ], + "mosaicLayout": { + "columns": 48, + "tiles": [ + { + "width": 48, + "height": 16, + "widget": { + "title": "Netweaver System Availability", + "xyChart": { + "chartOptions": { + "mode": "COLOR" + }, + "dataSets": [ + { + "breakdowns": [], + "dimensions": [], + "legendTemplate": "${labels.metadata\\.system\\.name} ${labels.metric\\.sid}:${labels.metric\\.instance_nr} = ${labels.status}", + "measures": [], + "plotType": "LINE", + "targetAxis": "Y1", + "timeSeriesQuery": { + "timeSeriesQueryLanguage": "fetch gce_instance\n| metric 'workload.googleapis.com/sap/nw/availability'\n| group_by [metadata.system_labels.name, metric.sid,metric.instance_nr]\n| map add[status:\n if(val() == 1, 'Online','Unknown state')]", + "unitOverride": "" + } + } + ], + "thresholds": [], + "timeshiftDuration": "0s", + "yAxis": { + "label": "", + "scale": "LINEAR" + } + } + } + }, + { + "yPos": 16, + "width": 48, + "height": 16, + "widget": { + "title": "Netweaver Service Availability", + "xyChart": { + "chartOptions": { + "mode": "COLOR" + }, + "dataSets": [ + { + "breakdowns": [], + "dimensions": [], + "legendTemplate": "${labels.metadata\\.system\\.name} ${labels.metric\\.sid}:${labels.metric\\.instance_nr} ${labels.metric\\.service_name} = ${labels.status}", + "measures": [], + "plotType": "LINE", + "targetAxis": "Y1", 
+ "timeSeriesQuery": { + "timeSeriesQueryLanguage": "fetch gce_instance\n| metric 'workload.googleapis.com/sap/nw/service'\n| group_by [metadata.system.name,metric.sid,metric.instance_nr,metric.service_name]\n| map add[status:\n if(val() == 1, 'Running','Not running')]", + "unitOverride": "" + } + } + ], + "thresholds": [], + "timeshiftDuration": "0s", + "yAxis": { + "label": "", + "scale": "LINEAR" + } + } + } + }, + { + "yPos": 32, + "width": 48, + "height": 16, + "widget": { + "title": "Pacemaker Resource Status", + "xyChart": { + "chartOptions": { + "mode": "COLOR" + }, + "dataSets": [ + { + "breakdowns": [], + "dimensions": [], + "legendTemplate": "${labels.metric\\.node} ${labels.metric\\.sid} ${labels.metric\\.resource} = ${labels.status}", + "measures": [], + "plotType": "LINE", + "targetAxis": "Y1", + "timeSeriesQuery": { + "timeSeriesQueryLanguage": "fetch gce_instance\n| metric 'workload.googleapis.com/sap/cluster/resources'\n| every 1m\n| map add[status:\n if(val() == 3, ' Running',\n if(val() == 2, ' Starting',\n if(val() == 1, ' Stopped',\n if(val() == 0, ' Failed',' Unknown'))))]", + "unitOverride": "" + } + } + ], + "thresholds": [], + "yAxis": { + "label": "", + "scale": "LINEAR" + } + } + } + } + ] + }, + "labels": {} +} \ No newline at end of file diff --git a/dashboards/google-cloud-agent-for-sap/metadata.yaml b/dashboards/google-cloud-agent-for-sap/metadata.yaml index 5cd9010ef7..c731ba1f52 100644 --- a/dashboards/google-cloud-agent-for-sap/metadata.yaml +++ b/dashboards/google-cloud-agent-for-sap/metadata.yaml @@ -1,10 +1,26 @@ sample_dashboards: + - + category: Agent for SAP + id: agent-for-sap-hana-availability-monitoring + display_name: Agent for SAP - HANA Availability Monitoring + description: "This dashboard is based on [Google Cloud’s Agent for SAP](https://cloud.google.com/solutions/sap/docs/agent-for-sap/latest/all-guides). It provides an overview of the availability of an SAP HANA system. 
The availability of a standalone SAP HANA system is derived from the status of the instance's services, such as `hdbindexserver` and `hdbnameserver`. For highly-available SAP HANA deployments, the availability is also derived from the replication status between the primary and secondary sites, and the dashboard also shows the status of the individual Pacemaker resource agents." + related_integrations: + - id: google_agent_for_sap + platform: GCP + + - + category: Agent for SAP + id: agent-for-sap-netweaver-availability-monitoring + display_name: Agent for SAP - NetWeaver Availability Monitoring + description: "This dashboard is based on [Google Cloud Agent for SAP](https://cloud.google.com/solutions/sap/docs/agent-for-sap/latest/all-guides). It provides an overview of the availability of SAP NetWeaver systems, which is derived from the instance's service status of the underlying CS, ERS, and application servers. For highly-available SAP NetWeaver deployments, the dashboard also shows the status of the individual Pacemaker resource agents." + related_integrations: + - id: google_agent_for_sap + platform: GCP - category: Agent for SAP id: agent-for-sap-hana-overview display_name: Agent for SAP - HANA Overview - description: |- - This dashboard is based on [Google Cloud Agent for SAP](https://cloud.google.com/solutions/sap/docs/agent-for-sap/latest/all-guides) and has status overview charts displaying HANA Availability, HANA High Availability, HANA HA Replication, HANA Service Status, HA Cluster - Node State,HA Cluster - Resource State for SAP HANA Instances. + description: "This dashboard is based on [Google Cloud Agent for SAP](https://cloud.google.com/solutions/sap/docs/agent-for-sap/latest/all-guides) and has status overview charts displaying HANA Availability, HANA High Availability, HANA HA Replication, HANA Service Status, HA Cluster - Node State, HA Cluster - Resource State for SAP HANA Instances."
related_integrations: - id: google_agent_for_sap platform: GCP @@ -17,7 +33,7 @@ sample_dashboards: related_integrations: - id: google_agent_for_sap platform: GCP - + - category: Agent for SAP id: agent-for-sap-hana-performance @@ -34,5 +50,4 @@ sample_dashboards: description: "This dashboard is based on Google's [Google Cloud Agent for SAP](https://cloud.google.com/solutions/sap/docs/agent-for-sap/latest/all-guides) and [Google's Ops Agent](https://cloud.google.com/stackdriver/docs/solutions/agents/ops-agent/third-party/sap_hana#monitored-metrics) to provide a detailed view on metrics like has charts displaying Instance Memory, Service Memory Used, Schema Estimated Max Memory, Rowstore Memory Size, Schema Operation Outliers, Schema Record Count Outliers, System Connections, Alerts, and Blocked Transactions from SAP HANA as well as charts of infrastructure related metrics for the running SAP HANA VMs: CPU % Top 5 VMs, Memory % Top 5 VMs, and Hosts by Region." related_integrations: - id: google_agent_for_sap - platform: GCP - \ No newline at end of file + platform: GCP \ No newline at end of file diff --git a/dashboards/hbase/README.md b/dashboards/hbase/README.md index 2e26d01df6..266f09db38 100644 --- a/dashboards/hbase/README.md +++ b/dashboards/hbase/README.md @@ -18,9 +18,9 @@ |Hbase Prometheus Overview| |:------------------| |Filename: [hbase-prometheus-overview.json](hbase-prometheus-overview.json)| -|This dashboard is based on metrics exposed by an [exporter](https://github.com/prometheus/jmx_exporter), including `Region Servers`, `Dead Region Servers`, `Read Requests`, and `Write Requests`. 
| +|This dashboard is based on metrics exposed by an [exporter](https://github.com/prometheus/jmx_exporter), including `Region Servers`, `Dead Region Servers`, `Regions in transition`, `Regions in transition longer than threshold`, `Oldest Region in transition age`, `Region Count`, `Authentication Attempts`, `Open Connections`, `Active RPC Handlers`, `Total Requests`, `Read Requests`, `Write Requests`, `Block Cache Hits`, `Block Cache Misses`, `Cache-Enabled Request Hit Percent`, `Store File Count`, `Store File Size (bytes)`, `Local Store File Data %`, `Unarchived Write Ahead Logs`, `Skip Write Ahead Log Writes`, `Enqueued User Requests`, `Enqueued Replication Requests`, `Enqueued Priority Requests`, `Memstore Flush Queue Depth`, `Memstore Flushing Update Block Time (ms)`, and `Compaction Request Queue Depth`. | -|Hbase Operations Prometheus Overview| +|Hbase Region Server Operations Prometheus Overview| |:------------------| -|Filename: [hbase-operations-prometheus-overview.json](hbase-operations-prometheus-overview.json)| -|This dashboard is based on metrics exposed by an [exporter](https://github.com/prometheus/jmx_exporter), including `Slow Get Operations`, `Slow Append Operations`, `Slow Delete Operations`, `Slow Put Operations`, `Slow Increment Operations`, `Get Operations 99th Percentile`, `Append Operations 99th Percentile`, `Delete Operations 99th Percentile`, `Put Operations 99th Percentile`, `Increment Operations 99th Percentile`, and `Replay Operations 99th Percentile`.
| +|Filename: [hbase-region-server-operations-prometheus-overview.json](hbase-region-server-operations-prometheus-overview.json)| +|This dashboard is based on metrics exposed by an [exporter](https://github.com/prometheus/jmx_exporter), including `Get Operations`, `Slow Get Operations`, `Get Operations Average Time`, `Get Operations Time 99th Percentile`, `Append Operations`, `Slow Append Operations`, `Append Operations Average Time`, `Append Operations Time 99th Percentile`, `Delete Operations`, `Slow Delete Operations`, `Delete Operations Average Time`, `Delete Operations Time 99th Percentile`, `Put Operations`, `Slow Put Operations`, `Put Operations Average Time`, `Put Operations Time 99th Percentile`, `Increment Operations`, `Slow Increment Operations`, `Increment Operations Average Time`, `Increment Operations Time 99th Percentile`, `Replay Operations`, `Replay Operations Average Time`, and `Replay Operations Time 99th Percentile`. | diff --git a/dashboards/hbase/hbase-operations-prometheus-overview.01.png b/dashboards/hbase/hbase-operations-prometheus-overview.01.png deleted file mode 100644 index 433d807e97..0000000000 Binary files a/dashboards/hbase/hbase-operations-prometheus-overview.01.png and /dev/null differ diff --git a/dashboards/hbase/hbase-operations-prometheus-overview.02.png b/dashboards/hbase/hbase-operations-prometheus-overview.02.png deleted file mode 100644 index 4ca54ced55..0000000000 Binary files a/dashboards/hbase/hbase-operations-prometheus-overview.02.png and /dev/null differ diff --git a/dashboards/hbase/hbase-operations-prometheus-overview.json b/dashboards/hbase/hbase-operations-prometheus-overview.json deleted file mode 100644 index 464d3d03dd..0000000000 --- a/dashboards/hbase/hbase-operations-prometheus-overview.json +++ /dev/null @@ -1,306 +0,0 @@ -{ - "category": "CUSTOM", - "displayName": "HBase Operations Prometheus Overview", - "mosaicLayout": { - "columns": 12, - "tiles": [ - { - "height": 4, - "widget": { - "title": "Slow 
Get Operations", - "xyChart": { - "chartOptions": { - "mode": "COLOR" - }, - "dataSets": [ - { - "plotType": "LINE", - "targetAxis": "Y1", - "timeSeriesQuery": { - "prometheusQuery": "hbase_RegionServer_Server_slowGetCount" - } - } - ], - "timeshiftDuration": "0s", - "yAxis": { - "scale": "LINEAR" - } - } - }, - "width": 6, - "xPos": 0, - "yPos": 0 - }, - { - "height": 4, - "widget": { - "title": "Get Operations Time 99th Percentile", - "xyChart": { - "chartOptions": { - "mode": "COLOR" - }, - "dataSets": [ - { - "plotType": "LINE", - "targetAxis": "Y1", - "timeSeriesQuery": { - "prometheusQuery": "hbase_getTime_99th_percentile" - } - } - ], - "timeshiftDuration": "0s", - "yAxis": { - "scale": "LINEAR" - } - } - }, - "width": 6, - "xPos": 6, - "yPos": 0 - }, - { - "height": 4, - "widget": { - "title": "Slow Append Operations", - "xyChart": { - "chartOptions": { - "mode": "COLOR" - }, - "dataSets": [ - { - "plotType": "LINE", - "targetAxis": "Y1", - "timeSeriesQuery": { - "prometheusQuery": "rate(hbase_RegionServer_Server_slowAppendCount[5m])" - } - } - ], - "timeshiftDuration": "0s", - "yAxis": { - "scale": "LINEAR" - } - } - }, - "width": 6, - "xPos": 0, - "yPos": 4 - }, - { - "height": 4, - "widget": { - "title": "Append Operations Time 99th Percentile", - "xyChart": { - "chartOptions": { - "mode": "COLOR" - }, - "dataSets": [ - { - "plotType": "LINE", - "targetAxis": "Y1", - "timeSeriesQuery": { - "prometheusQuery": "hbase_appendTime_99th_percentile" - } - } - ], - "timeshiftDuration": "0s", - "yAxis": { - "scale": "LINEAR" - } - } - }, - "width": 6, - "xPos": 6, - "yPos": 4 - }, - { - "height": 4, - "widget": { - "title": "Slow Delete Operations", - "xyChart": { - "chartOptions": { - "mode": "COLOR" - }, - "dataSets": [ - { - "plotType": "LINE", - "targetAxis": "Y1", - "timeSeriesQuery": { - "prometheusQuery": "rate(hbase_RegionServer_Server_slowDeleteCount[5m])" - } - } - ], - "timeshiftDuration": "0s", - "yAxis": { - "scale": "LINEAR" - } - } - }, - "width": 
6, - "xPos": 0, - "yPos": 8 - }, - { - "height": 4, - "widget": { - "title": "Delete Operations Time 99th Percentile", - "xyChart": { - "chartOptions": { - "mode": "COLOR" - }, - "dataSets": [ - { - "plotType": "LINE", - "targetAxis": "Y1", - "timeSeriesQuery": { - "prometheusQuery": "hbase_deleteTime_99th_percentile" - } - } - ], - "timeshiftDuration": "0s", - "yAxis": { - "scale": "LINEAR" - } - } - }, - "width": 6, - "xPos": 6, - "yPos": 8 - }, - { - "height": 4, - "widget": { - "title": "Slow Put Operations", - "xyChart": { - "chartOptions": { - "mode": "COLOR" - }, - "dataSets": [ - { - "plotType": "LINE", - "targetAxis": "Y1", - "timeSeriesQuery": { - "prometheusQuery": "rate(hbase_RegionServer_Server_slowPutCount[5m])" - } - } - ], - "timeshiftDuration": "0s", - "yAxis": { - "scale": "LINEAR" - } - } - }, - "width": 6, - "xPos": 0, - "yPos": 12 - }, - { - "height": 4, - "widget": { - "title": "Put Operations Time 99th Percentile", - "xyChart": { - "chartOptions": { - "mode": "COLOR" - }, - "dataSets": [ - { - "plotType": "LINE", - "targetAxis": "Y1", - "timeSeriesQuery": { - "prometheusQuery": "hbase_putTime_99th_percentile" - } - } - ], - "timeshiftDuration": "0s", - "yAxis": { - "scale": "LINEAR" - } - } - }, - "width": 6, - "xPos": 6, - "yPos": 12 - }, - { - "height": 4, - "widget": { - "title": "Slow Increment Operations", - "xyChart": { - "chartOptions": { - "mode": "COLOR" - }, - "dataSets": [ - { - "plotType": "LINE", - "targetAxis": "Y1", - "timeSeriesQuery": { - "prometheusQuery": "rate(hbase_RegionServer_Server_slowIncrementCount[5m])" - } - } - ], - "timeshiftDuration": "0s", - "yAxis": { - "scale": "LINEAR" - } - } - }, - "width": 6, - "xPos": 0, - "yPos": 16 - }, - { - "height": 4, - "widget": { - "title": "Increment Operations Time 99th Percentile", - "xyChart": { - "chartOptions": { - "mode": "COLOR" - }, - "dataSets": [ - { - "plotType": "LINE", - "targetAxis": "Y1", - "timeSeriesQuery": { - "prometheusQuery": 
"hbase_incrementTime_99th_percentile" - } - } - ], - "timeshiftDuration": "0s", - "yAxis": { - "scale": "LINEAR" - } - } - }, - "width": 6, - "xPos": 6, - "yPos": 16 - }, - { - "height": 4, - "widget": { - "title": "Replay Operations 99th Percentile", - "xyChart": { - "chartOptions": { - "mode": "COLOR" - }, - "dataSets": [ - { - "plotType": "LINE", - "targetAxis": "Y1", - "timeSeriesQuery": { - "prometheusQuery": "hbase_RegionServer_Server_Replay_99th_percentile" - } - } - ], - "timeshiftDuration": "0s", - "yAxis": { - "scale": "LINEAR" - } - } - }, - "width": 6, - "xPos": 0, - "yPos": 20 - } - ] - } -} diff --git a/dashboards/hbase/hbase-prometheus-overview.01.png b/dashboards/hbase/hbase-prometheus-overview.01.png new file mode 100644 index 0000000000..f6dfae0f12 Binary files /dev/null and b/dashboards/hbase/hbase-prometheus-overview.01.png differ diff --git a/dashboards/hbase/hbase-prometheus-overview.02.png b/dashboards/hbase/hbase-prometheus-overview.02.png new file mode 100644 index 0000000000..2b36f0426a Binary files /dev/null and b/dashboards/hbase/hbase-prometheus-overview.02.png differ diff --git a/dashboards/hbase/hbase-prometheus-overview.json b/dashboards/hbase/hbase-prometheus-overview.json index 579652c1ff..6b4774a739 100644 --- a/dashboards/hbase/hbase-prometheus-overview.json +++ b/dashboards/hbase/hbase-prometheus-overview.json @@ -1,11 +1,72 @@ { "category": "CUSTOM", "displayName": "HBase Prometheus Overview", + "dashboardFilters": [ + { + "filterType": "RESOURCE_LABEL", + "labelKey": "cluster", + "templateVariable": "Cluster" + }, + { + "filterType": "RESOURCE_LABEL", + "labelKey": "location", + "templateVariable": "Location" + }, + { + "filterType": "RESOURCE_LABEL", + "labelKey": "namespace", + "templateVariable": "Namespace" + } + ], "mosaicLayout": { - "columns": 12, + "columns": 48, "tiles": [ { + "yPos": 8, + "width": 48, "height": 4, + "widget": { + "title": "Master", + "text": { + "content": "", + "format": "MARKDOWN", + "style": { + 
"backgroundColor": "#C6DAFC", + "fontSize": "FS_LARGE", + "horizontalAlignment": "H_LEFT", + "padding": "P_EXTRA_SMALL", + "pointerLocation": "POINTER_LOCATION_UNSPECIFIED", + "textColor": "#212121", + "verticalAlignment": "V_TOP" + } + } + } + }, + { + "yPos": 36, + "width": 48, + "height": 4, + "widget": { + "title": "Region Server", + "text": { + "content": "", + "format": "MARKDOWN", + "style": { + "backgroundColor": "#C6DAFC", + "fontSize": "FS_LARGE", + "horizontalAlignment": "H_LEFT", + "padding": "P_EXTRA_SMALL", + "pointerLocation": "POINTER_LOCATION_UNSPECIFIED", + "textColor": "#212121", + "verticalAlignment": "V_TOP" + } + } + } + }, + { + "yPos": 12, + "width": 24, + "height": 12, "widget": { "title": "Region Servers", "xyChart": { @@ -14,25 +75,30 @@ }, "dataSets": [ { + "breakdowns": [], + "dimensions": [], + "measures": [], "plotType": "LINE", "targetAxis": "Y1", "timeSeriesQuery": { - "prometheusQuery": "hbase_Master_Server_numRegionServers" + "prometheusQuery": "hbase_Master_Server_numRegionServer{${Cluster},${Location},${Namespace}}", + "unitOverride": "" } } ], - "timeshiftDuration": "0s", + "thresholds": [], "yAxis": { + "label": "", "scale": "LINEAR" } } - }, - "width": 6, - "xPos": 0, - "yPos": 0 + } }, { - "height": 4, + "xPos": 24, + "yPos": 12, + "width": 24, + "height": 12, "widget": { "title": "Dead Region Servers", "xyChart": { @@ -41,25 +107,250 @@ }, "dataSets": [ { + "breakdowns": [], + "dimensions": [], + "measures": [], "plotType": "LINE", "targetAxis": "Y1", "timeSeriesQuery": { - "prometheusQuery": "hbase_Master_Server_numDeadRegionServers" + "prometheusQuery": "hbase_Master_Server_numDeadRegionServers{${Cluster},${Location},${Namespace}}", + "unitOverride": "" } } ], - "timeshiftDuration": "0s", + "thresholds": [], "yAxis": { + "label": "", "scale": "LINEAR" } } - }, - "width": 6, - "xPos": 6, - "yPos": 0 + } }, { - "height": 4, + "yPos": 24, + "width": 16, + "height": 12, + "widget": { + "title": "Regions in transition", + 
"xyChart": { + "chartOptions": { + "mode": "COLOR" + }, + "dataSets": [ + { + "breakdowns": [], + "dimensions": [], + "measures": [], + "plotType": "LINE", + "targetAxis": "Y1", + "timeSeriesQuery": { + "prometheusQuery": "hbase_Master_AssignmentManager_ritCount{${Cluster},${Location},${Namespace}}", + "unitOverride": "" + } + } + ], + "thresholds": [], + "yAxis": { + "label": "", + "scale": "LINEAR" + } + } + } + }, + { + "xPos": 16, + "yPos": 24, + "width": 16, + "height": 12, + "widget": { + "title": "Regions in transition longer than threshold", + "xyChart": { + "chartOptions": { + "mode": "COLOR" + }, + "dataSets": [ + { + "breakdowns": [], + "dimensions": [], + "measures": [], + "plotType": "LINE", + "targetAxis": "Y1", + "timeSeriesQuery": { + "prometheusQuery": "hbase_Master_AssignmentManager_ritCountOverThreshold{${Cluster},${Location},${Namespace}}", + "unitOverride": "" + } + } + ], + "thresholds": [], + "yAxis": { + "label": "", + "scale": "LINEAR" + } + } + } + }, + { + "xPos": 32, + "yPos": 24, + "width": 16, + "height": 12, + "widget": { + "title": "Oldest Region in transition age", + "xyChart": { + "chartOptions": { + "mode": "COLOR" + }, + "dataSets": [ + { + "breakdowns": [], + "dimensions": [], + "measures": [], + "plotType": "LINE", + "targetAxis": "Y1", + "timeSeriesQuery": { + "prometheusQuery": "hbase_Master_AssignmentManager_ritCountOverThreshold{${Cluster},${Location},${Namespace}}", + "unitOverride": "" + } + } + ], + "thresholds": [], + "yAxis": { + "label": "", + "scale": "LINEAR" + } + } + } + }, + { + "yPos": 40, + "width": 24, + "height": 12, + "widget": { + "title": "Region Count", + "xyChart": { + "chartOptions": { + "mode": "COLOR" + }, + "dataSets": [ + { + "breakdowns": [], + "dimensions": [], + "measures": [], + "plotType": "LINE", + "targetAxis": "Y1", + "timeSeriesQuery": { + "prometheusQuery": "hbase_RegionServer_Server_regionCount{${Cluster},${Location},${Namespace}}", + "unitOverride": "" + } + } + ], + "thresholds": [], + 
"yAxis": { + "label": "", + "scale": "LINEAR" + } + } + } + }, + { + "yPos": 96, + "width": 16, + "height": 12, + "widget": { + "title": "Store File Count", + "xyChart": { + "chartOptions": { + "mode": "COLOR" + }, + "dataSets": [ + { + "breakdowns": [], + "dimensions": [], + "measures": [], + "plotType": "LINE", + "targetAxis": "Y1", + "timeSeriesQuery": { + "prometheusQuery": "hbase_RegionServer_Server_storeFileCount{${Cluster},${Location},${Namespace}}", + "unitOverride": "" + } + } + ], + "thresholds": [], + "yAxis": { + "label": "", + "scale": "LINEAR" + } + } + } + }, + { + "xPos": 16, + "yPos": 96, + "width": 16, + "height": 12, + "widget": { + "title": "Store File Size (bytes)", + "xyChart": { + "chartOptions": { + "mode": "COLOR" + }, + "dataSets": [ + { + "breakdowns": [], + "dimensions": [], + "measures": [], + "plotType": "LINE", + "targetAxis": "Y1", + "timeSeriesQuery": { + "prometheusQuery": "hbase_RegionServer_Server_storeFileSize{${Cluster},${Location},${Namespace}}", + "unitOverride": "" + } + } + ], + "thresholds": [], + "yAxis": { + "label": "", + "scale": "LINEAR" + } + } + } + }, + { + "yPos": 68, + "width": 16, + "height": 12, + "widget": { + "title": "Total Requests", + "xyChart": { + "chartOptions": { + "mode": "COLOR" + }, + "dataSets": [ + { + "breakdowns": [], + "dimensions": [], + "measures": [], + "plotType": "LINE", + "targetAxis": "Y1", + "timeSeriesQuery": { + "prometheusQuery": "rate(hbase_RegionServer_Server_totalRequestCount{${Cluster},${Location},${Namespace}}[1m])", + "unitOverride": "" + } + } + ], + "thresholds": [], + "yAxis": { + "label": "", + "scale": "LINEAR" + } + } + } + }, + { + "xPos": 16, + "yPos": 68, + "width": 16, + "height": 12, "widget": { "title": "Read Requests", "xyChart": { @@ -68,25 +359,547 @@ }, "dataSets": [ { + "breakdowns": [], + "dimensions": [], + "measures": [], + "plotType": "LINE", + "targetAxis": "Y1", + "timeSeriesQuery": { + "prometheusQuery": 
"rate(hbase_RegionServer_Server_readRequestCount{${Cluster},${Location},${Namespace}}[1m])", + "unitOverride": "" + } + } + ], + "thresholds": [], + "yAxis": { + "label": "", + "scale": "LINEAR" + } + } + } + }, + { + "yPos": 52, + "width": 24, + "height": 12, + "widget": { + "title": "Open Connections", + "xyChart": { + "chartOptions": { + "mode": "COLOR" + }, + "dataSets": [ + { + "breakdowns": [], + "dimensions": [], + "measures": [], "plotType": "LINE", "targetAxis": "Y1", "timeSeriesQuery": { - "prometheusQuery": "rate(hbase_RegionServer_Server_readRequestCount[1m])" + "prometheusQuery": "hbase_RegionServer_IPC_numOpenConnections{${Cluster},${Location},${Namespace}}", + "unitOverride": "" } } ], - "timeshiftDuration": "0s", + "thresholds": [], "yAxis": { + "label": "", "scale": "LINEAR" } } - }, - "width": 6, - "xPos": 0, - "yPos": 4 + } }, { + "xPos": 24, + "yPos": 52, + "width": 24, + "height": 12, + "widget": { + "title": "Active RPC Handlers", + "xyChart": { + "chartOptions": { + "mode": "COLOR" + }, + "dataSets": [ + { + "breakdowns": [], + "dimensions": [], + "measures": [], + "plotType": "LINE", + "targetAxis": "Y1", + "timeSeriesQuery": { + "prometheusQuery": "hbase_RegionServer_IPC_numActiveHandler{${Cluster},${Location},${Namespace}}", + "unitOverride": "" + } + } + ], + "thresholds": [], + "yAxis": { + "label": "", + "scale": "LINEAR" + } + } + } + }, + { + "yPos": 108, + "width": 24, + "height": 12, + "widget": { + "title": "Unarchived Write Ahead Logs", + "xyChart": { + "chartOptions": { + "mode": "COLOR" + }, + "dataSets": [ + { + "breakdowns": [], + "dimensions": [], + "measures": [], + "plotType": "LINE", + "targetAxis": "Y1", + "timeSeriesQuery": { + "prometheusQuery": "hbase_RegionServer_Server_hlogFileCount{${Cluster},${Location},${Namespace}}", + "unitOverride": "" + } + } + ], + "thresholds": [], + "yAxis": { + "label": "", + "scale": "LINEAR" + } + } + } + }, + { + "yPos": 124, + "width": 16, + "height": 12, + "widget": { + "title": 
"Enqueued User Requests", + "xyChart": { + "chartOptions": { + "mode": "COLOR" + }, + "dataSets": [ + { + "breakdowns": [], + "dimensions": [], + "measures": [], + "plotType": "LINE", + "targetAxis": "Y1", + "timeSeriesQuery": { + "prometheusQuery": "hbase_RegionServer_IPC_numCallsInGeneralQueue{${Cluster},${Location},${Namespace}}", + "unitOverride": "" + } + } + ], + "thresholds": [], + "yAxis": { + "label": "", + "scale": "LINEAR" + } + } + } + }, + { + "xPos": 16, + "yPos": 124, + "width": 16, + "height": 12, + "widget": { + "title": "Enqueued Replication Requests", + "xyChart": { + "chartOptions": { + "mode": "COLOR" + }, + "dataSets": [ + { + "breakdowns": [], + "dimensions": [], + "measures": [], + "plotType": "LINE", + "targetAxis": "Y1", + "timeSeriesQuery": { + "prometheusQuery": "hbase_RegionServer_IPC_numCallsInReplicationQueue{${Cluster},${Location},${Namespace}}", + "unitOverride": "" + } + } + ], + "thresholds": [], + "yAxis": { + "label": "", + "scale": "LINEAR" + } + } + } + }, + { + "xPos": 32, + "yPos": 124, + "width": 16, + "height": 12, + "widget": { + "title": "Enqueued Priority Requests", + "xyChart": { + "chartOptions": { + "mode": "COLOR" + }, + "dataSets": [ + { + "breakdowns": [], + "dimensions": [], + "measures": [], + "plotType": "LINE", + "targetAxis": "Y1", + "timeSeriesQuery": { + "prometheusQuery": "hbase_RegionServer_IPC_numCallsInPriorityQueue{${Cluster},${Location},${Namespace}}", + "unitOverride": "" + } + } + ], + "thresholds": [], + "yAxis": { + "label": "", + "scale": "LINEAR" + } + } + } + }, + { + "yPos": 136, + "width": 16, + "height": 12, + "widget": { + "title": "Memstore Flush Queue Depth", + "xyChart": { + "chartOptions": { + "mode": "COLOR" + }, + "dataSets": [ + { + "breakdowns": [], + "dimensions": [], + "measures": [], + "plotType": "LINE", + "targetAxis": "Y1", + "timeSeriesQuery": { + "prometheusQuery": "hbase_RegionServer_Server_flushQueueLength{${Cluster},${Location},${Namespace}}", + "unitOverride": "" + } + } 
+ ], + "thresholds": [], + "yAxis": { + "label": "", + "scale": "LINEAR" + } + } + } + }, + { + "xPos": 16, + "yPos": 136, + "width": 16, + "height": 12, + "widget": { + "title": "Memstore Flushing Update Block Time (ms)", + "xyChart": { + "chartOptions": { + "mode": "COLOR" + }, + "dataSets": [ + { + "breakdowns": [], + "dimensions": [], + "measures": [], + "plotType": "LINE", + "targetAxis": "Y1", + "timeSeriesQuery": { + "prometheusQuery": "hbase_RegionServer_Server_updatesBlockedTime{${Cluster},${Location},${Namespace}}", + "unitOverride": "" + } + } + ], + "thresholds": [], + "yAxis": { + "label": "", + "scale": "LINEAR" + } + } + } + }, + { + "xPos": 32, + "yPos": 136, + "width": 16, + "height": 12, + "widget": { + "title": "Compaction Request Queue Depth", + "xyChart": { + "chartOptions": { + "mode": "COLOR" + }, + "dataSets": [ + { + "breakdowns": [], + "dimensions": [], + "measures": [], + "plotType": "LINE", + "targetAxis": "Y1", + "timeSeriesQuery": { + "prometheusQuery": "hbase_RegionServer_Server_compactionQueueLength{${Cluster},${Location},${Namespace}}", + "unitOverride": "" + } + } + ], + "thresholds": [], + "yAxis": { + "label": "", + "scale": "LINEAR" + } + } + } + }, + { + "yPos": 120, + "width": 48, "height": 4, + "widget": { + "title": "Queues", + "text": { + "content": "", + "format": "MARKDOWN", + "style": { + "backgroundColor": "#E7EFFE", + "fontSize": "FS_MEDIUM", + "horizontalAlignment": "H_LEFT", + "padding": "P_EXTRA_SMALL", + "pointerLocation": "POINTER_LOCATION_UNSPECIFIED", + "textColor": "#212121", + "verticalAlignment": "V_TOP" + } + } + } + }, + { + "yPos": 80, + "width": 16, + "height": 12, + "widget": { + "title": "Block Cache Hits", + "xyChart": { + "chartOptions": { + "mode": "COLOR" + }, + "dataSets": [ + { + "breakdowns": [], + "dimensions": [], + "measures": [], + "plotType": "LINE", + "targetAxis": "Y1", + "timeSeriesQuery": { + "prometheusQuery": 
"rate(hbase_RegionServer_Server_blockCacheHitCount{${Cluster},${Location},${Namespace}}[1m])", + "unitOverride": "" + } + } + ], + "thresholds": [], + "yAxis": { + "label": "", + "scale": "LINEAR" + } + } + } + }, + { + "xPos": 32, + "yPos": 80, + "width": 16, + "height": 12, + "widget": { + "title": "Cache-Enabled Request Hit Percent", + "xyChart": { + "chartOptions": { + "mode": "COLOR" + }, + "dataSets": [ + { + "breakdowns": [], + "dimensions": [], + "measures": [], + "plotType": "LINE", + "targetAxis": "Y1", + "timeSeriesQuery": { + "prometheusQuery": "hbase_RegionServer_Server_blockCacheExpressHitPercent{${Cluster},${Location},${Namespace}}", + "unitOverride": "" + } + } + ], + "thresholds": [], + "yAxis": { + "label": "", + "scale": "LINEAR" + } + } + } + }, + { + "xPos": 32, + "yPos": 96, + "width": 16, + "height": 12, + "widget": { + "title": "Local Store File Data %", + "xyChart": { + "chartOptions": { + "mode": "COLOR" + }, + "dataSets": [ + { + "breakdowns": [], + "dimensions": [], + "measures": [], + "plotType": "LINE", + "targetAxis": "Y1", + "timeSeriesQuery": { + "prometheusQuery": "hbase_RegionServer_Server_percentFilesLocal{${Cluster},${Location},${Namespace}}", + "unitOverride": "" + } + } + ], + "thresholds": [], + "yAxis": { + "label": "", + "scale": "LINEAR" + } + } + } + }, + { + "yPos": 92, + "width": 48, + "height": 4, + "widget": { + "title": "Data Storage", + "text": { + "content": "", + "format": "MARKDOWN", + "style": { + "backgroundColor": "#E7EFFE", + "fontSize": "FS_MEDIUM", + "horizontalAlignment": "H_LEFT", + "padding": "P_EXTRA_SMALL", + "pointerLocation": "POINTER_LOCATION_UNSPECIFIED", + "textColor": "#212121", + "verticalAlignment": "V_TOP" + } + } + } + }, + { + "yPos": 64, + "width": 48, + "height": 4, + "widget": { + "title": "Requests", + "text": { + "content": "", + "format": "MARKDOWN", + "style": { + "backgroundColor": "#E7EFFE", + "fontSize": "FS_MEDIUM", + "horizontalAlignment": "H_LEFT", + "padding": "P_EXTRA_SMALL", 
+ "pointerLocation": "POINTER_LOCATION_UNSPECIFIED", + "textColor": "#212121", + "verticalAlignment": "V_TOP" + } + } + } + }, + { + "xPos": 24, + "yPos": 40, + "width": 24, + "height": 12, + "widget": { + "title": "Authentication Attempts", + "xyChart": { + "chartOptions": { + "mode": "COLOR" + }, + "dataSets": [ + { + "breakdowns": [], + "dimensions": [], + "measures": [], + "plotType": "LINE", + "targetAxis": "Y1", + "timeSeriesQuery": { + "prometheusQuery": "hbase_RegionServer_IPC_authenticationSuccesses{${Cluster},${Location},${Namespace}}", + "unitOverride": "" + } + }, + { + "breakdowns": [], + "dimensions": [], + "measures": [], + "plotType": "LINE", + "targetAxis": "Y1", + "timeSeriesQuery": { + "prometheusQuery": "hbase_RegionServer_IPC_authenticationFailures{${Cluster},${Location},${Namespace}}", + "unitOverride": "" + } + } + ], + "thresholds": [], + "yAxis": { + "label": "", + "scale": "LINEAR" + } + } + } + }, + { + "xPos": 24, + "yPos": 108, + "width": 24, + "height": 12, + "widget": { + "title": "Skip Write Ahead Log Writes", + "xyChart": { + "chartOptions": { + "mode": "COLOR" + }, + "dataSets": [ + { + "breakdowns": [], + "dimensions": [], + "measures": [], + "plotType": "LINE", + "targetAxis": "Y1", + "timeSeriesQuery": { + "prometheusQuery": "hbase_RegionServer_Server_mutationsWithoutWALCount{${Cluster},${Location},${Namespace}}", + "unitOverride": "" + } + } + ], + "thresholds": [], + "yAxis": { + "label": "", + "scale": "LINEAR" + } + } + } + }, + { + "xPos": 32, + "yPos": 68, + "width": 16, + "height": 12, "widget": { "title": "Write Requests", "xyChart": { @@ -95,23 +908,121 @@ }, "dataSets": [ { + "breakdowns": [], + "dimensions": [], + "measures": [], "plotType": "LINE", "targetAxis": "Y1", "timeSeriesQuery": { - "prometheusQuery": "rate(hbase_RegionServer_Server_writeRequestCount[1m])" + "prometheusQuery": "rate(hbase_RegionServer_Server_writeRequestCount{${Cluster},${Location},${Namespace}}[1m])", + "unitOverride": "" } } ], - 
"timeshiftDuration": "0s", + "thresholds": [], "yAxis": { + "label": "", "scale": "LINEAR" } } - }, - "width": 6, - "xPos": 6, - "yPos": 4 + } + }, + { + "xPos": 16, + "yPos": 80, + "width": 16, + "height": 12, + "widget": { + "title": "Block Cache Misses", + "xyChart": { + "chartOptions": { + "mode": "COLOR" + }, + "dataSets": [ + { + "breakdowns": [], + "dimensions": [], + "measures": [], + "plotType": "LINE", + "targetAxis": "Y1", + "timeSeriesQuery": { + "prometheusQuery": "rate(hbase_RegionServer_Server_blockCacheMissCount{${Cluster},${Location},${Namespace}}[1m])", + "unitOverride": "" + } + } + ], + "thresholds": [], + "yAxis": { + "label": "", + "scale": "LINEAR" + } + } + } + }, + { + "width": 12, + "height": 8, + "widget": { + "title": "Region Servers", + "scorecard": { + "blankView": {}, + "thresholds": [], + "timeSeriesQuery": { + "prometheusQuery": "sum(hbase_Master_Server_numRegionServers{${Cluster},${Location},${Namespace}})", + "unitOverride": "" + } + } + } + }, + { + "xPos": 12, + "width": 12, + "height": 8, + "widget": { + "title": "Dead Region Servers", + "scorecard": { + "blankView": {}, + "thresholds": [], + "timeSeriesQuery": { + "prometheusQuery": "sum(hbase_Master_Server_numDeadRegionServers{${Cluster},${Location},${Namespace}})", + "unitOverride": "" + } + } + } + }, + { + "xPos": 24, + "width": 12, + "height": 8, + "widget": { + "title": "Total Region Count", + "scorecard": { + "blankView": {}, + "thresholds": [], + "timeSeriesQuery": { + "prometheusQuery": "sum(hbase_RegionServer_Server_regionCount{${Cluster},${Location},${Namespace}})", + "unitOverride": "" + } + } + } + }, + { + "xPos": 36, + "width": 12, + "height": 8, + "widget": { + "title": "Total Requests", + "scorecard": { + "blankView": {}, + "thresholds": [], + "timeSeriesQuery": { + "prometheusQuery": "sum(hbase_RegionServer_Server_totalRequestCount{${Cluster},${Location},${Namespace}})", + "unitOverride": "" + } + } + } } ] - } + }, + "labels": {} } diff --git 
a/dashboards/hbase/hbase-prometheus-overview.png b/dashboards/hbase/hbase-prometheus-overview.png deleted file mode 100644 index a393fbba1b..0000000000 Binary files a/dashboards/hbase/hbase-prometheus-overview.png and /dev/null differ diff --git a/dashboards/hbase/hbase-region-server-operations-prometheus-overview.01.png b/dashboards/hbase/hbase-region-server-operations-prometheus-overview.01.png new file mode 100644 index 0000000000..1f5a7a32f4 Binary files /dev/null and b/dashboards/hbase/hbase-region-server-operations-prometheus-overview.01.png differ diff --git a/dashboards/hbase/hbase-region-server-operations-prometheus-overview.02.png b/dashboards/hbase/hbase-region-server-operations-prometheus-overview.02.png new file mode 100644 index 0000000000..d7fe766566 Binary files /dev/null and b/dashboards/hbase/hbase-region-server-operations-prometheus-overview.02.png differ diff --git a/dashboards/hbase/hbase-region-server-operations-prometheus-overview.03.png b/dashboards/hbase/hbase-region-server-operations-prometheus-overview.03.png new file mode 100644 index 0000000000..3696f0646a Binary files /dev/null and b/dashboards/hbase/hbase-region-server-operations-prometheus-overview.03.png differ diff --git a/dashboards/hbase/hbase-region-server-operations-prometheus-overview.json b/dashboards/hbase/hbase-region-server-operations-prometheus-overview.json new file mode 100644 index 0000000000..fdaffa876b --- /dev/null +++ b/dashboards/hbase/hbase-region-server-operations-prometheus-overview.json @@ -0,0 +1,857 @@ +{ + "category": "CUSTOM", + "displayName": "Hbase Region Server Operations Prometheus Overview", + "dashboardFilters": [ + { + "filterType": "RESOURCE_LABEL", + "labelKey": "cluster", + "templateVariable": "Cluster" + }, + { + "filterType": "RESOURCE_LABEL", + "labelKey": "location", + "templateVariable": "Location" + }, + { + "filterType": "RESOURCE_LABEL", + "labelKey": "namespace", + "templateVariable": "Namespace" + } + ], + "mosaicLayout": { + "columns": 
48, + "tiles": [ + { + "xPos": 24, + "yPos": 4, + "width": 24, + "height": 16, + "widget": { + "title": "Slow Get Operations", + "xyChart": { + "chartOptions": { + "mode": "COLOR" + }, + "dataSets": [ + { + "plotType": "LINE", + "targetAxis": "Y1", + "timeSeriesQuery": { + "prometheusQuery": "rate(hbase_RegionServer_Server_slowGetCount{${Cluster},${Location},${Namespace}}[5m])", + "unitOverride": "" + } + } + ], + "thresholds": [], + "timeshiftDuration": "0s", + "yAxis": { + "label": "", + "scale": "LINEAR" + } + } + } + }, + { + "xPos": 24, + "yPos": 20, + "width": 24, + "height": 16, + "widget": { + "title": "Get Operations Time 99th Percentile (ms)", + "xyChart": { + "chartOptions": { + "mode": "COLOR" + }, + "dataSets": [ + { + "breakdowns": [], + "dimensions": [], + "measures": [], + "plotType": "LINE", + "targetAxis": "Y1", + "timeSeriesQuery": { + "prometheusQuery": "hbase_RegionServer_Server_Get_99th_percentile{${Cluster},${Location},${Namespace}}", + "unitOverride": "" + } + } + ], + "thresholds": [], + "timeshiftDuration": "0s", + "yAxis": { + "label": "", + "scale": "LINEAR" + } + } + } + }, + { + "xPos": 24, + "yPos": 40, + "width": 24, + "height": 16, + "widget": { + "title": "Slow Append Operations", + "xyChart": { + "chartOptions": { + "mode": "COLOR" + }, + "dataSets": [ + { + "breakdowns": [], + "dimensions": [], + "measures": [], + "plotType": "LINE", + "targetAxis": "Y1", + "timeSeriesQuery": { + "prometheusQuery": "rate(hbase_RegionServer_Server_slowAppendCount{${Cluster},${Location},${Namespace}}[5m])", + "unitOverride": "" + } + } + ], + "thresholds": [], + "timeshiftDuration": "0s", + "yAxis": { + "label": "", + "scale": "LINEAR" + } + } + } + }, + { + "xPos": 24, + "yPos": 56, + "width": 24, + "height": 16, + "widget": { + "title": "Append Operations Time 99th Percentile (ms)", + "xyChart": { + "chartOptions": { + "mode": "COLOR" + }, + "dataSets": [ + { + "breakdowns": [], + "dimensions": [], + "measures": [], + "plotType": "LINE", + 
"targetAxis": "Y1", + "timeSeriesQuery": { + "prometheusQuery": "hbase_RegionServer_Server_Append_99th_percentile{${Cluster},${Location},${Namespace}}", + "unitOverride": "" + } + } + ], + "thresholds": [], + "timeshiftDuration": "0s", + "yAxis": { + "label": "", + "scale": "LINEAR" + } + } + } + }, + { + "xPos": 24, + "yPos": 76, + "width": 24, + "height": 16, + "widget": { + "title": "Slow Delete Operations", + "xyChart": { + "chartOptions": { + "mode": "COLOR" + }, + "dataSets": [ + { + "breakdowns": [], + "dimensions": [], + "measures": [], + "plotType": "LINE", + "targetAxis": "Y1", + "timeSeriesQuery": { + "prometheusQuery": "rate(hbase_RegionServer_Server_slowDeleteCount{${Cluster},${Location},${Namespace}}[5m])", + "unitOverride": "" + } + } + ], + "thresholds": [], + "timeshiftDuration": "0s", + "yAxis": { + "label": "", + "scale": "LINEAR" + } + } + } + }, + { + "xPos": 24, + "yPos": 92, + "width": 24, + "height": 16, + "widget": { + "title": "Delete Operations Time 99th Percentile (ms)", + "xyChart": { + "chartOptions": { + "mode": "COLOR" + }, + "dataSets": [ + { + "breakdowns": [], + "dimensions": [], + "measures": [], + "plotType": "LINE", + "targetAxis": "Y1", + "timeSeriesQuery": { + "prometheusQuery": "hbase_RegionServer_Server_Delete_99th_percentile{${Cluster},${Location},${Namespace}}", + "unitOverride": "" + } + } + ], + "thresholds": [], + "timeshiftDuration": "0s", + "yAxis": { + "label": "", + "scale": "LINEAR" + } + } + } + }, + { + "xPos": 24, + "yPos": 112, + "width": 24, + "height": 16, + "widget": { + "title": "Slow Put Operations", + "xyChart": { + "chartOptions": { + "mode": "COLOR" + }, + "dataSets": [ + { + "breakdowns": [], + "dimensions": [], + "measures": [], + "plotType": "LINE", + "targetAxis": "Y1", + "timeSeriesQuery": { + "prometheusQuery": "rate(hbase_RegionServer_Server_slowPutCount{${Cluster},${Location},${Namespace}}[5m])", + "unitOverride": "" + } + } + ], + "thresholds": [], + "timeshiftDuration": "0s", + "yAxis": { + 
"label": "", + "scale": "LINEAR" + } + } + } + }, + { + "xPos": 24, + "yPos": 128, + "width": 24, + "height": 16, + "widget": { + "title": "Put Operations Time 99th Percentile (ms)", + "xyChart": { + "chartOptions": { + "mode": "COLOR" + }, + "dataSets": [ + { + "breakdowns": [], + "dimensions": [], + "measures": [], + "plotType": "LINE", + "targetAxis": "Y1", + "timeSeriesQuery": { + "prometheusQuery": "hbase_RegionServer_Server_Put_99th_percentile{${Cluster},${Location},${Namespace}}", + "unitOverride": "" + } + } + ], + "thresholds": [], + "timeshiftDuration": "0s", + "yAxis": { + "label": "", + "scale": "LINEAR" + } + } + } + }, + { + "xPos": 24, + "yPos": 148, + "width": 24, + "height": 16, + "widget": { + "title": "Slow Increment Operations", + "xyChart": { + "chartOptions": { + "mode": "COLOR" + }, + "dataSets": [ + { + "breakdowns": [], + "dimensions": [], + "measures": [], + "plotType": "LINE", + "targetAxis": "Y1", + "timeSeriesQuery": { + "prometheusQuery": "rate(hbase_RegionServer_Server_slowIncrementCount{${Cluster},${Location},${Namespace}}[5m])", + "unitOverride": "" + } + } + ], + "thresholds": [], + "timeshiftDuration": "0s", + "yAxis": { + "label": "", + "scale": "LINEAR" + } + } + } + }, + { + "xPos": 24, + "yPos": 164, + "width": 24, + "height": 16, + "widget": { + "title": "Increment Operations Time 99th Percentile (ms)", + "xyChart": { + "chartOptions": { + "mode": "COLOR" + }, + "dataSets": [ + { + "plotType": "LINE", + "targetAxis": "Y1", + "timeSeriesQuery": { + "prometheusQuery": "hbase_RegionServer_Server_Increment_99th_percentile{${Cluster},${Location},${Namespace}}", + "unitOverride": "" + } + } + ], + "thresholds": [], + "timeshiftDuration": "0s", + "yAxis": { + "label": "", + "scale": "LINEAR" + } + } + } + }, + { + "xPos": 24, + "yPos": 200, + "width": 24, + "height": 16, + "widget": { + "title": "Replay Operations Time 99th Percentile (ms)", + "xyChart": { + "chartOptions": { + "mode": "COLOR" + }, + "dataSets": [ + { + "plotType": 
"LINE", + "targetAxis": "Y1", + "timeSeriesQuery": { + "prometheusQuery": "hbase_RegionServer_Server_Replay_99th_percentile{${Cluster},${Location},${Namespace}}", + "unitOverride": "" + } + } + ], + "thresholds": [], + "timeshiftDuration": "0s", + "yAxis": { + "label": "", + "scale": "LINEAR" + } + } + } + }, + { + "width": 48, + "height": 4, + "widget": { + "title": "Get", + "text": { + "content": "", + "format": "MARKDOWN", + "style": { + "backgroundColor": "#E7EFFE", + "fontSize": "FS_LARGE", + "horizontalAlignment": "H_LEFT", + "padding": "P_EXTRA_SMALL", + "pointerLocation": "POINTER_LOCATION_UNSPECIFIED", + "textColor": "#212121", + "verticalAlignment": "V_TOP" + } + } + } + }, + { + "yPos": 36, + "width": 48, + "height": 4, + "widget": { + "title": "Append", + "text": { + "content": "", + "format": "MARKDOWN", + "style": { + "backgroundColor": "#E7EFFE", + "fontSize": "FS_LARGE", + "horizontalAlignment": "H_LEFT", + "padding": "P_EXTRA_SMALL", + "pointerLocation": "POINTER_LOCATION_UNSPECIFIED", + "textColor": "#212121", + "verticalAlignment": "V_TOP" + } + } + } + }, + { + "yPos": 72, + "width": 48, + "height": 4, + "widget": { + "title": "Delete", + "text": { + "content": "", + "format": "MARKDOWN", + "style": { + "backgroundColor": "#E7EFFE", + "fontSize": "FS_LARGE", + "horizontalAlignment": "H_LEFT", + "padding": "P_EXTRA_SMALL", + "pointerLocation": "POINTER_LOCATION_UNSPECIFIED", + "textColor": "#212121", + "verticalAlignment": "V_TOP" + } + } + } + }, + { + "yPos": 108, + "width": 48, + "height": 4, + "widget": { + "title": "Put", + "text": { + "content": "", + "format": "MARKDOWN", + "style": { + "backgroundColor": "#E7EFFE", + "fontSize": "FS_LARGE", + "horizontalAlignment": "H_LEFT", + "padding": "P_EXTRA_SMALL", + "pointerLocation": "POINTER_LOCATION_UNSPECIFIED", + "textColor": "#212121", + "verticalAlignment": "V_TOP" + } + } + } + }, + { + "yPos": 144, + "width": 48, + "height": 4, + "widget": { + "title": "Increment", + "text": { + "content": 
"", + "format": "MARKDOWN", + "style": { + "backgroundColor": "#E7EFFE", + "fontSize": "FS_LARGE", + "horizontalAlignment": "H_LEFT", + "padding": "P_EXTRA_SMALL", + "pointerLocation": "POINTER_LOCATION_UNSPECIFIED", + "textColor": "#212121", + "verticalAlignment": "V_TOP" + } + } + } + }, + { + "yPos": 180, + "width": 48, + "height": 4, + "widget": { + "title": "Replay", + "text": { + "content": "", + "format": "MARKDOWN", + "style": { + "backgroundColor": "#E7EFFE", + "fontSize": "FS_LARGE", + "horizontalAlignment": "H_LEFT", + "padding": "P_EXTRA_SMALL", + "pointerLocation": "POINTER_LOCATION_UNSPECIFIED", + "textColor": "#212121", + "verticalAlignment": "V_TOP" + } + } + } + }, + { + "yPos": 4, + "width": 24, + "height": 16, + "widget": { + "title": "Get Operations", + "xyChart": { + "chartOptions": { + "mode": "COLOR" + }, + "dataSets": [ + { + "plotType": "LINE", + "targetAxis": "Y1", + "timeSeriesQuery": { + "prometheusQuery": "rate(hbase_RegionServer_Server_Get_num_ops{${Cluster},${Location},${Namespace}}[5m])", + "unitOverride": "" + } + } + ], + "thresholds": [], + "timeshiftDuration": "0s", + "yAxis": { + "label": "", + "scale": "LINEAR" + } + } + } + }, + { + "yPos": 20, + "width": 24, + "height": 16, + "widget": { + "title": "Get Operations Average Time (ms)", + "xyChart": { + "chartOptions": { + "mode": "COLOR" + }, + "dataSets": [ + { + "plotType": "LINE", + "targetAxis": "Y1", + "timeSeriesQuery": { + "prometheusQuery": "hbase_RegionServer_Server_Get_mean{${Cluster},${Location},${Namespace}}", + "unitOverride": "" + } + } + ], + "thresholds": [], + "timeshiftDuration": "0s", + "yAxis": { + "label": "", + "scale": "LINEAR" + } + } + } + }, + { + "yPos": 40, + "width": 24, + "height": 16, + "widget": { + "title": "Append Operations", + "xyChart": { + "chartOptions": { + "mode": "COLOR" + }, + "dataSets": [ + { + "plotType": "LINE", + "targetAxis": "Y1", + "timeSeriesQuery": { + "prometheusQuery": 
"rate(hbase_RegionServer_Server_Append_num_ops{${Cluster},${Location},${Namespace}}[5m])", + "unitOverride": "" + } + } + ], + "thresholds": [], + "timeshiftDuration": "0s", + "yAxis": { + "label": "", + "scale": "LINEAR" + } + } + } + }, + { + "yPos": 56, + "width": 24, + "height": 16, + "widget": { + "title": "Append Operations Average Time (ms)", + "xyChart": { + "chartOptions": { + "mode": "COLOR" + }, + "dataSets": [ + { + "plotType": "LINE", + "targetAxis": "Y1", + "timeSeriesQuery": { + "prometheusQuery": "hbase_RegionServer_Server_Append_mean{${Cluster},${Location},${Namespace}}", + "unitOverride": "" + } + } + ], + "thresholds": [], + "timeshiftDuration": "0s", + "yAxis": { + "label": "", + "scale": "LINEAR" + } + } + } + }, + { + "yPos": 76, + "width": 24, + "height": 16, + "widget": { + "title": "Delete Operations", + "xyChart": { + "chartOptions": { + "mode": "COLOR" + }, + "dataSets": [ + { + "plotType": "LINE", + "targetAxis": "Y1", + "timeSeriesQuery": { + "prometheusQuery": "rate(hbase_RegionServer_Server_Delete_num_ops{${Cluster},${Location},${Namespace}}[5m])", + "unitOverride": "" + } + } + ], + "thresholds": [], + "timeshiftDuration": "0s", + "yAxis": { + "label": "", + "scale": "LINEAR" + } + } + } + }, + { + "yPos": 92, + "width": 24, + "height": 16, + "widget": { + "title": "Delete Operations Average Time (ms)", + "xyChart": { + "chartOptions": { + "mode": "COLOR" + }, + "dataSets": [ + { + "breakdowns": [], + "dimensions": [], + "measures": [], + "plotType": "LINE", + "targetAxis": "Y1", + "timeSeriesQuery": { + "prometheusQuery": "hbase_RegionServer_Server_Delete_mean{${Cluster},${Location},${Namespace}}", + "unitOverride": "" + } + } + ], + "thresholds": [], + "timeshiftDuration": "0s", + "yAxis": { + "label": "", + "scale": "LINEAR" + } + } + } + }, + { + "yPos": 112, + "width": 24, + "height": 16, + "widget": { + "title": "Put Operations", + "xyChart": { + "chartOptions": { + "mode": "COLOR" + }, + "dataSets": [ + { +
"plotType": "LINE", + "targetAxis": "Y1", + "timeSeriesQuery": { + "prometheusQuery": "rate(hbase_RegionServer_Server_Put_num_ops{${Cluster},${Location},${Namespace}}[5m])", + "unitOverride": "" + } + } + ], + "thresholds": [], + "timeshiftDuration": "0s", + "yAxis": { + "label": "", + "scale": "LINEAR" + } + } + } + }, + { + "yPos": 128, + "width": 24, + "height": 16, + "widget": { + "title": "Put Operations Average Time (ms)", + "xyChart": { + "chartOptions": { + "mode": "COLOR" + }, + "dataSets": [ + { + "plotType": "LINE", + "targetAxis": "Y1", + "timeSeriesQuery": { + "prometheusQuery": "hbase_RegionServer_Server_Put_mean{${Cluster},${Location},${Namespace}}", + "unitOverride": "" + } + } + ], + "thresholds": [], + "timeshiftDuration": "0s", + "yAxis": { + "label": "", + "scale": "LINEAR" + } + } + } + }, + { + "yPos": 148, + "width": 24, + "height": 16, + "widget": { + "title": "Increment Operations", + "xyChart": { + "chartOptions": { + "mode": "COLOR" + }, + "dataSets": [ + { + "plotType": "LINE", + "targetAxis": "Y1", + "timeSeriesQuery": { + "prometheusQuery": "rate(hbase_RegionServer_Server_Increment_num_ops{${Cluster},${Location},${Namespace}}[5m])", + "unitOverride": "" + } + } + ], + "thresholds": [], + "timeshiftDuration": "0s", + "yAxis": { + "label": "", + "scale": "LINEAR" + } + } + } + }, + { + "yPos": 164, + "width": 24, + "height": 16, + "widget": { + "title": "Increment Operations Average Time (ms)", + "xyChart": { + "chartOptions": { + "mode": "COLOR" + }, + "dataSets": [ + { + "plotType": "LINE", + "targetAxis": "Y1", + "timeSeriesQuery": { + "prometheusQuery": "hbase_RegionServer_Server_Increment_mean{${Cluster},${Location},${Namespace}}", + "unitOverride": "" + } + } + ], + "thresholds": [], + "timeshiftDuration": "0s", + "yAxis": { + "label": "", + "scale": "LINEAR" + } + } + } + }, + { + "yPos": 200, + "width": 24, + "height": 16, + "widget": { + "title": "Replay Operations Average Time (ms)", + "xyChart": { + "chartOptions": { + "mode": 
"COLOR" + }, + "dataSets": [ + { + "plotType": "LINE", + "targetAxis": "Y1", + "timeSeriesQuery": { + "prometheusQuery": "hbase_RegionServer_Server_Replay_mean{${Cluster},${Location},${Namespace}}", + "unitOverride": "" + } + } + ], + "thresholds": [], + "timeshiftDuration": "0s", + "yAxis": { + "label": "", + "scale": "LINEAR" + } + } + } + }, + { + "yPos": 184, + "width": 48, + "height": 16, + "widget": { + "title": "Replay Operations", + "xyChart": { + "chartOptions": { + "mode": "COLOR" + }, + "dataSets": [ + { + "plotType": "LINE", + "targetAxis": "Y1", + "timeSeriesQuery": { + "prometheusQuery": "rate(hbase_RegionServer_Server_Replay_num_ops{${Cluster},${Location},${Namespace}}[5m])", + "unitOverride": "" + } + } + ], + "thresholds": [], + "timeshiftDuration": "0s", + "yAxis": { + "label": "", + "scale": "LINEAR" + } + } + } + } + ] + }, + "labels": {} +} diff --git a/dashboards/hbase/metadata.yaml b/dashboards/hbase/metadata.yaml index 0207af55c4..d7ad18a159 100644 --- a/dashboards/hbase/metadata.yaml +++ b/dashboards/hbase/metadata.yaml @@ -19,15 +19,15 @@ sample_dashboards: category: HBase id: hbase-prometheus-overview display_name: Hbase Prometheus Overview - description: "This dashboard is based on metrics exposed by an [exporter](https://github.com/prometheus/jmx_exporter), including Region Servers, Dead Region Servers, Read Requests, and Write Requests." 
+ description: "This dashboard is based on metrics exposed by an [exporter](https://github.com/prometheus/jmx_exporter), including Region Servers, Dead Region Servers, Regions in transitions, Regions in transition longer than threshold, Oldest Region in transition age, Region Count, Authentication Attempts, Open Connections, Active RPC Handlers, Total Requests, Read Requests, Write Requests, Block Cache Hits, Block Cache Misses, Cache-Enabled Request Hit Percent, Store File Count, Store File Size (bytes), Local Store File Data %, Unarchived Write Ahead Logs, Skip Write Ahead Log Writes, Enqueued User Requests, Enqueued Replication Requests, Enqueued Priority Requests, Memstore Flush Queue Depth, Memstore Flushing Update Block Time (milliseconds), and Compaction Request Queue Depth." related_integrations: - id: hbase platform: GKE - category: HBase - id: hbase-operations-prometheus-overview - display_name: Hbase Operations Prometheus Overview - description: "This dashboard is based on metrics exposed by an [exporter](https://github.com/prometheus/jmx_exporter), including Slow Get Operations, Slow Append Operations, Slow Delete Operations, Slow Put Operations, Slow Increment Operations, Get Operations 99th Percentile, Append Operations 99th Percentile, Delete Operations 99th Percentile, Put Operations 99th Percentile, Increment Operations 99th Percentile, and Replay Operations 99th Percentile." 
+ id: hbase-region-server-operations-prometheus-overview + display_name: Hbase Region Server Operations Prometheus Overview + description: "This dashboard is based on metrics exposed by an [exporter](https://github.com/prometheus/jmx_exporter), including Get Operations, Slow Get Operations, Get Operations Average Time, Get Operations Time 99th Percentile, Append Operations, Slow Append Operations, Append Operations Average Time, Append Operations Time 99th Percentile, Delete Operations, Slow Delete Operations, Delete Operations Average Time, Delete Operations Time 99th Percentile, Put Operations, Slow Put Operations, Put Operations Average Time, Put Operations Time 99th Percentile, Increment Operations, Slow Increment Operations, Increment Operations Average Time, Increment Operations Time 99th Percentile, Replay Operations, Replay Operations Average Time, and Replay Operations Time 99th Percentile." related_integrations: - id: hbase platform: GKE diff --git a/dashboards/jenkins/README.md b/dashboards/jenkins/README.md index 6d9c39af4d..be0d0c0a3c 100644 --- a/dashboards/jenkins/README.md +++ b/dashboards/jenkins/README.md @@ -7,4 +7,4 @@ |Jenkins Prometheus| |:------------------| |Filename: [jenkins-prometheus.json](jenkins-prometheus.json)| -|This dashboard has charts displaying: `Health Check`, `Executor In Use`, `Nodes Offline`, `Executor Free`, `Plugins Failed`, `JVM Memory Free %`, `Job Build Speed`, `Queue Size`, `Job Queued Rate`, `Resource Usage`, and `Job Queue Duration`| +|This dashboard has charts displaying: `Health Check`, `CPU Load`, `Executor In Use`, `Nodes Offline`, `Executor Free`, `Plugins Failed`, `JVM Memory Free %`, `Time Spent Building`, `Time Spent In Buildable State`, `Time Spent In Waiting State`, `Time Spent In Blocked State`, `Build Queue Size`, `Build Queue Buildable Jobs`, `Build Queue Pending Jobs`, and `Build Queue Blocked Jobs` | diff --git a/dashboards/jenkins/jenkins-prometheus.json b/dashboards/jenkins/jenkins-prometheus.json 
index 9ad6190280..71e2f3632b 100644 --- a/dashboards/jenkins/jenkins-prometheus.json +++ b/dashboards/jenkins/jenkins-prometheus.json @@ -1,5 +1,5 @@ { - "category": "CUSTOM", + "displayName": "Jenkins Prometheus Overview", "dashboardFilters": [ { "filterType": "RESOURCE_LABEL", @@ -17,40 +17,45 @@ "templateVariable": "Namespace" } ], - "displayName": "Jenkins Prometheus Overview", - "labels": {}, "mosaicLayout": { - "columns": 12, + "columns": 48, "tiles": [ { - "height": 3, + "yPos": 20, + "width": 24, + "height": 8, "widget": { - "title": "Job Build Speed", + "title": "Time Spent Building", "xyChart": { "chartOptions": { "mode": "COLOR" }, "dataSets": [ { + "breakdowns": [], + "dimensions": [], + "measures": [], "plotType": "LINE", "targetAxis": "Y1", "timeSeriesQuery": { - "prometheusQuery": "jenkins_job_building_duration{${Cluster},${Location},${Namespace}}" + "prometheusQuery": "jenkins_job_building_duration{${Cluster},${Location},${Namespace}}", + "unitOverride": "" } } ], + "thresholds": [], "timeshiftDuration": "0s", "yAxis": { + "label": "", "scale": "LINEAR" } } - }, - "width": 6, - "xPos": 0, - "yPos": 4 + } }, { - "height": 4, + "xPos": 8, + "width": 8, + "height": 8, "widget": { "title": "JVM Memory Free %", "xyChart": { @@ -59,219 +64,425 @@ }, "dataSets": [ { + "breakdowns": [], + "dimensions": [], + "measures": [], "plotType": "LINE", "targetAxis": "Y1", "timeSeriesQuery": { - "prometheusQuery": "(vm_memory_total_max{job=\"jenkins-metrics\",${Cluster},${Location},${Namespace}} - vm_memory_total_used{job=\"jenkins-metrics\",${Cluster},${Location},${Namespace}}) / vm_memory_total_max{job=\"jenkins-metrics\",${Cluster},${Location},${Namespace}} * 100.0" + "prometheusQuery": "(vm_memory_total_max{job=~\".*jenkins.*\",${Cluster},${Location},${Namespace}} - vm_memory_total_used{job=~\".*jenkins.*\",${Cluster},${Location},${Namespace}}) / vm_memory_total_max{job=~\".*jenkins.*\",${Cluster},${Location},${Namespace}} * 100.0", + "unitOverride": "" } } ], + 
"thresholds": [], "timeshiftDuration": "0s", "yAxis": { + "label": "", "scale": "LINEAR" } } - }, - "width": 4, - "xPos": 8, - "yPos": 0 + } }, { - "height": 2, + "xPos": 40, + "width": 8, + "height": 8, "widget": { + "title": "Executor Free", "scorecard": { "blankView": {}, + "thresholds": [], "timeSeriesQuery": { - "prometheusQuery": "sum(jenkins_executor_free_value{${Cluster},${Location},${Namespace}})" + "prometheusQuery": "sum(jenkins_executor_free_value{${Cluster},${Location},${Namespace}})", + "unitOverride": "" } - }, - "title": "Executor Free" - }, - "width": 3, - "xPos": 3, - "yPos": 2 + } + } }, { - "height": 3, + "xPos": 24, + "yPos": 20, + "width": 24, + "height": 8, "widget": { - "title": "Queue Size", + "title": "Size", "xyChart": { "chartOptions": { "mode": "COLOR" }, "dataSets": [ { + "breakdowns": [], + "dimensions": [], + "measures": [], "plotType": "LINE", "targetAxis": "Y1", "timeSeriesQuery": { - "prometheusQuery": "jenkins_queue_size_value{${Cluster},${Location},${Namespace}}" + "prometheusQuery": "jenkins_queue_size_value{${Cluster},${Location},${Namespace}}", + "unitOverride": "" } } ], + "thresholds": [], "timeshiftDuration": "0s", "yAxis": { + "label": "", "scale": "LINEAR" } } - }, - "width": 6, - "xPos": 6, - "yPos": 4 + } + }, + { + "xPos": 32, + "width": 8, + "height": 8, + "widget": { + "title": "Executor In Use", + "scorecard": { + "blankView": {}, + "thresholds": [], + "timeSeriesQuery": { + "prometheusQuery": "sum(jenkins_executor_in_use_value{${Cluster},${Location},${Namespace}})", + "unitOverride": "" + } + } + } }, { - "height": 3, + "width": 8, + "height": 8, "widget": { - "title": "Job Queued Rate", + "title": "Health Check", "xyChart": { "chartOptions": { "mode": "COLOR" }, "dataSets": [ { + "breakdowns": [], + "dimensions": [], + "measures": [], "plotType": "LINE", "targetAxis": "Y1", "timeSeriesQuery": { - "prometheusQuery": "rate(jenkins_job_queuing_duration_count{${Cluster},${Location},${Namespace}}[1m])" + 
"prometheusQuery": "jenkins_health_check_score{${Cluster},${Location},${Namespace}}", + "unitOverride": "" } } ], + "thresholds": [], "timeshiftDuration": "0s", "yAxis": { + "label": "", "scale": "LINEAR" } } - }, - "width": 6, - "xPos": 0, - "yPos": 7 + } }, { - "height": 2, + "xPos": 16, + "width": 8, + "height": 8, "widget": { + "title": "Plugins Failed", "scorecard": { "blankView": {}, + "thresholds": [], "timeSeriesQuery": { - "prometheusQuery": "sum(jenkins_executor_in_use_value{${Cluster},${Location},${Namespace}})" + "prometheusQuery": "sum(jenkins_plugins_failed{${Cluster},${Location},${Namespace}})", + "unitOverride": "" } - }, - "title": "Executor In Use" - }, - "width": 3, - "xPos": 3, - "yPos": 0 + } + } }, { - "height": 2, + "xPos": 24, + "width": 8, + "height": 8, "widget": { - "title": "Health Check", + "title": "Nodes Offline", + "scorecard": { + "blankView": {}, + "thresholds": [], + "timeSeriesQuery": { + "prometheusQuery": "sum(jenkins_node_offline_value{${Cluster},${Location},${Namespace}})", + "unitOverride": "" + } + } + } + }, + { + "yPos": 28, + "width": 24, + "height": 8, + "widget": { + "title": "Time Spent In Buildable State", "xyChart": { "chartOptions": { "mode": "COLOR" }, "dataSets": [ { + "breakdowns": [], + "dimensions": [], + "measures": [], "plotType": "LINE", "targetAxis": "Y1", "timeSeriesQuery": { - "prometheusQuery": "jenkins_health_check_score" + "prometheusQuery": "jenkins_job_buildable_duration{${Cluster},${Location},${Namespace}}", + "unitOverride": "" } } ], "thresholds": [], "timeshiftDuration": "0s", "yAxis": { + "label": "", "scale": "LINEAR" } } - }, - "width": 3, - "xPos": 0, - "yPos": 0 + } }, { + "yPos": 16, + "width": 24, "height": 4, "widget": { - "scorecard": { - "blankView": {}, - "timeSeriesQuery": { - "prometheusQuery": "sum(jenkins_plugins_failed{${Cluster},${Location},${Namespace}})" + "title": "Jobs", + "text": { + "content": "", + "format": "MARKDOWN", + "style": { + "backgroundColor": "#C8E6C9", + 
"fontSize": "FS_LARGE", + "horizontalAlignment": "H_LEFT", + "padding": "P_EXTRA_SMALL", + "pointerLocation": "POINTER_LOCATION_UNSPECIFIED", + "textColor": "#212121", + "verticalAlignment": "V_TOP" } - }, - "title": "Plugins Failed" - }, - "width": 2, - "xPos": 6, - "yPos": 0 + } + } }, { - "height": 2, + "xPos": 24, + "yPos": 16, + "width": 24, + "height": 4, "widget": { - "scorecard": { - "blankView": {}, - "timeSeriesQuery": { - "prometheusQuery": "sum(jenkins_node_offline_value{${Cluster},${Location},${Namespace}})" + "title": "Build Queue", + "text": { + "content": "", + "format": "MARKDOWN", + "style": { + "backgroundColor": "#C6DAFC", + "fontSize": "FS_LARGE", + "horizontalAlignment": "H_LEFT", + "padding": "P_EXTRA_SMALL", + "pointerLocation": "POINTER_LOCATION_UNSPECIFIED", + "textColor": "#212121", + "verticalAlignment": "V_TOP" + } + } + } + }, + { + "xPos": 24, + "yPos": 28, + "width": 24, + "height": 8, + "widget": { + "title": "Buildable", + "xyChart": { + "chartOptions": { + "mode": "COLOR" + }, + "dataSets": [ + { + "breakdowns": [], + "dimensions": [], + "measures": [], + "plotType": "LINE", + "targetAxis": "Y1", + "timeSeriesQuery": { + "prometheusQuery": "jenkins_queue_buildable_value{${Cluster},${Location},${Namespace}}", + "unitOverride": "" + } + } + ], + "thresholds": [], + "timeshiftDuration": "0s", + "yAxis": { + "label": "", + "scale": "LINEAR" + } + } + } + }, + { + "xPos": 24, + "yPos": 36, + "width": 24, + "height": 8, + "widget": { + "title": "Pending", + "xyChart": { + "chartOptions": { + "mode": "COLOR" + }, + "dataSets": [ + { + "breakdowns": [], + "dimensions": [], + "measures": [], + "plotType": "LINE", + "targetAxis": "Y1", + "timeSeriesQuery": { + "prometheusQuery": "jenkins_queue_pending_value{${Cluster},${Location},${Namespace}}", + "unitOverride": "" + } + } + ], + "thresholds": [], + "timeshiftDuration": "0s", + "yAxis": { + "label": "", + "scale": "LINEAR" + } + } + } + }, + { + "xPos": 24, + "yPos": 44, + "width": 24, + 
"height": 8, + "widget": { + "title": "Blocked", + "xyChart": { + "chartOptions": { + "mode": "COLOR" + }, + "dataSets": [ + { + "breakdowns": [], + "dimensions": [], + "measures": [], + "plotType": "LINE", + "targetAxis": "Y1", + "timeSeriesQuery": { + "prometheusQuery": "jenkins_queue_blocked_value{${Cluster},${Location},${Namespace}}", + "unitOverride": "" + } + } + ], + "thresholds": [], + "timeshiftDuration": "0s", + "yAxis": { + "label": "", + "scale": "LINEAR" + } + } + } + }, + { + "yPos": 36, + "width": 24, + "height": 8, + "widget": { + "title": "Time Spent In Waiting State", + "xyChart": { + "chartOptions": { + "mode": "COLOR" + }, + "dataSets": [ + { + "breakdowns": [], + "dimensions": [], + "measures": [], + "plotType": "LINE", + "targetAxis": "Y1", + "timeSeriesQuery": { + "prometheusQuery": "jenkins_job_waiting_duration{${Cluster},${Location},${Namespace}}", + "unitOverride": "" + } + } + ], + "thresholds": [], + "timeshiftDuration": "0s", + "yAxis": { + "label": "", + "scale": "LINEAR" } - }, - "title": "Nodes Offline" - }, - "width": 3, - "xPos": 0, - "yPos": 2 + } + } }, { - "height": 3, + "yPos": 44, + "width": 24, + "height": 8, "widget": { - "title": "Job Queue Duration", + "title": "Time Spent In Blocked State", "xyChart": { "chartOptions": { "mode": "COLOR" }, "dataSets": [ { + "breakdowns": [], + "dimensions": [], + "measures": [], "plotType": "LINE", "targetAxis": "Y1", "timeSeriesQuery": { - "prometheusQuery": "jenkins_job_queuing_duration{${Cluster},${Location},${Namespace}}" + "prometheusQuery": "jenkins_job_blocked_duration{${Cluster},${Location},${Namespace}}", + "unitOverride": "" } } ], + "thresholds": [], "timeshiftDuration": "0s", "yAxis": { + "label": "", "scale": "LINEAR" } } - }, - "width": 12, - "xPos": 0, - "yPos": 10 + } }, { - "height": 3, + "yPos": 8, + "width": 48, + "height": 8, "widget": { - "title": "Resource Usage", + "title": "CPU Load", "xyChart": { "chartOptions": { "mode": "COLOR" }, "dataSets": [ { + "breakdowns": 
[], + "dimensions": [], + "measures": [], "plotType": "LINE", "targetAxis": "Y1", "timeSeriesQuery": { - "prometheusQuery": "vm_cpu_load{job=\"jenkins-metrics\",${Cluster},${Location},${Namespace}}" + "prometheusQuery": "vm_cpu_load{job=~\".*jenkins.*\",${Cluster},${Location},${Namespace}}", + "unitOverride": "" } } ], + "thresholds": [], "timeshiftDuration": "0s", "yAxis": { + "label": "", "scale": "LINEAR" } } - }, - "width": 6, - "xPos": 6, - "yPos": 7 + } } ] - } + }, + "labels": {} } diff --git a/dashboards/jenkins/jenkins-prometheus.png b/dashboards/jenkins/jenkins-prometheus.png index c49bcf2ffb..f8951f462c 100644 Binary files a/dashboards/jenkins/jenkins-prometheus.png and b/dashboards/jenkins/jenkins-prometheus.png differ diff --git a/dashboards/jenkins/metadata.yaml b/dashboards/jenkins/metadata.yaml index 956a1429ad..0520982bd0 100644 --- a/dashboards/jenkins/metadata.yaml +++ b/dashboards/jenkins/metadata.yaml @@ -4,7 +4,7 @@ sample_dashboards: id: jenkins-prometheus display_name: Jenkins Prometheus Overview description: |- - This dashboard has charts displaying: Health Check, Executor In Use, Nodes Offline, Executor Free, Plugins Failed, JVM Memory Free %, Job Build Speed, Queue Size, Job Queued Rate, Resource Usage, and Job Queue Duration + This dashboard has charts displaying: Health Check, CPU Load, Executor In Use, Nodes Offline, Executor Free, Plugins Failed, JVM Memory Free %, Time Spent Building, Time Spent In Buildable State, Time Spent In Waiting State, Time Spent In Blocked State, Build Queue Size, Build Queue Buildable Jobs, Build Queue Pending Jobs, and Build Queue Blocked Jobs. 
related_integrations: - id: jenkins platform: GKE diff --git a/dashboards/jetty/README.md b/dashboards/jetty/README.md index 2005adcce9..ca42e827b2 100644 --- a/dashboards/jetty/README.md +++ b/dashboards/jetty/README.md @@ -13,4 +13,4 @@ |Jetty Prometheus Overview| |:------------------| |Filename: [jetty-prometheus-overview.json](jetty-prometheus-overview.json)| -|This dashboard is based on prometheus metrics exposed by an [exporter](https://github.com/prometheus/jmx_exporter), including `Sessions`, `Session Time`, `Select Calls`, `Max Session Time`, `Threads`, and `Thread Queue`.| +|This dashboard is based on prometheus metrics exposed by an [exporter](https://github.com/prometheus/jmx_exporter), including `Sessions`, `Session Time`, `Select Calls`, `Max Session Time`, `Idle Threads`, `Busy Threads`, `Utilized Threads`, `Threads`, and `Thread Queue`.| diff --git a/dashboards/jetty/jetty-prometheus-overview.json b/dashboards/jetty/jetty-prometheus-overview.json index ae644528be..0d156d5a6a 100644 --- a/dashboards/jetty/jetty-prometheus-overview.json +++ b/dashboards/jetty/jetty-prometheus-overview.json @@ -1,5 +1,5 @@ { - "category": "CUSTOM", + "displayName": "Jetty Prometheus Overview", "dashboardFilters": [ { "filterType": "RESOURCE_LABEL", @@ -17,13 +17,13 @@ "templateVariable": "Namespace" } ], - "displayName": "Jetty Prometheus Overview", - "labels": {}, "mosaicLayout": { - "columns": 12, + "columns": 48, "tiles": [ { - "height": 4, + "xPos": 24, + "width": 24, + "height": 8, "widget": { "title": "Select Calls", "xyChart": { @@ -32,25 +32,31 @@ }, "dataSets": [ { + "breakdowns": [], + "dimensions": [], + "measures": [], "plotType": "LINE", "targetAxis": "Y1", "timeSeriesQuery": { - "prometheusQuery": "jetty_io_selectCount{${Cluster},${Location},${Namespace}}" + "prometheusQuery": "jetty_io_selectCount{${Cluster},${Location},${Namespace}}", + "unitOverride": "" } } ], + "thresholds": [], "timeshiftDuration": "0s", "yAxis": { + "label": "", "scale": "LINEAR" } 
} - }, - "width": 6, - "xPos": 0, - "yPos": 4 + } }, { - "height": 4, + "xPos": 24, + "yPos": 28, + "width": 24, + "height": 8, "widget": { "title": "Sessions", "xyChart": { @@ -59,25 +65,31 @@ }, "dataSets": [ { + "breakdowns": [], + "dimensions": [], + "measures": [], "plotType": "LINE", "targetAxis": "Y1", "timeSeriesQuery": { - "prometheusQuery": "jetty_server_session_sessionsCurrent{${Cluster},${Location},${Namespace}}" + "prometheusQuery": "jetty_server_session_sessionsCurrent{${Cluster},${Location},${Namespace}}", + "unitOverride": "" } } ], + "thresholds": [], "timeshiftDuration": "0s", "yAxis": { + "label": "", "scale": "LINEAR" } } - }, - "width": 6, - "xPos": 0, - "yPos": 0 + } }, { - "height": 4, + "xPos": 24, + "yPos": 20, + "width": 24, + "height": 8, "widget": { "title": "Session Time", "xyChart": { @@ -86,25 +98,31 @@ }, "dataSets": [ { + "breakdowns": [], + "dimensions": [], + "measures": [], "plotType": "LINE", "targetAxis": "Y1", "timeSeriesQuery": { - "prometheusQuery": "jetty_server_session_sessionTimeTotal{${Cluster},${Location},${Namespace}}" + "prometheusQuery": "jetty_server_session_sessionTimeTotal{${Cluster},${Location},${Namespace}}", + "unitOverride": "" } } ], + "thresholds": [], "timeshiftDuration": "0s", "yAxis": { + "label": "", "scale": "LINEAR" } } - }, - "width": 6, - "xPos": 6, - "yPos": 0 + } }, { - "height": 4, + "xPos": 24, + "yPos": 12, + "width": 24, + "height": 8, "widget": { "title": "Max Session Time", "xyChart": { @@ -113,25 +131,30 @@ }, "dataSets": [ { + "breakdowns": [], + "dimensions": [], + "measures": [], "plotType": "LINE", "targetAxis": "Y1", "timeSeriesQuery": { - "prometheusQuery": "jetty_server_session_sessionTimeMax{${Cluster},${Location},${Namespace}}" + "prometheusQuery": "jetty_server_session_sessionTimeMax{${Cluster},${Location},${Namespace}}", + "unitOverride": "" } } ], + "thresholds": [], "timeshiftDuration": "0s", "yAxis": { + "label": "", "scale": "LINEAR" } } - }, - "width": 6, - "xPos": 6, - 
"yPos": 4 + } }, { - "height": 4, + "yPos": 12, + "width": 24, + "height": 8, "widget": { "title": "Threads", "xyChart": { @@ -140,25 +163,30 @@ }, "dataSets": [ { + "breakdowns": [], + "dimensions": [], + "measures": [], "plotType": "LINE", "targetAxis": "Y1", "timeSeriesQuery": { - "prometheusQuery": "jetty_util_thread_threads{${Cluster},${Location},${Namespace}}" + "prometheusQuery": "jetty_util_thread_threads{${Cluster},${Location},${Namespace}}", + "unitOverride": "" } } ], + "thresholds": [], "timeshiftDuration": "0s", "yAxis": { + "label": "", "scale": "LINEAR" } } - }, - "width": 6, - "xPos": 0, - "yPos": 8 + } }, { - "height": 4, + "yPos": 20, + "width": 24, + "height": 8, "widget": { "title": "Thread Queue", "xyChart": { @@ -167,23 +195,120 @@ }, "dataSets": [ { + "breakdowns": [], + "dimensions": [], + "measures": [], "plotType": "LINE", "targetAxis": "Y1", "timeSeriesQuery": { - "prometheusQuery": "jetty_util_thread_queueSize{${Cluster},${Location},${Namespace}}" + "prometheusQuery": "jetty_util_thread_queueSize{${Cluster},${Location},${Namespace}}", + "unitOverride": "" } } ], + "thresholds": [], "timeshiftDuration": "0s", "yAxis": { + "label": "", "scale": "LINEAR" } } - }, - "width": 6, - "xPos": 6, - "yPos": 8 + } + }, + { + "xPos": 24, + "yPos": 8, + "width": 24, + "height": 4, + "widget": { + "title": "Sessions", + "text": { + "content": "", + "format": "MARKDOWN", + "style": { + "backgroundColor": "#E7EFFE", + "fontSize": "FS_LARGE", + "horizontalAlignment": "H_LEFT", + "padding": "P_EXTRA_SMALL", + "pointerLocation": "POINTER_LOCATION_UNSPECIFIED", + "textColor": "#212121", + "verticalAlignment": "V_TOP" + } + } + } + }, + { + "yPos": 8, + "width": 24, + "height": 4, + "widget": { + "title": "Threads", + "text": { + "content": "", + "format": "MARKDOWN", + "style": { + "backgroundColor": "#FEF8E6", + "fontSize": "FS_LARGE", + "horizontalAlignment": "H_LEFT", + "padding": "P_EXTRA_SMALL", + "pointerLocation": "POINTER_LOCATION_UNSPECIFIED", + 
"textColor": "#212121", + "verticalAlignment": "V_TOP" + } + } + } + }, + { + "width": 8, + "height": 8, + "widget": { + "title": "Busy Threads", + "scorecard": { + "blankView": {}, + "thresholds": [], + "timeSeriesQuery": { + "outputFullDuration": true, + "prometheusQuery": "jetty_util_thread_busyThreads{${Cluster},${Location},${Namespace}}", + "unitOverride": "" + } + } + } + }, + { + "xPos": 8, + "width": 8, + "height": 8, + "widget": { + "title": "Idle Threads", + "scorecard": { + "blankView": {}, + "thresholds": [], + "timeSeriesQuery": { + "outputFullDuration": true, + "prometheusQuery": "jetty_util_thread_idleThreads{${Cluster},${Location},${Namespace}}", + "unitOverride": "" + } + } + } + }, + { + "xPos": 16, + "width": 8, + "height": 8, + "widget": { + "title": "Utilized Threads", + "scorecard": { + "blankView": {}, + "thresholds": [], + "timeSeriesQuery": { + "outputFullDuration": true, + "prometheusQuery": "jetty_util_thread_utilizedThreads{${Cluster},${Location},${Namespace}}", + "unitOverride": "" + } + } + } } ] - } + }, + "labels": {} } diff --git a/dashboards/jetty/jetty-prometheus-overview.png b/dashboards/jetty/jetty-prometheus-overview.png index d1ce1e46fe..89c02e6085 100644 Binary files a/dashboards/jetty/jetty-prometheus-overview.png and b/dashboards/jetty/jetty-prometheus-overview.png differ diff --git a/dashboards/jetty/metadata.yaml b/dashboards/jetty/metadata.yaml index 9c01ac562f..a3060274dc 100644 --- a/dashboards/jetty/metadata.yaml +++ b/dashboards/jetty/metadata.yaml @@ -14,7 +14,7 @@ sample_dashboards: id: jetty-prometheus-overview display_name: Jetty Prometheus Overview description: |- - This dashboard is based on prometheus metrics exposed by an [exporter](https://github.com/prometheus/jmx_exporter), including Sessions, Session Time, Select Calls, Max Session Time, Threads, and Thread Queue. 
+ This dashboard is based on prometheus metrics exposed by an [exporter](https://github.com/prometheus/jmx_exporter), including Sessions, Session Time, Select Calls, Max Session Time, Idle Threads, Busy Threads, Utilized Threads, Threads, and Thread Queue. related_integrations: - id: jetty platform: GKE diff --git a/integrations/argo-workflows/prometheus_metadata.yaml b/integrations/argo-workflows/prometheus_metadata.yaml index 775bdb7397..283fe17905 100644 --- a/integrations/argo-workflows/prometheus_metadata.yaml +++ b/integrations/argo-workflows/prometheus_metadata.yaml @@ -13,10 +13,18 @@ platforms: prometheus_name: argo_workflows_operation_duration_seconds_sum kind: CUMULATIVE value_type: DOUBLE + - name: prometheus.googleapis.com/argo_workflows_k8s_request_total/counter + prometheus_name: argo_workflows_k8s_request_total + kind: CUMULATIVE + value_type: DOUBLE - name: prometheus.googleapis.com/argo_workflows_count/gauge prometheus_name: argo_workflows_count kind: GAUGE value_type: DOUBLE + - name: prometheus.googleapis.com/argo_workflows_workflow_condition/gauge + prometheus_name: argo_workflows_workflow_condition + kind: GAUGE + value_type: DOUBLE - name: prometheus.googleapis.com/argo_workflows_queue_adds_count/counter prometheus_name: argo_workflows_queue_adds_count kind: CUMULATIVE diff --git a/integrations/consul/prometheus_metadata.yaml b/integrations/consul/prometheus_metadata.yaml index d4aac04043..59f723acc5 100644 --- a/integrations/consul/prometheus_metadata.yaml +++ b/integrations/consul/prometheus_metadata.yaml @@ -53,4 +53,12 @@ platforms: prometheus_name: consul_raft_state_leader kind: CUMULATIVE value_type: DOUBLE + - name: prometheus.googleapis.com/consul_txn_apply/summary + prometheus_name: consul_txn_apply + kind: GAUGE + value_type: DOUBLE + - name: prometheus.googleapis.com/consul_raft_leader_lastContact/summary + prometheus_name: consul_raft_leader_lastContact + kind: GAUGE + value_type: DOUBLE install_documentation_url: 
https://cloud.google.com/stackdriver/docs/managed-prometheus/exporters/consul diff --git a/integrations/jenkins/prometheus_metadata.yaml b/integrations/jenkins/prometheus_metadata.yaml index 62697164a9..615b2eef91 100644 --- a/integrations/jenkins/prometheus_metadata.yaml +++ b/integrations/jenkins/prometheus_metadata.yaml @@ -41,10 +41,34 @@ platforms: prometheus_name: jenkins_job_building_duration kind: GAUGE value_type: DOUBLE + - name: prometheus.googleapis.com/jenkins_job_buildable_duration/summary + prometheus_name: jenkins_job_buildable_duration + kind: GAUGE + value_type: DOUBLE + - name: prometheus.googleapis.com/jenkins_job_waiting_duration/summary + prometheus_name: jenkins_job_waiting_duration + kind: GAUGE + value_type: DOUBLE + - name: prometheus.googleapis.com/jenkins_job_blocked_duration/summary + prometheus_name: jenkins_job_blocked_duration + kind: GAUGE + value_type: DOUBLE - name: prometheus.googleapis.com/jenkins_queue_size_value/gauge prometheus_name: jenkins_queue_size_value kind: GAUGE value_type: DOUBLE + - name: prometheus.googleapis.com/jenkins_queue_buildable_value/gauge + prometheus_name: jenkins_queue_buildable_value + kind: GAUGE + value_type: DOUBLE + - name: prometheus.googleapis.com/jenkins_queue_pending_value/gauge + prometheus_name: jenkins_queue_pending_value + kind: GAUGE + value_type: DOUBLE + - name: prometheus.googleapis.com/jenkins_queue_blocked_value/gauge + prometheus_name: jenkins_queue_blocked_value + kind: GAUGE + value_type: DOUBLE - name: prometheus.googleapis.com/jenkins_job_queuing_duration_count/summary prometheus_name: jenkins_job_queuing_duration_count kind: CUMULATIVE diff --git a/integrations/jetty/prometheus_metadata.yaml b/integrations/jetty/prometheus_metadata.yaml index a67bb2de48..bcba902d7c 100644 --- a/integrations/jetty/prometheus_metadata.yaml +++ b/integrations/jetty/prometheus_metadata.yaml @@ -29,6 +29,18 @@ platforms: prometheus_name: jetty_util_thread_threads kind: CUMULATIVE value_type: DOUBLE + 
- name: prometheus.googleapis.com/jetty_util_thread_idleThreads/unknown:counter + prometheus_name: jetty_util_thread_idleThreads + kind: CUMULATIVE + value_type: DOUBLE + - name: prometheus.googleapis.com/jetty_util_thread_busyThreads/unknown:counter + prometheus_name: jetty_util_thread_busyThreads + kind: CUMULATIVE + value_type: DOUBLE + - name: prometheus.googleapis.com/jetty_util_thread_utilizedThreads/unknown:counter + prometheus_name: jetty_util_thread_utilizedThreads + kind: CUMULATIVE + value_type: DOUBLE - name: prometheus.googleapis.com/jetty_util_thread_queueSize/unknown:counter prometheus_name: jetty_util_thread_queueSize kind: CUMULATIVE diff --git a/scripts/dashboard-importer/import.sh b/scripts/dashboard-importer/import.sh index 58ddea5ba0..b0cd954f6b 100755 --- a/scripts/dashboard-importer/import.sh +++ b/scripts/dashboard-importer/import.sh @@ -35,7 +35,8 @@ then echo "Conversion complete. Proceeding to upload..." # Extract output directory from console output - DIRECTORY=$(grep -Po '(reports.*/)' <<< "$CONVERSION_OUTPUT" | sed -n '2p') + DIRECTORY="${CONVERSION_OUTPUT##*./upload.sh }" + DIRECTORY="${DIRECTORY% *}" echo echo -e "Now running: \033[34m./upload.sh $DIRECTORY $PROJECT\033[0m\n"