Skip to content

Commit

Permalink
fix: request record label with commitment mode and version
Browse files Browse the repository at this point in the history
  • Loading branch information
hopeyen committed Aug 30, 2024
1 parent bb493d8 commit 790e10a
Show file tree
Hide file tree
Showing 5 changed files with 327 additions and 16 deletions.
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,8 @@ An optional storage caching CLI flag `--routing.cache-targets` can be leveraged

To see the list of available metrics, run `./bin/eigenda-proxy doc metrics`

To quickly set up a monitoring dashboard, add the eigenda-proxy metrics endpoint to a reachable Prometheus server config as a scrape target, add the Prometheus datasource to Grafana, and import the existing [Grafana dashboard JSON file](./grafana_dashboard.json).

## Deployment Guide

### Hardware Requirements
Expand Down
243 changes: 243 additions & 0 deletions grafana_dashboard.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,243 @@
{
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": {
"type": "grafana",
"uid": "-- Grafana --"
},
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"type": "dashboard"
}
]
},
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 0,
"id": 2,
"links": [],
"panels": [
{
"datasource": {
"type": "prometheus",
"uid": "ddshms3dlineoe"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 10,
"w": 12,
"x": 0,
"y": 0
},
"id": 3,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "ddshms3dlineoe"
},
"editorMode": "code",
"expr": "eigenda_proxy_default_rpc_server_requests_total{method=\"/put/\"}",
"instant": false,
"legendFormat": "{{__name__}}",
"range": true,
"refId": "A"
}
],
"title": "/put requests total",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "ddshms3dlineoe"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 10,
"w": 12,
"x": 12,
"y": 0
},
"id": 4,
"options": {
"displayMode": "gradient",
"maxVizHeight": 300,
"minVizHeight": 16,
"minVizWidth": 8,
"namePlacement": "auto",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"showUnfilled": true,
"sizing": "auto",
"valueMode": "color"
},
"pluginVersion": "11.1.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "ddshms3dlineoe"
},
"editorMode": "code",
"expr": "eigenda_proxy_default_rpc_server_request_duration_seconds_bucket{method=\"/put/\"}",
"format": "heatmap",
"instant": false,
"legendFormat": "__auto",
"range": true,
"refId": "A"
}
],
"title": "/put requests duration",
"type": "bargauge"
},
{
"datasource": {
"type": "loki",
"uid": "loki-datasource"
},
"gridPos": {
"h": 8,
"w": 24,
"x": 0,
"y": 10
},
"id": 2,
"options": {
"dedupStrategy": "none",
"enableLogDetails": true,
"prettifyLogMessage": false,
"showCommonLabels": false,
"showLabels": false,
"showTime": false,
"sortOrder": "Descending",
"wrapLogMessage": false
},
"targets": [
{
"datasource": {
"type": "loki",
"uid": "loki-datasource"
},
"editorMode": "builder",
"expr": "{container=\"ops-bedrock-da-server-1\"} |= ``",
"queryType": "range",
"refId": "A"
}
],
"title": "logs",
"type": "logs"
}
],
"schemaVersion": 39,
"tags": [],
"templating": {
"list": []
},
"time": {
"from": "now-6h",
"to": "now"
},
"timepicker": {},
"timezone": "browser",
"title": "EigenDA Proxy",
"uid": "ddw5n232n5vy8e",
"version": 1,
"weekStart": ""
}
33 changes: 26 additions & 7 deletions metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,8 @@ type Config struct {
type Metricer interface {
RecordInfo(version string)
RecordUp()
RecordRPCServerRequest(method string) func(status string)
RecordBadRequestHeader(method string, errorCategory string)
RecordRPCServerRequest(method string, commitmentMode string, version string) func(status string)

Document() []metrics.DocumentedMetric
}
Expand All @@ -40,6 +41,7 @@ type Metrics struct {
Up prometheus.Gauge

HTTPServerRequestsTotal *prometheus.CounterVec
HTTPServerBadRequestHeader *prometheus.CounterVec
HTTPServerRequestDurationSeconds *prometheus.HistogramVec

registry *prometheus.Registry
Expand Down Expand Up @@ -79,7 +81,15 @@ func NewMetrics(subsystem string) *Metrics {
Name: "requests_total",
Help: "Total requests to the HTTP server",
}, []string{
"method", "status",
"method", "status", "commitment_mode", "commitment_version",
}),
HTTPServerBadRequestHeader: factory.NewCounterVec(prometheus.CounterOpts{
Namespace: namespace,
Subsystem: httpServerSubsystem,
Name: "requests_bad_header_total",
Help: "Total requests to the HTTP server with bad headers",
}, []string{
"method", "error_type",
}),
HTTPServerRequestDurationSeconds: factory.NewHistogramVec(prometheus.HistogramOpts{
Namespace: namespace,
Expand All @@ -90,7 +100,7 @@ func NewMetrics(subsystem string) *Metrics {
Buckets: prometheus.ExponentialBucketsRange(0.05, 1200, 20),
Help: "Histogram of HTTP server request durations",
}, []string{
"method", // no status on histograms because those are very expensive
"method", "commitment_mode", "commitment_version", // no status on histograms because those are very expensive
}),
registry: registry,
factory: factory,
Expand All @@ -112,16 +122,22 @@ func (m *Metrics) RecordUp() {
// RecordRPCServerRequest is a helper method to record an incoming HTTP request.
// It starts a timer for the request duration histogram and returns a callback that,
// given the HTTP status code, bumps the requests counter and observes the duration.
func (m *Metrics) RecordRPCServerRequest(method string) func(status string) {
func (m *Metrics) RecordRPCServerRequest(method string, mode string, ver string) func(status string) {
// we don't want to track the status code on the histogram because that would
// create a huge number of labels, and cost a lot on cloud hosted services
timer := prometheus.NewTimer(m.HTTPServerRequestDurationSeconds.WithLabelValues(method))
timer := prometheus.NewTimer(m.HTTPServerRequestDurationSeconds.WithLabelValues(method, mode, ver))
return func(status string) {
m.HTTPServerRequestsTotal.WithLabelValues(method, status).Inc()
m.HTTPServerRequestsTotal.WithLabelValues(method, status, mode, ver).Inc()
timer.ObserveDuration()
}
}

// RecordBadRequestHeader records an incoming HTTP request whose header failed
// the commitment scheme format checks. It increments the bad-header counter
// using the request method and the error category as label values.
func (m *Metrics) RecordBadRequestHeader(method string, err string) {
	counter := m.HTTPServerBadRequestHeader.WithLabelValues(method, err)
	counter.Inc()
}

// StartServer starts the metrics server on the given hostname and port.
func (m *Metrics) StartServer(hostname string, port int) (*ophttp.HTTPServer, error) {
addr := net.JoinHostPort(hostname, strconv.Itoa(port))
Expand Down Expand Up @@ -150,6 +166,9 @@ func (n *noopMetricer) RecordInfo(_ string) {
// RecordUp is the no-op variant of the metric recorder; it does nothing.
func (n *noopMetricer) RecordUp() {
}

func (n *noopMetricer) RecordRPCServerRequest(string) func(status string) {
// RecordBadRequestHeader is the no-op variant of the bad-header recorder;
// both arguments are ignored and nothing is recorded.
func (n *noopMetricer) RecordBadRequestHeader(_ string, _ string) {
}

// RecordRPCServerRequest is the no-op variant of the request recorder. All
// arguments are ignored, and the returned callback discards the status it
// is given.
func (n *noopMetricer) RecordRPCServerRequest(string, string, string) func(status string) {
	discard := func(string) {}
	return discard
}
29 changes: 29 additions & 0 deletions monitor/prometheus.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# my global config
global:
scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
# scrape_timeout is set to the global default (10s).

# Alertmanager configuration
alerting:
alertmanagers:
- static_configs:
- targets:
# - alertmanager:9093

# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
# - "first_rules.yml"
# - "second_rules.yml"

# A scrape configuration containing exactly one endpoint to scrape:
# Here it's the eigenda-proxy metrics endpoint.
scrape_configs:
# The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
- job_name: "eigenda-proxy"

# metrics_path defaults to '/metrics'
# scheme defaults to 'http'.

static_configs:
- targets: ["localhost:7300"]
Loading

0 comments on commit 790e10a

Please sign in to comment.