From 790e10ab8b4ed90be84081198518d48067aba4d2 Mon Sep 17 00:00:00 2001 From: hopeyen Date: Fri, 30 Aug 2024 09:43:28 -0500 Subject: [PATCH] fix: request record label with commitment mode and version --- README.md | 2 + grafana_dashboard.json | 243 +++++++++++++++++++++++++++++++++++++++++ metrics/metrics.go | 33 ++++-- monitor/prometheus.yml | 29 +++++ server/server.go | 36 ++++-- 5 files changed, 327 insertions(+), 16 deletions(-) create mode 100644 grafana_dashboard.json create mode 100644 monitor/prometheus.yml diff --git a/README.md b/README.md index 2c6f0d75..f8e6b3c0 100644 --- a/README.md +++ b/README.md @@ -100,6 +100,8 @@ An optional storage caching CLI flag `--routing.cache-targets` can be leveraged To the see list of available metrics, run `./bin/eigenda-proxy doc metrics` +To quickly set up a monitoring dashboard, add the eigenda-proxy metrics endpoint to a reachable Prometheus server config as a scrape target, add a Prometheus datasource to Grafana, and import the existing [Grafana dashboard JSON file](./grafana_dashboard.json). + ## Deployment Guide ### Hardware Requirements diff --git a/grafana_dashboard.json b/grafana_dashboard.json new file mode 100644 index 00000000..eb7f48fb --- /dev/null +++ b/grafana_dashboard.json @@ -0,0 +1,243 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": 2, + "links": [], + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "ddshms3dlineoe" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + 
"fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 0 + }, + "id": 3, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "ddshms3dlineoe" + }, + "editorMode": "code", + "expr": "eigenda_proxy_default_rpc_server_requests_total{method=\"/put/\"}", + "instant": false, + "legendFormat": "{{__name__}}", + "range": true, + "refId": "A" + } + ], + "title": "/put requests total", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "ddshms3dlineoe" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 0 + }, + "id": 4, + "options": { + "displayMode": "gradient", + "maxVizHeight": 300, + "minVizHeight": 16, + "minVizWidth": 8, + "namePlacement": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showUnfilled": true, + "sizing": "auto", + "valueMode": "color" + }, + "pluginVersion": "11.1.0", + "targets": 
[ + { + "datasource": { + "type": "prometheus", + "uid": "ddshms3dlineoe" + }, + "editorMode": "code", + "expr": "eigenda_proxy_default_rpc_server_request_duration_seconds_bucket{method=\"/put/\"}", + "format": "heatmap", + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "/put requests duration", + "type": "bargauge" + }, + { + "datasource": { + "type": "loki", + "uid": "loki-datasource" + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 10 + }, + "id": 2, + "options": { + "dedupStrategy": "none", + "enableLogDetails": true, + "prettifyLogMessage": false, + "showCommonLabels": false, + "showLabels": false, + "showTime": false, + "sortOrder": "Descending", + "wrapLogMessage": false + }, + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "loki-datasource" + }, + "editorMode": "builder", + "expr": "{container=\"ops-bedrock-da-server-1\"} |= ``", + "queryType": "range", + "refId": "A" + } + ], + "title": "logs", + "type": "logs" + } + ], + "schemaVersion": 39, + "tags": [], + "templating": { + "list": [] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": {}, + "timezone": "browser", + "title": "EigenDA Proxy", + "uid": "ddw5n232n5vy8e", + "version": 1, + "weekStart": "" +} \ No newline at end of file diff --git a/metrics/metrics.go b/metrics/metrics.go index d46dec78..543f0dad 100644 --- a/metrics/metrics.go +++ b/metrics/metrics.go @@ -29,7 +29,8 @@ type Config struct { type Metricer interface { RecordInfo(version string) RecordUp() - RecordRPCServerRequest(method string) func(status string) + RecordBadRequestHeader(method string, errorCategory string) + RecordRPCServerRequest(method string, commitmentMode string, version string) func(status string) Document() []metrics.DocumentedMetric } @@ -40,6 +41,7 @@ type Metrics struct { Up prometheus.Gauge HTTPServerRequestsTotal *prometheus.CounterVec + HTTPServerBadRequestHeader *prometheus.CounterVec HTTPServerRequestDurationSeconds 
*prometheus.HistogramVec registry *prometheus.Registry @@ -79,7 +81,15 @@ func NewMetrics(subsystem string) *Metrics { Name: "requests_total", Help: "Total requests to the HTTP server", }, []string{ - "method", "status", + "method", "status", "commitment_mode", "commitment_version", + }), + HTTPServerBadRequestHeader: factory.NewCounterVec(prometheus.CounterOpts{ + Namespace: namespace, + Subsystem: httpServerSubsystem, + Name: "requests_bad_header_total", + Help: "Total requests to the HTTP server with bad headers", + }, []string{ + "method", "error_type", }), HTTPServerRequestDurationSeconds: factory.NewHistogramVec(prometheus.HistogramOpts{ Namespace: namespace, @@ -90,7 +100,7 @@ func NewMetrics(subsystem string) *Metrics { Buckets: prometheus.ExponentialBucketsRange(0.05, 1200, 20), Help: "Histogram of HTTP server request durations", }, []string{ - "method", // no status on histograms because those are very expensive + "method", "commitment_mode", "commitment_version", // no status on histograms because those are very expensive }), registry: registry, factory: factory, @@ -112,16 +122,22 @@ func (m *Metrics) RecordUp() { // RecordRPCServerRequest is a helper method to record an incoming HTTP request. // It bumps the requests metric, and tracks how long it takes to serve a response, // including the HTTP status code. 
-func (m *Metrics) RecordRPCServerRequest(method string) func(status string) { +func (m *Metrics) RecordRPCServerRequest(method string, mode string, ver string) func(status string) { // we don't want to track the status code on the histogram because that would // create a huge number of labels, and cost a lot on cloud hosted services - timer := prometheus.NewTimer(m.HTTPServerRequestDurationSeconds.WithLabelValues(method)) + timer := prometheus.NewTimer(m.HTTPServerRequestDurationSeconds.WithLabelValues(method, mode, ver)) return func(status string) { - m.HTTPServerRequestsTotal.WithLabelValues(method, status).Inc() + m.HTTPServerRequestsTotal.WithLabelValues(method, status, mode, ver).Inc() timer.ObserveDuration() } } +// RecordBadRequestHeader records an incoming HTTP request that fails the commitment +// scheme format, labeled by request method and error type. +func (m *Metrics) RecordBadRequestHeader(method string, err string) { + m.HTTPServerBadRequestHeader.WithLabelValues(method, err).Inc() +} + // StartServer starts the metrics server on the given hostname and port. func (m *Metrics) StartServer(hostname string, port int) (*ophttp.HTTPServer, error) { addr := net.JoinHostPort(hostname, strconv.Itoa(port)) @@ -150,6 +166,9 @@ func (n *noopMetricer) RecordInfo(_ string) { func (n *noopMetricer) RecordUp() { } -func (n *noopMetricer) RecordRPCServerRequest(string) func(status string) { +func (n *noopMetricer) RecordBadRequestHeader(string, string) { +} + +func (n *noopMetricer) RecordRPCServerRequest(string, string, string) func(status string) { return func(string) {} } diff --git a/monitor/prometheus.yml b/monitor/prometheus.yml new file mode 100644 index 00000000..3f7441a7 --- /dev/null +++ b/monitor/prometheus.yml @@ -0,0 +1,29 @@ +# my global config +global: + scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute. + evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute. 
+ # scrape_timeout is set to the global default (10s). + +# Alertmanager configuration +alerting: + alertmanagers: + - static_configs: + - targets: + # - alertmanager:9093 + +# Load rules once and periodically evaluate them according to the global 'evaluation_interval'. +rule_files: + # - "first_rules.yml" + # - "second_rules.yml" + +# A scrape configuration containing exactly one endpoint to scrape: +# Here it's the eigenda-proxy metrics endpoint. +scrape_configs: + # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config. + - job_name: "eigenda-proxy" + + # metrics_path defaults to '/metrics' + # scheme defaults to 'http'. + + static_configs: + - targets: ["localhost:7300"] diff --git a/server/server.go b/server/server.go index 6a1aa7fe..fe54d22a 100644 --- a/server/server.go +++ b/server/server.go @@ -66,16 +66,21 @@ func NewServer(host string, port int, router *store.Router, log log.Logger, func WithMetrics(handleFn func(http.ResponseWriter, *http.Request) error, m metrics.Metricer) func(http.ResponseWriter, *http.Request) error { return func(w http.ResponseWriter, r *http.Request) error { - // we use a commitment schema (https://github.com/Layr-Labs/eigenda-proxy?tab=readme-ov-file#commitment-schemas) - // where the first 3 bytes of the path are the commitment header - // commit type | da layer type | version byte - // we want to group all requests by commitment header, otherwise the prometheus metric labels will explode - // TODO: commitment header is different for non-op commitments. We will need to change this to accommodate other commitments. - // probably want (commitment mode, cert version) as the labels, since commit-type/da-layer are not relevant anyways. 
- commitmentHeader := r.URL.Path[:3] - recordDur := m.RecordRPCServerRequest(commitmentHeader) + // label requests with commitment mode and version + ct, err := ReadCommitmentMode(r) + if err != nil { + m.RecordBadRequestHeader(r.Method, invalidCommitmentMode) + return err + } + vb, err := ReadCommitmentVersion(r, ct) + if err != nil { + m.RecordBadRequestHeader(r.Method, invalidCommitmentMode) + return err + } - err := handleFn(w, r) + recordDur := m.RecordRPCServerRequest(r.Method, string(ct), strconv.Itoa(int(vb))) + + err = handleFn(w, r) // we assume that every route will set the status header recordDur(w.Header().Get("status")) return err @@ -290,6 +295,19 @@ func ReadCommitmentMode(r *http.Request) (commitments.CommitmentMode, error) { return commitments.OptimismAltDA, nil } +func ReadCommitmentVersion(r *http.Request, mode commitments.CommitmentMode) (uint8, error) { + commitment := r.URL.Path[1:] + if len(commitment) < 3 { + return 0, fmt.Errorf("commitment is too short") + } + + if mode == commitments.OptimismAltDA || mode == commitments.OptimismGeneric { + return commitment[2], nil + } + // the only other mode is simple, which take first byte as version + return commitment[0], nil +} + func (svr *Server) GetEigenDAStats() *store.Stats { return svr.router.GetEigenDAStore().Stats() }