Layr-Labs · samlaf · Aug 22, 2024 · Aug 20, 2024 · Aug 21, 2024 · Aug 21, 2024
diff --git a/metrics/metrics.go b/metrics/metrics.go
@@ -13,7 +13,8 @@ import (
 )
 
 const (
-	Namespace = "eigenda_proxy"
+	namespace           = "eigenda_proxy"
+	httpServerSubsystem = "http_server"
 )
 
 // Config ... Metrics server configuration
@@ -28,8 +29,7 @@ type Config struct {
 type Metricer interface {
 	RecordInfo(version string)
 	RecordUp()
-	RecordRPCServerRequest(method string) func()
-	RecordRPCClientResponse(method string, err error)
+	RecordRPCServerRequest(method string) func(status string)
 
 	Document() []metrics.DocumentedMetric
 }
@@ -39,19 +39,19 @@ type Metrics struct {
 	Info *prometheus.GaugeVec
 	Up   prometheus.Gauge
 
-	metrics.RPCMetrics
+	HTTPServerRequestsTotal          *prometheus.CounterVec
+	HTTPServerRequestDurationSeconds *prometheus.HistogramVec
 
 	registry *prometheus.Registry
 	factory  metrics.Factory
 }
 
 var _ Metricer = (*Metrics)(nil)
 
-func NewMetrics(procName string) *Metrics {
-	if procName == "" {
-		procName = "default"
+func NewMetrics(subsystem string) *Metrics {
+	if subsystem == "" {
+		subsystem = "default"
 	}
-	ns := Namespace + "_" + procName
 
 	registry := prometheus.NewRegistry()
 	registry.MustRegister(collectors.NewProcessCollector(collectors.ProcessCollectorOpts{}))
@@ -60,20 +60,40 @@ func NewMetrics(procName string) *Metrics {
 
 	return &Metrics{
 		Up: factory.NewGauge(prometheus.GaugeOpts{
-			Namespace: ns,
+			Namespace: namespace,
+			Subsystem: subsystem,
 			Name:      "up",
 			Help:      "1 if the proxy server has finished starting up",
 		}),
 		Info: factory.NewGaugeVec(prometheus.GaugeOpts{
-			Namespace: ns,
+			Namespace: namespace,
+			Subsystem: subsystem,
 			Name:      "info",
 			Help:      "Pseudo-metric tracking version and config info",
 		}, []string{
 			"version",
 		}),
-		RPCMetrics: metrics.MakeRPCMetrics(ns, factory),
-		registry:   registry,
-		factory:    factory,
+		HTTPServerRequestsTotal: factory.NewCounterVec(prometheus.CounterOpts{
+			Namespace: namespace,
+			Subsystem: httpServerSubsystem,
+			Name:      "requests_total",
+			Help:      "Total requests to the HTTP server",
+		}, []string{
+			"method", "status",
+		}),
+		HTTPServerRequestDurationSeconds: factory.NewHistogramVec(prometheus.HistogramOpts{
+			Namespace: namespace,
+			Subsystem: httpServerSubsystem,
+			Name:      "request_duration_seconds",
+			// TODO: we might want different buckets for different routes?
+			// also probably different buckets depending on the backend (memstore, s3, and eigenda have different latencies)
+			Buckets: prometheus.ExponentialBucketsRange(0.05, 1200, 20),
+			Help:    "Histogram of HTTP server request durations",
+		}, []string{
+			"method", // no status on histograms because those are very expensive
+		}),
+		registry: registry,
+		factory:  factory,
 	}
 }
 
@@ -89,6 +109,19 @@ func (m *Metrics) RecordUp() {
 	m.Up.Set(1)
 }
 
+// RecordRPCServerRequest is a helper method to record an incoming HTTP request.
+// It bumps the requests metric, and tracks how long it takes to serve a response,
+// including the HTTP status code.
+func (m *Metrics) RecordRPCServerRequest(method string) func(status string) {
+	// we don't want to track the status code on the histogram because that would
+	// create a huge number of labels, and cost a lot on cloud hosted services
+	timer := prometheus.NewTimer(m.HTTPServerRequestDurationSeconds.WithLabelValues(method))
+	return func(status string) {
+		m.HTTPServerRequestsTotal.WithLabelValues(method, status).Inc()
+		timer.ObserveDuration()
+	}
+}
+
 // StartServer starts the metrics server on the given hostname and port.
 func (m *Metrics) StartServer(hostname string, port int) (*ophttp.HTTPServer, error) {
 	addr := net.JoinHostPort(hostname, strconv.Itoa(port))
@@ -103,7 +136,6 @@ func (m *Metrics) Document() []metrics.DocumentedMetric {
 }
 
 type noopMetricer struct {
-	metrics.NoopRPCMetrics
 }
 
 var NoopMetrics Metricer = new(noopMetricer)
@@ -117,3 +149,7 @@ func (n *noopMetricer) RecordInfo(_ string) {
 
 func (n *noopMetricer) RecordUp() {
 }
+
+func (n *noopMetricer) RecordRPCServerRequest(string) func(status string) {
+	return func(string) {}
+}
diff --git a/server/server.go b/server/server.go
@@ -70,11 +70,15 @@ func WithMetrics(handleFn func(http.ResponseWriter, *http.Request) error,
 		// where the first 3 bytes of the path are the commitment header
 		// commit type | da layer type | version byte
 		// we want to group all requests by commitment header, otherwise the prometheus metric labels will explode
+		// TODO: commitment header is different for non-op commitments. We will need to change this to accommodate other commitments.
+		//       probably want (commitment mode, cert version) as the labels, since commit-type/da-layer are not relevant anyways.
 		commitmentHeader := r.URL.Path[:3]
 		recordDur := m.RecordRPCServerRequest(commitmentHeader)
-		defer recordDur()
 
-		return handleFn(w, r)
+		err := handleFn(w, r)
+		// we assume that every route will set the status header
+		recordDur(w.Header().Get("status"))
+		return err
 	}
 }