From 04859aa11c4638e29fe7e2d69a26d943bd197029 Mon Sep 17 00:00:00 2001 From: Jacob Hoffman-Andrews Date: Tue, 5 Sep 2023 10:49:50 -0700 Subject: [PATCH 1/5] Add metrics --- go.mod | 10 ++++++ go.sum | 31 ++++++++++++++++++- main.go | 94 +++++++++++++++++++++++++++++++++++++++++++++++++++------ 3 files changed, 125 insertions(+), 10 deletions(-) diff --git a/go.mod b/go.mod index 1ada73c..d7b4da0 100644 --- a/go.mod +++ b/go.mod @@ -8,6 +8,7 @@ require ( github.com/aws/aws-sdk-go-v2/credentials v1.13.35 github.com/aws/aws-sdk-go-v2/service/s3 v1.38.5 github.com/fxamacker/cbor/v2 v2.5.0 + github.com/prometheus/client_golang v1.16.0 golang.org/x/sync v0.3.0 ) @@ -26,5 +27,14 @@ require ( github.com/aws/aws-sdk-go-v2/service/ssooidc v1.15.5 // indirect github.com/aws/aws-sdk-go-v2/service/sts v1.21.5 // indirect github.com/aws/smithy-go v1.14.2 // indirect + github.com/beorn7/perks v1.0.1 // indirect + github.com/cespare/xxhash/v2 v2.2.0 // indirect + github.com/golang/protobuf v1.5.3 // indirect + github.com/matttproud/golang_protobuf_extensions v1.0.4 // indirect + github.com/prometheus/client_model v0.3.0 // indirect + github.com/prometheus/common v0.42.0 // indirect + github.com/prometheus/procfs v0.10.1 // indirect github.com/x448/float16 v0.8.4 // indirect + golang.org/x/sys v0.8.0 // indirect + google.golang.org/protobuf v1.30.0 // indirect ) diff --git a/go.sum b/go.sum index d734bf4..8d87e89 100644 --- a/go.sum +++ b/go.sum @@ -34,18 +34,47 @@ github.com/aws/aws-sdk-go-v2/service/sts v1.21.5 h1:CQBFElb0LS8RojMJlxRSo/HXipvT github.com/aws/aws-sdk-go-v2/service/sts v1.21.5/go.mod h1:VC7JDqsqiwXukYEDjoHh9U0fOJtNWh04FPQz4ct4GGU= github.com/aws/smithy-go v1.14.2 h1:MJU9hqBGbvWZdApzpvoF2WAIJDbtjK2NDJSiJP7HblQ= github.com/aws/smithy-go v1.14.2/go.mod h1:Tg+OJXh4MB2R/uN61Ko2f6hTZwB/ZYGOtib8J3gBHzA= +github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= +github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= +github.com/cespare/xxhash/v2 v2.2.0 h1:DC2CZ1Ep5Y4k3ZQ899DldepgrayRUGE6BBZ/cd9Cj44= +github.com/cespare/xxhash/v2 v2.2.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/fxamacker/cbor/v2 v2.5.0 h1:oHsG0V/Q6E/wqTS2O1Cozzsy69nqCiguo5Q1a1ADivE= github.com/fxamacker/cbor/v2 v2.5.0/go.mod h1:TA1xS00nchWmaBnEIxPSE5oHLuJBAVvqrtAnWBwBCVo= -github.com/google/go-cmp v0.5.8 h1:e6P7q2lk1O+qJJb4BtCQXlK8vWEO8V1ZeuEdJNOqZyg= +github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= +github.com/golang/protobuf v1.3.5/go.mod h1:6O5/vntMXwX2lRkT1hjjk0nAC1IDOTvTlVgjlRvqsdk= +github.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaSAoJOfIk= +github.com/golang/protobuf v1.5.3 h1:KhyjKVUg7Usr/dYsdSqoFveMYd5ko72D+zANwlG1mmg= +github.com/golang/protobuf v1.5.3/go.mod h1:XVQd3VNwM+JqD3oG2Ue2ip4fOMUkwXdXDdiuN0vRsmY= +github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.5.8/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= +github.com/google/go-cmp v0.5.9 h1:O2Tfq5qg4qc4AmwVlvv0oLiVAGB7enBSJ2x2DqQFi38= github.com/jmespath/go-jmespath v0.4.0/go.mod h1:T8mJZnbsbmF+m6zOOFylbeCJqk5+pHWvzYPziyZiYoo= github.com/jmespath/go-jmespath/internal/testify v1.5.1/go.mod h1:L3OGu8Wl2/fWfCI6z80xFu9LTZmf1ZRjMHUOPmWr69U= +github.com/matttproud/golang_protobuf_extensions v1.0.4 h1:mmDVorXM7PCGKw94cs5zkfA9PSy5pEvNWRP0ET0TIVo= +github.com/matttproud/golang_protobuf_extensions v1.0.4/go.mod h1:BSXmuO+STAnVfrANrmjBb36TMTDstsz7MSK+HVaYKv4= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/prometheus/client_golang v1.16.0 h1:yk/hx9hDbrGHovbci4BY+pRMfSuuat626eFsHb7tmT8= +github.com/prometheus/client_golang v1.16.0/go.mod h1:Zsulrv/L9oM40tJ7T815tM89lFEugiJ9HzIqaAx4LKc= +github.com/prometheus/client_model v0.3.0 h1:UBgGFHqYdG/TPFD1B1ogZywDqEkwp3fBMvqdiQ7Xew4= +github.com/prometheus/client_model v0.3.0/go.mod h1:LDGWKZIo7rky3hgvBe+caln+Dr3dPggB5dvjtD7w9+w= +github.com/prometheus/common v0.42.0 h1:EKsfXEYo4JpWMHH5cg+KOUWeuJSov1Id8zGR8eeI1YM= +github.com/prometheus/common v0.42.0/go.mod h1:xBwqVerjNdUDjgODMpudtOMwlOwf2SaTr1yjz4b7Zbc= +github.com/prometheus/procfs v0.10.1 h1:kYK1Va/YMlutzCGazswoHKo//tZVlFpKYh+PymziUAg= +github.com/prometheus/procfs v0.10.1/go.mod h1:nwNm2aOCAYw8uTR/9bWRREkZFxAUcWzPHWJq+XBB/FM= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/x448/float16 v0.8.4 h1:qLwI1I70+NjRFUR3zs1JPUCgaCXSh3SW62uAKT1mSBM= github.com/x448/float16 v0.8.4/go.mod h1:14CWIYCyZA/cWjXOioeEpHeN/83MdbZDRQHoFcYsOfg= +golang.org/x/sync v0.0.0-20181221193216-37e7f081c4d4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.3.0 h1:ftCYgMx6zT/asHUrPw8BLLscYtGznsLAnjq5RH9P66E= golang.org/x/sync v0.3.0/go.mod h1:FU7BRWz2tNW+3quACPkgCx/L+uEAv1htQ0V83Z9Rj+Y= +golang.org/x/sys v0.8.0 h1:EBmGv8NaZBZTWvrbjNoL6HVt+IVy3QDQpJs7VRIw3tU= +golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw= +google.golang.org/protobuf v1.26.0/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc= +google.golang.org/protobuf v1.30.0 h1:kPPoIgf3TsEvrm0PFe15JQ+570QVxYzEvvHqChK+cng= +google.golang.org/protobuf v1.30.0/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/yaml.v2 v2.2.8/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= diff --git a/main.go b/main.go index 03057bf..69d9925 100644 --- a/main.go +++ b/main.go @@ -13,6 +13,7 @@ import ( "log" "net/http" "net/url" + "os" "strconv" "strings" "time" @@ -22,6 +23,9 @@ import ( "github.com/aws/aws-sdk-go-v2/service/s3" "github.com/aws/aws-sdk-go-v2/service/s3/types" "github.com/fxamacker/cbor/v2" + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/collectors" + "github.com/prometheus/client_golang/prometheus/promhttp" "golang.org/x/sync/singleflight" ) @@ -237,6 +241,10 @@ type tileCachingHandler struct { s3Bucket string // The S3 bucket to use for caching tiles. Must not be empty. cacheGroup *singleflight.Group // The singleflight.Group to use for deduplicating simultaneous requests (a.k.a. "request collapsing") for tiles. Must not be nil. + + requestsMetric *prometheus.CounterVec + partialTiles prometheus.Counter + singleFlightShared prometheus.Counter } func (tch *tileCachingHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) { @@ -284,8 +292,9 @@ func (tch *tileCachingHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) // // When Trillian gets a request that is past the end of the log, it returns // 400 (for better or worse), so we emulate that here. + tch.requestsMetric.WithLabelValues("error", "ct_log_past_end").Inc() w.WriteHeader(http.StatusBadRequest) - w.Write([]byte("requested range is past the end of the log")) + fmt.Fprintln(w, "requested range is past the end of the log") return } else { contents.Entries = contents.Entries[prefixToRemove:] @@ -296,6 +305,12 @@ func (tch *tileCachingHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) contents.Entries = contents.Entries[:requestedLen] } + if w.Header().Get("X-Source") == "S3" { + tch.requestsMetric.WithLabelValues("success", "s3").Inc() + } else { + tch.requestsMetric.WithLabelValues("success", "ct_log").Inc() + } + w.Header().Set("X-Response-Len", fmt.Sprintf("%d", len(contents.Entries))) w.WriteHeader(http.StatusOK) @@ -325,11 +340,15 @@ func (tch *tileCachingHandler) getAndCacheTile(ctx context.Context, tile tile) ( source tileSource } - innerContents, err, _ := singleflightDo(tch.cacheGroup, dedupKey, func() (entriesAndSource, error) { + innerContents, err, shared := singleflightDo(tch.cacheGroup, dedupKey, func() (entriesAndSource, error) { contents, source, err := tch.getAndCacheTileUncollapsed(ctx, tile) return entriesAndSource{contents, source}, err }) + if shared { + tch.singleFlightShared.Inc() + } + // The value from our singleflightDo closure is always non-nil, so we don't // need an err != nil check here. return innerContents.entries, innerContents.source, err @@ -344,11 +363,13 @@ func (tch *tileCachingHandler) getAndCacheTileUncollapsed(ctx context.Context, t } if !errors.Is(err, noSuchKey{}) { + tch.requestsMetric.WithLabelValues("error", "s3_get").Inc() return nil, sourceS3, fmt.Errorf("error reading tile from s3: %w", err) } contents, err = getTileFromBackend(ctx, tile) if err != nil { + tch.requestsMetric.WithLabelValues("error", "ct_log_get").Inc() return nil, sourceCTLog, fmt.Errorf("error reading tile from backend: %w", err) } @@ -356,11 +377,13 @@ func (tch *tileCachingHandler) getAndCacheTileUncollapsed(ctx context.Context, t // tile isn't filled up yet. In that case, don't write to S3, but still return // results to the user. if tch.isPartialTile(contents) { + tch.partialTiles.Inc() return contents, sourceCTLog, nil } err = tch.writeToS3(ctx, tile, contents) if err != nil { + tch.requestsMetric.WithLabelValues("error", "s3_put").Inc() return nil, sourceCTLog, fmt.Errorf("error writing tile to S3: %w", err) } return contents, sourceCTLog, nil @@ -387,7 +410,8 @@ func main() { tileSize := flag.Int("tile-size", 0, "tile size. Must match the value used by the backend") s3bucket := flag.String("s3-bucket", "", "s3 bucket to use for caching") s3prefix := flag.String("s3-prefix", "", "prefix for s3 keys. defaults to value of -backend") - listenAddress := flag.String("listen-address", ":8080", "address to listen on") + listenAddress := flag.String("listen-address", ":7962", "address to listen on") + metricsAddress := flag.String("metrics-address", ":7963", "address to listen on") // fullRequestTimeout is the max allowed time the handler can read from S3 and return or read from S3, read from backend, write to S3, and return. fullRequestTimeout := flag.Duration("full-request-timeout", 4*time.Second, "max time to spend in the HTTP handler") @@ -420,13 +444,41 @@ func main() { } svc := s3.NewFromConfig(cfg) + promRegistry := newStatsRegistry(*metricsAddress) + + requestsMetric := prometheus.NewCounterVec( + prometheus.CounterOpts{ + Name: "requests", + Help: "total number of requests, by result and source", + }, + []string{"result", "source"}, + ) + promRegistry.MustRegister(requestsMetric) + + partialTiles := prometheus.NewCounter( + prometheus.CounterOpts{ + Name: "partial_tiles", + Help: "number of requests not cached due to partial tile returned from CT log", + }) + promRegistry.MustRegister(partialTiles) + + singleFlightShared := prometheus.NewCounter( + prometheus.CounterOpts{ + Name: "single_flight_shared", + Help: "how many inbound requests were coalesced into a single set of backend requests", + }) + promRegistry.MustRegister(singleFlightShared) + handler := &tileCachingHandler{ - logURL: *logURL, - tileSize: *tileSize, - s3Service: svc, - s3Prefix: *s3prefix, - s3Bucket: *s3bucket, - cacheGroup: &singleflight.Group{}, + logURL: *logURL, + tileSize: *tileSize, + s3Service: svc, + s3Prefix: *s3prefix, + s3Bucket: *s3bucket, + cacheGroup: &singleflight.Group{}, + requestsMetric: requestsMetric, + partialTiles: partialTiles, + singleFlightShared: singleFlightShared, } srv := http.Server{ @@ -440,3 +492,27 @@ func main() { log.Fatal(srv.ListenAndServe()) } + +func newStatsRegistry(listenAddress string) prometheus.Registerer { + registry := prometheus.NewRegistry() + registry.MustRegister(collectors.NewGoCollector()) + registry.MustRegister(collectors.NewProcessCollector( + collectors.ProcessCollectorOpts{})) + + server := http.Server{ + Addr: listenAddress, + ReadTimeout: 5 * time.Second, + WriteTimeout: 5 * time.Second, + IdleTimeout: 5 * time.Minute, + ReadHeaderTimeout: 2 * time.Second, + Handler: promhttp.HandlerFor(registry, promhttp.HandlerOpts{}), + } + go func() { + err := server.ListenAndServe() + if err != nil { + log.Printf("unable to start metrics server on %s: %s\n", listenAddress, err) + os.Exit(1) + } + }() + return registry +} From 7d632de3d12d672ef761620b642fccc2f57f4575 Mon Sep 17 00:00:00 2001 From: Jacob Hoffman-Andrews Date: Tue, 5 Sep 2023 12:11:36 -0700 Subject: [PATCH 2/5] Prefix metrics with ctile_ --- main.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/main.go b/main.go index 69d9925..3a49e1d 100644 --- a/main.go +++ b/main.go @@ -448,7 +448,7 @@ func main() { requestsMetric := prometheus.NewCounterVec( prometheus.CounterOpts{ - Name: "requests", + Name: "ctile_requests", Help: "total number of requests, by result and source", }, []string{"result", "source"}, @@ -457,14 +457,14 @@ func main() { partialTiles := prometheus.NewCounter( prometheus.CounterOpts{ - Name: "partial_tiles", + Name: "ctile_partial_tiles", Help: "number of requests not cached due to partial tile returned from CT log", }) promRegistry.MustRegister(partialTiles) singleFlightShared := prometheus.NewCounter( prometheus.CounterOpts{ - Name: "single_flight_shared", + Name: "ctile_single_flight_shared", Help: "how many inbound requests were coalesced into a single set of backend requests", }) promRegistry.MustRegister(singleFlightShared) From ad6d545edf4fe30359bf4c4bf46a186a76b5732d Mon Sep 17 00:00:00 2001 From: Jacob Hoffman-Andrews Date: Tue, 5 Sep 2023 12:23:37 -0700 Subject: [PATCH 3/5] Review feedback --- main.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/main.go b/main.go index 3a49e1d..8ca5b25 100644 --- a/main.go +++ b/main.go @@ -411,7 +411,7 @@ func main() { s3bucket := flag.String("s3-bucket", "", "s3 bucket to use for caching") s3prefix := flag.String("s3-prefix", "", "prefix for s3 keys. defaults to value of -backend") listenAddress := flag.String("listen-address", ":7962", "address to listen on") - metricsAddress := flag.String("metrics-address", ":7963", "address to listen on") + metricsAddress := flag.String("metrics-address", ":7963", "address to listen on for metrics") // fullRequestTimeout is the max allowed time the handler can read from S3 and return or read from S3, read from backend, write to S3, and return. fullRequestTimeout := flag.Duration("full-request-timeout", 4*time.Second, "max time to spend in the HTTP handler") @@ -465,7 +465,7 @@ func main() { singleFlightShared := prometheus.NewCounter( prometheus.CounterOpts{ Name: "ctile_single_flight_shared", - Help: "how many inbound requests were coalesced into a single set of backend requests", + Help: "number of inbound requests coalesced into a single set of backend requests", }) promRegistry.MustRegister(singleFlightShared) From 3137e87484bc13aaf133a986a808d098b00a7158 Mon Sep 17 00:00:00 2001 From: Jacob Hoffman-Andrews Date: Tue, 5 Sep 2023 12:33:02 -0700 Subject: [PATCH 4/5] fix test --- integration_test.go | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/integration_test.go b/integration_test.go index e070185..8d923f3 100644 --- a/integration_test.go +++ b/integration_test.go @@ -23,6 +23,7 @@ import ( "github.com/aws/aws-sdk-go-v2/config" "github.com/aws/aws-sdk-go-v2/credentials" "github.com/aws/aws-sdk-go-v2/service/s3" + "github.com/prometheus/client_golang/prometheus" "golang.org/x/sync/singleflight" ) @@ -149,6 +150,10 @@ func TestIntegration(t *testing.T) { s3Bucket: "bucket", cacheGroup: &singleflight.Group{}, + + requestsMetric: prometheus.NewCounterVec(prometheus.CounterOpts{}, []string{"result", "source"}), + partialTiles: prometheus.NewCounter(prometheus.CounterOpts{}), + singleFlightShared: prometheus.NewCounter(prometheus.CounterOpts{}), } // Invalid URL; should 404 From 3f95a40cb57d98619de66d1c3ef58f5c2b229bb1 Mon Sep 17 00:00:00 2001 From: Jacob Hoffman-Andrews Date: Tue, 5 Sep 2023 13:01:05 -0700 Subject: [PATCH 5/5] Apply suggestions from code review Co-authored-by: Phil Porada --- main.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/main.go b/main.go index 8ca5b25..4725a3f 100644 --- a/main.go +++ b/main.go @@ -306,9 +306,9 @@ func (tch *tileCachingHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) } if w.Header().Get("X-Source") == "S3" { - tch.requestsMetric.WithLabelValues("success", "s3").Inc() + tch.requestsMetric.WithLabelValues("success", "s3_get").Inc() } else { - tch.requestsMetric.WithLabelValues("success", "ct_log").Inc() + tch.requestsMetric.WithLabelValues("success", "ct_log_get").Inc() } w.Header().Set("X-Response-Len", fmt.Sprintf("%d", len(contents.Entries)))