From 137fd72ab65257b37d26f6b6d3de837b8056acec Mon Sep 17 00:00:00 2001 From: Haibing Zhou Date: Wed, 10 Jan 2024 10:21:08 -0800 Subject: [PATCH 1/2] webhook: add options to disable resource_namespace tag in metrics To add some context, historically, `resource_name` was removed from this tag list due to its high potential of causing high metrics cardinality. See [knative/pkg#1464][1] for more information. While that's great, but it might not be sufficient for large scale use cases where namespaces can be super dynamic (with generateName, too) or grows fase enough. There is an issue report from [tektoncd/pipeline#3171][2] which talks about this. This proposal makes it possible to disable `resource_namespace` tag via an option function. The default behavior is not changed, so no user impact if any of existing users rely on this tag. There is no API contract change either due to the beauty of variadic functions. Now downstream projects can consume this by override `StatsReporter` in webhook context options with their own preference. As a caveat here, if downstream project does choose to override `StatsReporter`, the default `ReportMetrics` function shouldn't be called by default as they may now have a different set of tag keys to report. As such, this function is only called if the default `StatsReporter` is used. [1]: https://github.com/knative/pkg/pull/1464 [2]: https://github.com/tektoncd/pipeline/issues/3171 --- webhook/stats_reporter.go | 164 +++++++++++++++++++++++++-------- webhook/stats_reporter_test.go | 49 +++++++++- 2 files changed, 173 insertions(+), 40 deletions(-) diff --git a/webhook/stats_reporter.go b/webhook/stats_reporter.go index 1fe30e8af1..3f05d4a91c 100644 --- a/webhook/stats_reporter.go +++ b/webhook/stats_reporter.go @@ -26,6 +26,7 @@ import ( "go.opencensus.io/tag" admissionv1 "k8s.io/api/admission/v1" apixv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1" + "k8s.io/apimachinery/pkg/util/sets" "knative.dev/pkg/metrics" ) @@ -65,19 +66,83 @@ var ( resultCodeKey = tag.MustNewKey("result_code") ) +type admissionToValue func(*admissionv1.AdmissionRequest, *admissionv1.AdmissionResponse) string +type conversionToValue func(*apixv1.ConversionRequest, *apixv1.ConversionResponse) string + +var ( + allAdmissionTags = map[tag.Key]admissionToValue{ + requestOperationKey: func(req *admissionv1.AdmissionRequest, _ *admissionv1.AdmissionResponse) string { + return string(req.Operation) + }, + kindGroupKey: func(req *admissionv1.AdmissionRequest, _ *admissionv1.AdmissionResponse) string { + return req.Kind.Group + }, + kindVersionKey: func(req *admissionv1.AdmissionRequest, _ *admissionv1.AdmissionResponse) string { + return req.Kind.Version + }, + kindKindKey: func(req *admissionv1.AdmissionRequest, _ *admissionv1.AdmissionResponse) string { + return req.Kind.Kind + }, + resourceGroupKey: func(req *admissionv1.AdmissionRequest, _ *admissionv1.AdmissionResponse) string { + return req.Resource.Group + }, + resourceVersionKey: func(req *admissionv1.AdmissionRequest, _ *admissionv1.AdmissionResponse) string { + return req.Resource.Version + }, + resourceResourceKey: func(req *admissionv1.AdmissionRequest, _ *admissionv1.AdmissionResponse) string { + return req.Resource.Resource + }, + resourceNamespaceKey: func(req *admissionv1.AdmissionRequest, _ *admissionv1.AdmissionResponse) string { + return req.Namespace + }, + admissionAllowedKey: func(_ *admissionv1.AdmissionRequest, resp *admissionv1.AdmissionResponse) string { + return strconv.FormatBool(resp.Allowed) + }, + } + allConversionTags = map[tag.Key]conversionToValue{ + desiredAPIVersionKey: func(req *apixv1.ConversionRequest, _ *apixv1.ConversionResponse) string { + return req.DesiredAPIVersion + }, + resultStatusKey: func(_ *apixv1.ConversionRequest, resp *apixv1.ConversionResponse) string { + return resp.Result.Status + }, + resultReasonKey: func(_ *apixv1.ConversionRequest, resp *apixv1.ConversionResponse) string { + return string(resp.Result.Reason) + }, + resultCodeKey: func(_ *apixv1.ConversionRequest, resp *apixv1.ConversionResponse) string { + return strconv.Itoa(int(resp.Result.Code)) + }, + } +) + // StatsReporter reports webhook metrics type StatsReporter interface { ReportAdmissionRequest(request *admissionv1.AdmissionRequest, response *admissionv1.AdmissionResponse, d time.Duration) error ReportConversionRequest(request *apixv1.ConversionRequest, response *apixv1.ConversionResponse, d time.Duration) error } +type statsReporterOptions struct { + tagsToExclude sets.Set[string] +} + +type StatsReporterOption func(_ *statsReporterOptions) + +func WithoutTags(tags ...string) StatsReporterOption { + return func(opts *statsReporterOptions) { + opts.tagsToExclude.Insert(tags...) + } +} + // reporter implements StatsReporter interface type reporter struct { ctx context.Context + + admissionTags map[tag.Key]admissionToValue + conversionTags map[tag.Key]conversionToValue } // NewStatsReporter creates a reporter for webhook metrics -func NewStatsReporter() (StatsReporter, error) { +func NewStatsReporter(opts ...StatsReporterOption) (StatsReporter, error) { ctx, err := tag.New( context.Background(), ) @@ -85,23 +150,44 @@ func NewStatsReporter() (StatsReporter, error) { return nil, err } - return &reporter{ctx: ctx}, nil + options := statsReporterOptions{ + tagsToExclude: sets.New[string](), + } + for _, opt := range opts { + opt(&options) + } + + admissionTags := make(map[tag.Key]admissionToValue) + for key, f := range allAdmissionTags { + if options.tagsToExclude.Has(key.Name()) { + continue + } + admissionTags[key] = f + } + conversionTags := make(map[tag.Key]conversionToValue) + for key, f := range allConversionTags { + if options.tagsToExclude.Has(key.Name()) { + continue + } + conversionTags[key] = f + } + + return &reporter{ + ctx: ctx, + admissionTags: admissionTags, + conversionTags: conversionTags, + }, nil } // Captures req count metric, recording the count and the duration func (r *reporter) ReportAdmissionRequest(req *admissionv1.AdmissionRequest, resp *admissionv1.AdmissionResponse, d time.Duration) error { - ctx, err := tag.New( - r.ctx, - tag.Insert(requestOperationKey, string(req.Operation)), - tag.Insert(kindGroupKey, req.Kind.Group), - tag.Insert(kindVersionKey, req.Kind.Version), - tag.Insert(kindKindKey, req.Kind.Kind), - tag.Insert(resourceGroupKey, req.Resource.Group), - tag.Insert(resourceVersionKey, req.Resource.Version), - tag.Insert(resourceResourceKey, req.Resource.Resource), - tag.Insert(resourceNamespaceKey, req.Namespace), - tag.Insert(admissionAllowedKey, strconv.FormatBool(resp.Allowed)), - ) + mutators := make([]tag.Mutator, 0, len(r.admissionTags)) + + for key, f := range r.admissionTags { + mutators = append(mutators, tag.Insert(key, f(req, resp))) + } + + ctx, err := tag.New(r.ctx, mutators...) if err != nil { return err } @@ -114,13 +200,13 @@ func (r *reporter) ReportAdmissionRequest(req *admissionv1.AdmissionRequest, res // Captures req count metric, recording the count and the duration func (r *reporter) ReportConversionRequest(req *apixv1.ConversionRequest, resp *apixv1.ConversionResponse, d time.Duration) error { - ctx, err := tag.New( - r.ctx, - tag.Insert(desiredAPIVersionKey, req.DesiredAPIVersion), - tag.Insert(resultStatusKey, resp.Result.Status), - tag.Insert(resultReasonKey, string(resp.Result.Reason)), - tag.Insert(resultCodeKey, strconv.Itoa(int(resp.Result.Code))), - ) + mutators := make([]tag.Mutator, 0, len(r.conversionTags)) + + for key, f := range r.conversionTags { + mutators = append(mutators, tag.Insert(key, f(req, resp))) + } + + ctx, err := tag.New(r.ctx, mutators...) if err != nil { return err } @@ -131,21 +217,27 @@ func (r *reporter) ReportConversionRequest(req *apixv1.ConversionRequest, resp * return nil } -func RegisterMetrics() { - tagKeys := []tag.Key{ - requestOperationKey, - kindGroupKey, - kindVersionKey, - kindKindKey, - resourceGroupKey, - resourceVersionKey, - resourceResourceKey, - resourceNamespaceKey, - admissionAllowedKey, - desiredAPIVersionKey, - resultStatusKey, - resultReasonKey, - resultCodeKey} +func RegisterMetrics(opts ...StatsReporterOption) { + options := statsReporterOptions{ + tagsToExclude: sets.New[string](), + } + for _, opt := range opts { + opt(&options) + } + + tagKeys := []tag.Key{} + for tag := range allAdmissionTags { + if options.tagsToExclude.Has(tag.Name()) { + continue + } + tagKeys = append(tagKeys, tag) + } + for tag := range allConversionTags { + if options.tagsToExclude.Has(tag.Name()) { + continue + } + tagKeys = append(tagKeys, tag) + } if err := view.Register( &view.View{ diff --git a/webhook/stats_reporter_test.go b/webhook/stats_reporter_test.go index e3911b4d4a..84d63a3707 100644 --- a/webhook/stats_reporter_test.go +++ b/webhook/stats_reporter_test.go @@ -70,6 +70,47 @@ func TestWebhookStatsReporterAdmission(t *testing.T) { metricstest.CheckDistributionData(t, requestLatenciesName, expectedTags, 2, shortTime, longTime) } +func TestWebhookStatsReporterAdmissionWithoutNamespaceTag(t *testing.T) { + setup(WithoutTags(resourceNamespaceKey.Name())) + req := &admissionv1.AdmissionRequest{ + UID: "705ab4f5-6393-11e8-b7cc-42010a800002", + Kind: metav1.GroupVersionKind{Group: "autoscaling", Version: "v1", Kind: "Scale"}, + Resource: metav1.GroupVersionResource{Group: "apps", Version: "v1", Resource: "deployments"}, + Name: "my-deployment", + Namespace: "my-namespace", + Operation: admissionv1.Update, + } + + resp := &admissionv1.AdmissionResponse{ + UID: req.UID, + Allowed: true, + } + + r, _ := NewStatsReporter(WithoutTags(resourceNamespaceKey.Name())) + + shortTime, longTime := 1100.0, 9100.0 + expectedTags := map[string]string{ + requestOperationKey.Name(): string(req.Operation), + kindGroupKey.Name(): req.Kind.Group, + kindVersionKey.Name(): req.Kind.Version, + kindKindKey.Name(): req.Kind.Kind, + resourceGroupKey.Name(): req.Resource.Group, + resourceVersionKey.Name(): req.Resource.Version, + resourceResourceKey.Name(): req.Resource.Resource, + admissionAllowedKey.Name(): strconv.FormatBool(resp.Allowed), + } + + if err := r.ReportAdmissionRequest(req, resp, time.Duration(shortTime)*time.Millisecond); err != nil { + t.Fatalf("ReportAdmissionRequest() = %v", err) + } + if err := r.ReportAdmissionRequest(req, resp, time.Duration(longTime)*time.Millisecond); err != nil { + t.Fatalf("ReportAdmissionRequest() = %v", err) + } + + metricstest.CheckCountData(t, requestCountName, expectedTags, 2) + metricstest.CheckDistributionData(t, requestLatenciesName, expectedTags, 2, shortTime, longTime) +} + func TestWebhookStatsReporterConversion(t *testing.T) { setup() req := &apixv1.ConversionRequest{ @@ -103,12 +144,12 @@ func TestWebhookStatsReporterConversion(t *testing.T) { metricstest.CheckDistributionData(t, requestLatenciesName, expectedTags, 2, shortTime, longTime) } -func setup() { - resetMetrics() +func setup(opts ...StatsReporterOption) { + resetMetrics(opts...) } // opencensus metrics carry global state that need to be reset between unit tests -func resetMetrics() { +func resetMetrics(opts ...StatsReporterOption) { metricstest.Unregister(requestCountName, requestLatenciesName) - RegisterMetrics() + RegisterMetrics(opts...) } From 46a2e067a4e67095ef121c7975db0180806cc647 Mon Sep 17 00:00:00 2001 From: zhouhaibing089 Date: Sun, 31 Mar 2024 16:44:43 -0700 Subject: [PATCH 2/2] webhook: add StatsReporterOptions in webhook.Options There are two ways to customize StatsReporter: 1. Use a whole new StatsReporter implementation. 1. Or pass Option funcs to customize the default StatsReporter. Option 1 is less practical at this time due to the metrics registration conflict. `webhook.RegisterMetrics()` is called regardless which StatsReporter implementation is used (which is a problem by itself). The second option is more practical since it works well without dealing with metrics conflicts. The `webhook.Option` in particular allows people to discard certain metrics tags. --- injection/sharedmain/main.go | 7 ++++++- webhook/webhook.go | 5 ++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/injection/sharedmain/main.go b/injection/sharedmain/main.go index 25562e965e..79c42c2014 100644 --- a/injection/sharedmain/main.go +++ b/injection/sharedmain/main.go @@ -290,7 +290,12 @@ func MainWithConfig(ctx context.Context, component string, cfg *rest.Config, cto var wh *webhook.Webhook if len(webhooks) > 0 { // Register webhook metrics - webhook.RegisterMetrics() + opts := webhook.GetOptions(ctx) + if opts != nil { + webhook.RegisterMetrics(opts.StatsReporterOptions...) + } else { + webhook.RegisterMetrics() + } wh, err = webhook.New(ctx, webhooks) if err != nil { diff --git a/webhook/webhook.go b/webhook/webhook.go index eff693e80d..d8842df35a 100644 --- a/webhook/webhook.go +++ b/webhook/webhook.go @@ -70,6 +70,9 @@ type Options struct { // only a single port for the service. Port int + // StatsReporterOptions are the options used to initialize the default StatsReporter + StatsReporterOptions []StatsReporterOption + // StatsReporter reports metrics about the webhook. // This will be automatically initialized by the constructor if left uninitialized. StatsReporter StatsReporter @@ -144,7 +147,7 @@ func New( logger := logging.FromContext(ctx) if opts.StatsReporter == nil { - reporter, err := NewStatsReporter() + reporter, err := NewStatsReporter(opts.StatsReporterOptions...) if err != nil { return nil, err }