From 8ede86afff8b54459d9855657775e5af2649c63b Mon Sep 17 00:00:00 2001 From: Patrick Seidensal Date: Tue, 7 May 2024 11:08:18 +0200 Subject: [PATCH] metrics: make sure metrics work well with sharding Refers to #2355 --- .github/workflows/e2e-ci.yml | 1 + .github/workflows/nightly-ci.yml | 1 + charts/fleet/templates/service.yaml | 16 +++++- dev/README.md | 21 +++++--- e2e/assets/clustergroup-template.yaml | 6 +++ e2e/assets/gitrepo-template.yaml | 5 +- .../metrics/fleetcontroller_service.yaml | 10 +++- e2e/metrics/bundle_test.go | 1 + e2e/metrics/bundledeployment_test.go | 1 + e2e/metrics/cluster_test.go | 29 +++++++--- e2e/metrics/clustergroup_test.go | 5 ++ e2e/metrics/exporter.go | 2 +- e2e/metrics/gitrepo_test.go | 11 ++-- e2e/metrics/suite_test.go | 53 ++++++++++++------- e2e/single-cluster/values_from_test.go | 2 +- e2e/testenv/template.go | 20 +++++-- 16 files changed, 133 insertions(+), 51 deletions(-) diff --git a/.github/workflows/e2e-ci.yml b/.github/workflows/e2e-ci.yml index a7ceb39652..13ba72e69f 100644 --- a/.github/workflows/e2e-ci.yml +++ b/.github/workflows/e2e-ci.yml @@ -108,6 +108,7 @@ jobs: # 2. Run tests for metrics ginkgo e2e/metrics + SHARD=shard1 ginkgo e2e/metrics # 3. Run tests requiring only the git server e2e/testenv/infra/infra setup --git-server=true diff --git a/.github/workflows/nightly-ci.yml b/.github/workflows/nightly-ci.yml index eb282d2b1e..92f243bc84 100644 --- a/.github/workflows/nightly-ci.yml +++ b/.github/workflows/nightly-ci.yml @@ -111,6 +111,7 @@ jobs: # 2. Run tests for metrics ginkgo e2e/metrics + SHARD=shard1 ginkgo e2e/metrics # 3. Run tests requiring only the git server e2e/testenv/infra/infra setup --git-server=true diff --git a/charts/fleet/templates/service.yaml b/charts/fleet/templates/service.yaml index cc17e9b285..6451e8eab4 100644 --- a/charts/fleet/templates/service.yaml +++ b/charts/fleet/templates/service.yaml @@ -1,8 +1,13 @@ -{{- if .Values.metrics.enabled }} +{{ $shards := list "" }} +{{ if .Values.shards }} +{{ $shards = concat $shards .Values.shards | uniq }} +{{ end }} +{{ range $shards }} +{{- if $.Values.metrics.enabled }} apiVersion: v1 kind: Service metadata: - name: monitoring-fleet-controller + name: "monitoring-fleet-controller{{if . }}-shard-{{ . }}{{end}}" labels: app: fleet-controller spec: @@ -14,4 +19,11 @@ spec: name: metrics selector: app: fleet-controller + {{- if empty . }} + shard-default: "true" + {{- else }} + shard: "{{ . }}" + {{- end }} +{{- end }} +--- {{- end }} diff --git a/dev/README.md b/dev/README.md index eb4f7874cd..a0330428f1 100644 --- a/dev/README.md +++ b/dev/README.md @@ -389,9 +389,11 @@ helm upgrade --install --create-namespace -n cattle-system-monitoring \ That alone suffices to get a working monitoring setup for the upstream cluster. But to connect it to the fleet-controller exported metrics, you need to add a -service monitor. The service monitor is currently not part of the Helm chart. -However, the necessary Kubernetes service resource is, unless you have disabled -monitoring in the Helm chart when installing fleet. +service monitor for each controller. + +The service monitor is currently not part of the Helm chart. However, the +necessary Kubernetes service resource is, unless you have disabled monitoring in +the Helm chart when installing fleet. ```yaml apiVersion: monitoring.coreos.com/v1 @@ -415,12 +417,17 @@ spec: selector: matchLabels: app: fleet-controller - + shard-default: true + # or: + # shard: ``` -This configures Prometheus to scrape the metrics from the fleet-controller. By -accessing the Prometheus UI, you can now see the metrics from the -Fleet-Controller. They are all prefixed with `fleet_`, which you will need to +This configures Prometheus to scrape the metrics from the (unshared) +fleet-controller. Should you have sharding enabled, you will additionally need +to add one ServiceMonitor for each shard. + +By accessing the Prometheus UI, you can now see the metrics from the +fleet-controller. They are all prefixed with `fleet_`, which you will need to enter into the expression field of the Graph page to to get auto-completion. > **__NOTE:__** The Prometheus UI can be used to check the Prometheus diff --git a/e2e/assets/clustergroup-template.yaml b/e2e/assets/clustergroup-template.yaml index 7c8f319478..d81e83400a 100644 --- a/e2e/assets/clustergroup-template.yaml +++ b/e2e/assets/clustergroup-template.yaml @@ -3,6 +3,12 @@ kind: ClusterGroup metadata: name: {{ .Name }} namespace: {{ .Namespace }} + {{- if .Labels }} + labels: + {{- range $key, $value := .Labels }} + {{$key}}: {{$value}} + {{- end }} + {{- end }} spec: selector: matchLabels: diff --git a/e2e/assets/gitrepo-template.yaml b/e2e/assets/gitrepo-template.yaml index 8f4f331577..579cc5f0d5 100644 --- a/e2e/assets/gitrepo-template.yaml +++ b/e2e/assets/gitrepo-template.yaml @@ -2,6 +2,10 @@ kind: GitRepo apiVersion: fleet.cattle.io/v1alpha1 metadata: name: {{ .Name }} + {{- if ne .Shard "" }} + labels: + fleet.cattle.io/shard: {{ .Shard }} + {{- end }} spec: repo: https://github.com/rancher/fleet-test-data @@ -10,5 +14,4 @@ spec: {{- range .Paths}} - {{.}} {{- end}} - targetNamespace: {{ .TargetNamespace }} diff --git a/e2e/assets/metrics/fleetcontroller_service.yaml b/e2e/assets/metrics/fleetcontroller_service.yaml index dea6ffeecd..7ae6583838 100644 --- a/e2e/assets/metrics/fleetcontroller_service.yaml +++ b/e2e/assets/metrics/fleetcontroller_service.yaml @@ -2,10 +2,18 @@ apiVersion: v1 kind: Service metadata: name: {{ .Name }} + labels: + app: fleet-controller + env: test spec: selector: app: fleet-controller - shard-default: "true" + {{- if .IsDefaultShard }} + shard-default: "{{ .IsDefaultShard }}" + {{- end }} + {{- if ne .Shard "" }} + shard: {{ .Shard }} + {{- end }} ports: - protocol: TCP port: {{ .Port }} diff --git a/e2e/metrics/bundle_test.go b/e2e/metrics/bundle_test.go index 5c3736c80a..d93e4d291d 100644 --- a/e2e/metrics/bundle_test.go +++ b/e2e/metrics/bundle_test.go @@ -147,6 +147,7 @@ var _ = Describe("Bundle Metrics", Label("bundle"), func() { namespace, gitRepoName, branch, + shard, "simple-manifest", ) Expect(err).ToNot(HaveOccurred()) diff --git a/e2e/metrics/bundledeployment_test.go b/e2e/metrics/bundledeployment_test.go index 2521a6ef7a..ff46619e6d 100644 --- a/e2e/metrics/bundledeployment_test.go +++ b/e2e/metrics/bundledeployment_test.go @@ -39,6 +39,7 @@ var _ = Describe("BundleDeployment Metrics", Label("bundledeployment"), func() { namespace, objName, branch, + shard, "simple-manifest", ) Expect(err).ToNot(HaveOccurred()) diff --git a/e2e/metrics/cluster_test.go b/e2e/metrics/cluster_test.go index 29f0389e6a..70725dfdb1 100644 --- a/e2e/metrics/cluster_test.go +++ b/e2e/metrics/cluster_test.go @@ -2,6 +2,7 @@ package metrics_test import ( "fmt" + "os" . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" @@ -87,6 +88,9 @@ func expectMetrics(name string, expectedMetrics MetricsSelector, expectExist boo var _ = Describe("Cluster Metrics", Label("cluster"), func() { It("should have metrics for the local cluster resource", func() { + if os.Getenv("SHARD") != "" { + Skip("local cluster isn't handled by shards and hence cannot be tested using sharding") + } eventuallyExpectMetrics("local", expectedMetrics, true) }) @@ -99,14 +103,19 @@ var _ = Describe("Cluster Metrics", Label("cluster"), func() { ns := "fleet-local" kw := k.Namespace(ns) + labels := map[string]string{ + "management.cattle.io/cluster-display-name": "testing", + "name": "testing", + } + if shard != "" { + labels["fleet.cattle.io/shard"] = shard + } + err := testenv.CreateCluster( k, ns, name, - map[string]string{ - "management.cattle.io/cluster-display-name": "testing", - "name": "testing", - }, + labels, map[string]string{ "clientID": "testing", }, @@ -146,15 +155,19 @@ var _ = Describe("Cluster Metrics", Label("cluster"), func() { name := "testing-deletion" ns := "fleet-local" kw := k.Namespace(ns) + labels := map[string]string{ + "management.cattle.io/cluster-display-name": "testing", + "name": "testing", + } + if shard != "" { + labels["fleet.cattle.io/shard"] = shard + } Expect(testenv.CreateCluster( k, ns, name, - map[string]string{ - "management.cattle.io/cluster-display-name": "testing", - "name": "testing", - }, + labels, nil, )).ToNot(HaveOccurred()) diff --git a/e2e/metrics/clustergroup_test.go b/e2e/metrics/clustergroup_test.go index 0c0a31a1f4..9035da9683 100644 --- a/e2e/metrics/clustergroup_test.go +++ b/e2e/metrics/clustergroup_test.go @@ -45,6 +45,10 @@ var _ = Describe("Cluster Metrics", Label("clustergroup"), func() { "test-cluster-group", rand.NewSource(time.Now().UnixNano()), ) + labels := make(map[string]string) + if shard != "" { + labels["fleet.cattle.io/shard"] = shard + } err := testenv.CreateClusterGroup( k, namespace, @@ -52,6 +56,7 @@ var _ = Describe("Cluster Metrics", Label("clustergroup"), func() { map[string]string{ "name": "local", }, + labels, ) Expect(err).ToNot(HaveOccurred()) diff --git a/e2e/metrics/exporter.go b/e2e/metrics/exporter.go index 9d1af3047d..5759c3efe1 100644 --- a/e2e/metrics/exporter.go +++ b/e2e/metrics/exporter.go @@ -56,7 +56,7 @@ func (m *ExporterTest) FindOneMetric( // check if name exists. mf, ok := allMetrics[name] if !ok { - return nil, fmt.Errorf("metric %q not found", name) + return nil, fmt.Errorf("metric %q not found for URL %v", name, m.url) } var metrics []*dto.Metric diff --git a/e2e/metrics/gitrepo_test.go b/e2e/metrics/gitrepo_test.go index 63470bd33d..5c76a3443d 100644 --- a/e2e/metrics/gitrepo_test.go +++ b/e2e/metrics/gitrepo_test.go @@ -14,15 +14,13 @@ import ( ) var _ = Describe("GitRepo Metrics", Label("gitrepo"), func() { - const ( - objName = "metrics" - branch = "master" - ) var ( // kw is the kubectl command for namespace the workload is deployed to kw kubectl.Command namespace string + objName = "metrics" + branch = "master" ) BeforeEach(func() { @@ -41,6 +39,7 @@ var _ = Describe("GitRepo Metrics", Label("gitrepo"), func() { namespace, objName, branch, + shard, "simple-manifest", ) Expect(err).ToNot(HaveOccurred()) @@ -87,8 +86,8 @@ var _ = Describe("GitRepo Metrics", Label("gitrepo"), func() { }).ShouldNot(HaveOccurred()) }) - Context("when the GitRepo is changed", func() { - It("it should not duplicate metrics if GitRepo is updated", func() { + When("the GitRepo is changed", func() { + It("it should not duplicate metrics", func() { out, err := kw.Patch( "gitrepo", objName, "--type=json", diff --git a/e2e/metrics/suite_test.go b/e2e/metrics/suite_test.go index 1f51588982..8b67d7880d 100644 --- a/e2e/metrics/suite_test.go +++ b/e2e/metrics/suite_test.go @@ -23,23 +23,35 @@ func TestE2E(t *testing.T) { var ( env *testenv.Env // k is the kubectl command for the cluster registration namespace - k kubectl.Command - et metrics.ExporterTest - loadBalancerName string + k kubectl.Command + et metrics.ExporterTest + shard string ) -func setupLoadBalancer() (metricsURL string) { +type ServiceData struct { + Name string + Port int64 + IsDefaultShard bool + Shard string +} + +// setupLoadBalancer creates a load balancer service for the fleet controller. +// If shard is empty, it creates a service for the default (unsharded) +// controller. +func setupLoadBalancer(shard string) (metricsURL string) { rs := rand.NewSource(time.Now().UnixNano()) port := rs.Int63()%1000 + 30000 - loadBalancerName = testenv.AddRandomSuffix("fleetcontroller", rs) + loadBalancerName := testenv.AddRandomSuffix("fleetcontroller", rs) ks := k.Namespace("cattle-fleet-system") err := testenv.ApplyTemplate( ks, testenv.AssetPath("metrics/fleetcontroller_service.yaml"), - map[string]interface{}{ - "Name": loadBalancerName, - "Port": port, + ServiceData{ + Name: loadBalancerName, + Port: port, + IsDefaultShard: shard == "", + Shard: shard, }, ) Expect(err).ToNot(HaveOccurred()) @@ -53,12 +65,12 @@ func setupLoadBalancer() (metricsURL string) { return ip, err }).ShouldNot(BeEmpty()) - return -} + DeferCleanup(func() { + ks := k.Namespace("cattle-fleet-system") + _, _ = ks.Delete("service", loadBalancerName) + }) -func tearDownLoadBalancer() { - ks := k.Namespace("cattle-fleet-system") - _, _ = ks.Delete("service", loadBalancerName) + return metricsURL } var _ = BeforeSuite(func() { @@ -66,19 +78,20 @@ var _ = BeforeSuite(func() { SetDefaultEventuallyPollingInterval(time.Second) testenv.SetRoot("../..") + if os.Getenv("SHARD") != "" { + shard = os.Getenv("SHARD") + } + + // Enable passing the metrics URL via environment solely for debugging + // purposes, e.g. when a fleetcontroller is run outside the cluster. This is + // not intended for regular use. var metricsURL string if os.Getenv("METRICS_URL") != "" { metricsURL = os.Getenv("METRICS_URL") } else { - metricsURL = setupLoadBalancer() + metricsURL = setupLoadBalancer(shard) } et = metrics.NewExporterTest(metricsURL) env = testenv.New() }) - -var _ = AfterSuite(func() { - if os.Getenv("METRICS_URL") == "" { - tearDownLoadBalancer() - } -}) diff --git a/e2e/single-cluster/values_from_test.go b/e2e/single-cluster/values_from_test.go index e2506372e7..0bed024e9e 100644 --- a/e2e/single-cluster/values_from_test.go +++ b/e2e/single-cluster/values_from_test.go @@ -36,7 +36,7 @@ var _ = Describe("ValuesFrom", func() { "--from-file=values.yaml="+testenv.AssetPath("single-cluster/values-from-configmap.yaml")) Expect(err).ToNot(HaveOccurred(), out) - err = testenv.CreateGitRepo(k, namespace, "values-from", "master", "helm-values-from") + err = testenv.CreateGitRepo(k, namespace, "values-from", "master", "", "helm-values-from") Expect(err).ToNot(HaveOccurred()) DeferCleanup(func() { diff --git a/e2e/testenv/template.go b/e2e/testenv/template.go index 88b8288723..240f7f1f11 100644 --- a/e2e/testenv/template.go +++ b/e2e/testenv/template.go @@ -19,21 +19,31 @@ const clusterTemplate = "cluster-template.yaml" const clustergroupTemplate = "clustergroup-template.yaml" // GitRepoData can be used with the gitrepo-template.yaml asset when no custom -// GitRepo properties are required. All fields are required. +// GitRepo properties are required. All fields except Shard are required. type GitRepoData struct { Name string Branch string Paths []string TargetNamespace string + Shard string } -// CreateGitRepo uses the template to create a gitrepo resource. The namespace is the TargetNamespace for the workloads. -func CreateGitRepo(k kubectl.Command, namespace string, name string, branch string, paths ...string) error { +// CreateGitRepo uses the template to create a gitrepo resource. The namespace +// is the TargetNamespace for the workloads. +func CreateGitRepo( + k kubectl.Command, + namespace string, + name string, + branch string, + shard string, + paths ...string, +) error { return ApplyTemplate(k, AssetPath(gitrepoTemplate), GitRepoData{ TargetNamespace: namespace, Name: name, Branch: branch, Paths: paths, + Shard: shard, }) } @@ -57,12 +67,14 @@ func CreateClusterGroup( k kubectl.Command, namespace, name string, + matchLabels map[string]string, labels map[string]string, ) error { return ApplyTemplate(k, AssetPath(clustergroupTemplate), map[string]interface{}{ "Name": name, "Namespace": namespace, - "MatchLabels": labels, + "MatchLabels": matchLabels, + "Labels": labels, }) }