Skip to content

Commit

Permalink
metrics: make sure metrics work well with sharding
Browse files Browse the repository at this point in the history
Refers to #2355
  • Loading branch information
p-se committed May 21, 2024
1 parent e51e937 commit 9807f35
Show file tree
Hide file tree
Showing 16 changed files with 134 additions and 52 deletions.
1 change: 1 addition & 0 deletions .github/workflows/e2e-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,7 @@ jobs:
# 2. Run tests for metrics
ginkgo e2e/metrics
SHARD=shard1 ginkgo e2e/metrics
# 3. Run tests requiring only the git server
e2e/testenv/infra/infra setup --git-server=true
Expand Down
1 change: 1 addition & 0 deletions .github/workflows/nightly-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,7 @@ jobs:
# 2. Run tests for metrics
ginkgo e2e/metrics
SHARD=shard1 ginkgo e2e/metrics
# 3. Run tests requiring only the git server
e2e/testenv/infra/infra setup --git-server=true
Expand Down
14 changes: 13 additions & 1 deletion charts/fleet/templates/service.yaml
Original file line number Diff line number Diff line change
@@ -1,8 +1,13 @@
{{- if .Values.metrics.enabled }}
{{ $shards := list "" }}
{{ if .Values.shards }}
{{ $shards = concat $shards .Values.shards | uniq }}
{{ end }}
{{ range $shards }}
apiVersion: v1
kind: Service
metadata:
name: monitoring-fleet-controller
name: "monitoring-fleet-controller{{if . }}-shard-{{ . }}{{end}}"
labels:
app: fleet-controller
spec:
Expand All @@ -14,4 +19,11 @@ spec:
name: metrics
selector:
app: fleet-controller
{{- if empty . }}
shard-default: "true"
{{- else }}
shard: "{{ . }}"
{{- end }}
---
{{- end }}
{{- end }}
27 changes: 18 additions & 9 deletions dev/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -389,9 +389,11 @@ helm upgrade --install --create-namespace -n cattle-system-monitoring \

That alone suffices to get a working monitoring setup for the upstream cluster.
But to connect it to the fleet-controller exported metrics, you need to add a
service monitor. The service monitor is currently not part of the Helm chart.
However, the necessary Kubernetes service resource is, unless you have disabled
monitoring in the Helm chart when installing fleet.
service monitor for each controller.

The service monitor is currently not part of the Helm chart. However, the
necessary Kubernetes service resource is, unless you have disabled monitoring in
the Helm chart when installing fleet.

```yaml
apiVersion: monitoring.coreos.com/v1
Expand All @@ -415,13 +417,20 @@ spec:
selector:
matchLabels:
app: fleet-controller

shard-default: true
# or:
# shard: <shard-id>
```

This configures Prometheus to scrape the metrics from the fleet-controller. By
accessing the Prometheus UI, you can now see the metrics from the
Fleet-Controller. They are all prefixed with `fleet_`, which you will need to
enter into the expression field of the Graph page to to get auto-completion.
This configures Prometheus to scrape the metrics from the (unsharded)
fleet-controller. Should you have sharding enabled, you will additionally need
to add one ServiceMonitor for each shard.

By accessing the Prometheus UI, you can now see the metrics from the
fleet-controller. They are all prefixed with `fleet_`, which you will need to
enter into the expression field of the Graph page to get auto-completion.
Alternatively you can use the metrics explorer, which is a button next to the
evaluation input field.

> **__NOTE:__** The Prometheus UI can be used to check the Prometheus
configuration (e.g., the scrape targets), scraped metrics, check alerts or
Expand All @@ -430,7 +439,7 @@ dashboards. It is not accessible by default and not meant to be shown to
casual users.

To access the Prometheus UI, you can forward the port of the Prometheus service
to your local machine and access it.
to your local machine and access it through that port.

```bash
kubectl port-forward -n cattle-system-monitoring \
Expand Down
6 changes: 6 additions & 0 deletions e2e/assets/clustergroup-template.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,12 @@ kind: ClusterGroup
metadata:
name: {{ .Name }}
namespace: {{ .Namespace }}
{{- if .Labels }}
labels:
{{- range $key, $value := .Labels }}
{{$key}}: {{$value}}
{{- end }}
{{- end }}
spec:
selector:
matchLabels:
Expand Down
5 changes: 4 additions & 1 deletion e2e/assets/gitrepo-template.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,10 @@ kind: GitRepo
apiVersion: fleet.cattle.io/v1alpha1
metadata:
name: {{ .Name }}
{{- if ne .Shard "" }}
labels:
fleet.cattle.io/shard: {{ .Shard }}
{{- end }}

spec:
repo: https://github.com/rancher/fleet-test-data
Expand All @@ -10,5 +14,4 @@ spec:
{{- range .Paths}}
- {{.}}
{{- end}}

targetNamespace: {{ .TargetNamespace }}
9 changes: 8 additions & 1 deletion e2e/assets/metrics/fleetcontroller_service.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,17 @@ apiVersion: v1
kind: Service
metadata:
name: {{ .Name }}
labels:
app: fleet-controller
env: test
spec:
selector:
app: fleet-controller
shard-default: "true"
{{- if .IsDefaultShard }}
shard-default: "{{ .IsDefaultShard }}"
{{ else }}
shard: {{ .Shard }}
{{- end }}
ports:
- protocol: TCP
port: {{ .Port }}
Expand Down
1 change: 1 addition & 0 deletions e2e/metrics/bundle_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,7 @@ var _ = Describe("Bundle Metrics", Label("bundle"), func() {
namespace,
gitRepoName,
branch,
shard,
"simple-manifest",
)
Expect(err).ToNot(HaveOccurred())
Expand Down
1 change: 1 addition & 0 deletions e2e/metrics/bundledeployment_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ var _ = Describe("BundleDeployment Metrics", Label("bundledeployment"), func() {
namespace,
objName,
branch,
shard,
"simple-manifest",
)
Expect(err).ToNot(HaveOccurred())
Expand Down
28 changes: 20 additions & 8 deletions e2e/metrics/cluster_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,9 @@ func expectMetrics(name string, expectedMetrics MetricsSelector, expectExist boo

var _ = Describe("Cluster Metrics", Label("cluster"), func() {
It("should have metrics for the local cluster resource", func() {
if shard != "" {
Skip("local cluster isn't handled by shards and hence cannot be tested using sharding")
}
eventuallyExpectMetrics("local", expectedMetrics, true)
})

Expand All @@ -99,14 +102,19 @@ var _ = Describe("Cluster Metrics", Label("cluster"), func() {
ns := "fleet-local"
kw := k.Namespace(ns)

labels := map[string]string{
"management.cattle.io/cluster-display-name": "testing",
"name": "testing",
}
if shard != "" {
labels["fleet.cattle.io/shard"] = shard
}

err := testenv.CreateCluster(
k,
ns,
name,
map[string]string{
"management.cattle.io/cluster-display-name": "testing",
"name": "testing",
},
labels,
map[string]string{
"clientID": "testing",
},
Expand Down Expand Up @@ -146,15 +154,19 @@ var _ = Describe("Cluster Metrics", Label("cluster"), func() {
name := "testing-deletion"
ns := "fleet-local"
kw := k.Namespace(ns)
labels := map[string]string{
"management.cattle.io/cluster-display-name": "testing",
"name": "testing",
}
if shard != "" {
labels["fleet.cattle.io/shard"] = shard
}

Expect(testenv.CreateCluster(
k,
ns,
name,
map[string]string{
"management.cattle.io/cluster-display-name": "testing",
"name": "testing",
},
labels,
nil,
)).ToNot(HaveOccurred())

Expand Down
5 changes: 5 additions & 0 deletions e2e/metrics/clustergroup_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,13 +45,18 @@ var _ = Describe("Cluster Metrics", Label("clustergroup"), func() {
"test-cluster-group",
rand.NewSource(time.Now().UnixNano()),
)
labels := make(map[string]string)
if shard != "" {
labels["fleet.cattle.io/shard"] = shard
}
err := testenv.CreateClusterGroup(
k,
namespace,
clusterGroupName,
map[string]string{
"name": "local",
},
labels,
)
Expect(err).ToNot(HaveOccurred())

Expand Down
2 changes: 1 addition & 1 deletion e2e/metrics/exporter.go
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ func (m *ExporterTest) FindOneMetric(
// check if name exists.
mf, ok := allMetrics[name]
if !ok {
return nil, fmt.Errorf("metric %q not found", name)
return nil, fmt.Errorf("metric %q not found for URL %v", name, m.url)
}

var metrics []*dto.Metric
Expand Down
11 changes: 5 additions & 6 deletions e2e/metrics/gitrepo_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,15 +14,13 @@ import (
)

var _ = Describe("GitRepo Metrics", Label("gitrepo"), func() {
const (
objName = "metrics"
branch = "master"
)

var (
// kw is the kubectl command for namespace the workload is deployed to
kw kubectl.Command
namespace string
objName = "metrics"
branch = "master"
)

BeforeEach(func() {
Expand All @@ -41,6 +39,7 @@ var _ = Describe("GitRepo Metrics", Label("gitrepo"), func() {
namespace,
objName,
branch,
shard,
"simple-manifest",
)
Expect(err).ToNot(HaveOccurred())
Expand Down Expand Up @@ -87,8 +86,8 @@ var _ = Describe("GitRepo Metrics", Label("gitrepo"), func() {
}).ShouldNot(HaveOccurred())
})

Context("when the GitRepo is changed", func() {
It("it should not duplicate metrics if GitRepo is updated", func() {
When("the GitRepo is changed", func() {
It("it should not duplicate metrics", func() {
out, err := kw.Patch(
"gitrepo", objName,
"--type=json",
Expand Down
53 changes: 33 additions & 20 deletions e2e/metrics/suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,23 +23,35 @@ func TestE2E(t *testing.T) {
var (
env *testenv.Env
// k is the kubectl command for the cluster registration namespace
k kubectl.Command
et metrics.ExporterTest
loadBalancerName string
k kubectl.Command
et metrics.ExporterTest
shard string
)

func setupLoadBalancer() (metricsURL string) {
type ServiceData struct {
Name string
Port int64
IsDefaultShard bool
Shard string
}

// setupLoadBalancer creates a load balancer service for the fleet controller.
// If shard is empty, it creates a service for the default (unsharded)
// controller.
func setupLoadBalancer(shard string) (metricsURL string) {
rs := rand.NewSource(time.Now().UnixNano())
port := rs.Int63()%1000 + 30000
loadBalancerName = testenv.AddRandomSuffix("fleetcontroller", rs)
loadBalancerName := testenv.AddRandomSuffix("fleetcontroller", rs)

ks := k.Namespace("cattle-fleet-system")
err := testenv.ApplyTemplate(
ks,
testenv.AssetPath("metrics/fleetcontroller_service.yaml"),
map[string]interface{}{
"Name": loadBalancerName,
"Port": port,
ServiceData{
Name: loadBalancerName,
Port: port,
IsDefaultShard: shard == "",
Shard: shard,
},
)
Expect(err).ToNot(HaveOccurred())
Expand All @@ -53,32 +65,33 @@ func setupLoadBalancer() (metricsURL string) {
return ip, err
}).ShouldNot(BeEmpty())

return
}
DeferCleanup(func() {
ks := k.Namespace("cattle-fleet-system")
_, _ = ks.Delete("service", loadBalancerName)
})

func tearDownLoadBalancer() {
ks := k.Namespace("cattle-fleet-system")
_, _ = ks.Delete("service", loadBalancerName)
return metricsURL
}

var _ = BeforeSuite(func() {
SetDefaultEventuallyTimeout(testenv.Timeout)
SetDefaultEventuallyPollingInterval(time.Second)
testenv.SetRoot("../..")

if os.Getenv("SHARD") != "" {
shard = os.Getenv("SHARD")
}

// Enable passing the metrics URL via environment solely for debugging
// purposes, e.g. when a fleetcontroller is run outside the cluster. This is
// not intended for regular use.
var metricsURL string
if os.Getenv("METRICS_URL") != "" {
metricsURL = os.Getenv("METRICS_URL")
} else {
metricsURL = setupLoadBalancer()
metricsURL = setupLoadBalancer(shard)
}
et = metrics.NewExporterTest(metricsURL)

env = testenv.New()
})

var _ = AfterSuite(func() {
if os.Getenv("METRICS_URL") == "" {
tearDownLoadBalancer()
}
})
2 changes: 1 addition & 1 deletion e2e/single-cluster/values_from_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ var _ = Describe("ValuesFrom", func() {
"--from-file=values.yaml="+testenv.AssetPath("single-cluster/values-from-configmap.yaml"))
Expect(err).ToNot(HaveOccurred(), out)

err = testenv.CreateGitRepo(k, namespace, "values-from", "master", "helm-values-from")
err = testenv.CreateGitRepo(k, namespace, "values-from", "master", "", "helm-values-from")
Expect(err).ToNot(HaveOccurred())

DeferCleanup(func() {
Expand Down
Loading

0 comments on commit 9807f35

Please sign in to comment.