diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 8376f748c..b3930e9fe 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -14,6 +14,7 @@ on:
 
 env:
   GO_VERSION: '1.24.9'
+  CERT_MANAGER_VERSION: 'v1.16.2'
 
 jobs:
   detect-noop:
@@ -125,6 +126,7 @@ jobs:
           PROPERTY_PROVIDER: 'azure'
           RESOURCE_SNAPSHOT_CREATION_MINIMUM_INTERVAL: ${{ matrix.resource-snapshot-creation-minimum-interval }}
           RESOURCE_CHANGES_COLLECTION_DURATION: ${{ matrix.resource-changes-collection-duration }}
+          CERT_MANAGER_VERSION: ${{ env.CERT_MANAGER_VERSION }}
 
       - name: Collect logs
         if: always()
diff --git a/charts/hub-agent/README.md b/charts/hub-agent/README.md
index 3f44d29a4..5b1710d5f 100644
--- a/charts/hub-agent/README.md
+++ b/charts/hub-agent/README.md
@@ -2,11 +2,40 @@
 
 ## Install Chart
 
+### Default Installation (Self-Signed Certificates)
+
 ```console
 # Helm install with fleet-system namespace already created
 helm install hub-agent ./charts/hub-agent/
 ```
 
+### Installation with cert-manager
+
+To use cert-manager for webhook certificate management, first install cert-manager as a prerequisite:
+
+```console
+# Install cert-manager
+helm repo add jetstack https://charts.jetstack.io
+helm repo update
+helm install cert-manager jetstack/cert-manager \
+  --namespace cert-manager \
+  --create-namespace \
+  --version v1.16.2 \
+  --set crds.enabled=true
+
+# Then install hub-agent with cert-manager enabled
+helm install hub-agent ./charts/hub-agent --set useCertManager=true
+```
+
+This configures cert-manager to manage webhook certificates.
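+
+After installation, you can sanity-check that the webhook certificate has been issued (the resource names below are this chart's defaults):
+
+```console
+kubectl get certificate fleet-webhook-server-cert -n fleet-system
+kubectl get secret fleet-webhook-server-cert -n fleet-system
+```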
+
 ## Upgrade Chart
 
 ```console
@@ -32,6 +61,11 @@ _See [helm install](https://helm.sh/docs/helm/helm_install/) for command documen
 | `affinity` | Node affinity for hub-agent pods | `{}` |
 | `tolerations` | Tolerations for hub-agent pods | `[]` |
 | `logVerbosity` | Log level (klog V logs) | `5` |
+| `enableWebhook` | Enable webhook server | `true` |
+| `webhookServiceName` | Webhook service name | `fleetwebhook` |
+| `enableGuardRail` | Enable guard rail webhook configurations | `true` |
+| `webhookClientConnectionType` | Connection type for webhook client (service or url) | `service` |
+| `useCertManager` | Use cert-manager for webhook certificate management | `false` |
 | `enableV1Beta1APIs` | Watch for v1beta1 APIs | `true` |
 | `hubAPIQPS` | QPS for fleet-apiserver (not including events/node heartbeat) | `250` |
 | `hubAPIBurst` | Burst for fleet-apiserver (not including events/node heartbeat) | `1000` |
@@ -41,4 +75,45 @@ _See [helm install](https://helm.sh/docs/helm/helm_install/) for command documen
 | `MaxFleetSizeSupported` | Max number of member clusters supported | `100` |
 | `resourceSnapshotCreationMinimumInterval` | The minimum interval at which resource snapshots could be created. | `30s` |
 | `resourceChangesCollectionDuration` | The duration for collecting resource changes into one snapshot. | `15s` |
-| `enableWorkload` | Enable kubernetes builtin workload to run in hub cluster. | `false` |
\ No newline at end of file
+| `enableWorkload` | Enable kubernetes builtin workload to run in hub cluster. | `false` |
+
+## Certificate Management
+
+The hub-agent supports two modes for webhook certificate management:
+
+### Automatic Certificate Generation (Default)
+
+By default, the hub-agent generates certificates automatically at startup. This mode:
+- Requires no external dependencies
+- Works out of the box
+- Certificates are valid for 10 years
+
+### cert-manager (Optional)
+
+When `useCertManager=true`, certificates are managed by cert-manager. This mode:
+- Requires cert-manager to be installed as a prerequisite
+- Handles certificate rotation automatically (90-day certificates)
+- Follows industry-standard certificate management practices
+- Suitable for production environments
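+
+To inspect the issued serving certificate (a quick check using standard kubectl and openssl tooling; the Secret name is this chart's default):
+
+```console
+kubectl get secret fleet-webhook-server-cert -n fleet-system -o jsonpath='{.data.tls\.crt}' \
+  | base64 -d | openssl x509 -noout -subject -dates
+```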
+
+To switch to cert-manager mode:
+```console
+# Install cert-manager first
+helm repo add jetstack https://charts.jetstack.io
+helm repo update
+helm install cert-manager jetstack/cert-manager \
+  --namespace cert-manager \
+  --create-namespace \
+  --version v1.16.2 \
+  --set crds.enabled=true
+
+# Then install hub-agent with cert-manager enabled
+helm install hub-agent ./charts/hub-agent --set useCertManager=true
+```
\ No newline at end of file
diff --git a/charts/hub-agent/templates/certificate.yaml b/charts/hub-agent/templates/certificate.yaml
new file mode 100644
index 000000000..ff5b3135e
--- /dev/null
+++ b/charts/hub-agent/templates/certificate.yaml
@@ -0,0 +1,62 @@
+{{- if and .Values.enableWebhook .Values.useCertManager }}
+---
+apiVersion: cert-manager.io/v1
+kind: Certificate
+metadata:
+  name: fleet-webhook-server-cert
+  namespace: {{ .Values.namespace }}
+  labels:
+    {{- include "hub-agent.labels" . | nindent 4 }}
+spec:
+  # Secret name where cert-manager will store the certificate
+  secretName: fleet-webhook-server-cert
+
+  # Certificate duration (90 days is cert-manager's default and recommended)
+  duration: 2160h # 90 days
+
+  # Renew certificate 30 days before expiry
+  renewBefore: 720h # 30 days
+
+  # Subject configuration
+  subject:
+    organizations:
+      - KubeFleet
+
+  # Common name
+  commonName: fleet-webhook.{{ .Values.namespace }}.svc
+
+  # DNS names for the certificate
+  dnsNames:
+    - {{ .Values.webhookServiceName }}
+    - {{ .Values.webhookServiceName }}.{{ .Values.namespace }}
+    - {{ .Values.webhookServiceName }}.{{ .Values.namespace }}.svc
+    - {{ .Values.webhookServiceName }}.{{ .Values.namespace }}.svc.cluster.local
+
+  # Issuer reference - using self-signed issuer
+  issuerRef:
+    name: fleet-selfsigned-issuer
+    kind: Issuer
+    group: cert-manager.io
+
+  # Private key configuration
+  privateKey:
+    algorithm: ECDSA
+    size: 256
+
+  # Key usages
+  usages:
+    - digital signature
+    - key encipherment
+    - server auth
+---
+# Self-signed issuer for generating the certificate
+apiVersion: cert-manager.io/v1
+kind: Issuer
+metadata:
+  name: fleet-selfsigned-issuer
+  namespace: {{ .Values.namespace }}
+  labels:
+    {{- include "hub-agent.labels" . | nindent 4 }}
+spec:
+  selfSigned: {}
+{{- end }}
diff --git a/charts/hub-agent/templates/deployment.yaml b/charts/hub-agent/templates/deployment.yaml
index 7ed6ab7ba..b960857f2 100644
--- a/charts/hub-agent/templates/deployment.yaml
+++ b/charts/hub-agent/templates/deployment.yaml
@@ -6,6 +6,7 @@ metadata:
   labels:
     {{- include "hub-agent.labels" . | nindent 4 }}
 spec:
+  replicas: {{ .Values.replicaCount }}
   selector:
     matchLabels:
      {{- include "hub-agent.selectorLabels" . | nindent 6 }}
@@ -25,6 +26,7 @@ spec:
         - --webhook-service-name={{ .Values.webhookServiceName }}
        - --enable-guard-rail={{ .Values.enableGuardRail }}
        - --enable-workload={{ .Values.enableWorkload }}
+        - --use-cert-manager={{ .Values.useCertManager }}
        - --whitelisted-users=system:serviceaccount:fleet-system:hub-agent-sa
        - --webhook-client-connection-type={{.Values.webhookClientConnectionType}}
        - --v={{ .Values.logVerbosity }}
@@ -73,6 +75,19 @@ spec:
               fieldPath: metadata.namespace
         resources:
           {{- toYaml .Values.resources | nindent 12 }}
+        {{- if .Values.useCertManager }}
+        volumeMounts:
+        - name: webhook-cert
+          mountPath: /tmp/k8s-webhook-server/serving-certs
+          readOnly: true
+        {{- end }}
+      {{- if .Values.useCertManager }}
+      volumes:
+      - name: webhook-cert
+        secret:
+          secretName: fleet-webhook-server-cert
+          defaultMode: 0644
+      {{- end }}
       {{- with .Values.affinity }}
       affinity:
         {{- toYaml . | nindent 8 }}
diff --git a/charts/hub-agent/values.yaml b/charts/hub-agent/values.yaml
index a423c6270..1c427df1f 100644
--- a/charts/hub-agent/values.yaml
+++ b/charts/hub-agent/values.yaml
@@ -17,13 +17,17 @@ webhookServiceName: fleetwebhook
 enableGuardRail: true
 webhookClientConnectionType: service
 enableWorkload: false
+# useCertManager enables cert-manager for webhook certificate management.
+# cert-manager must already be installed in the cluster as a prerequisite;
+# when enabled, this chart creates a Certificate resource for the webhook.
+useCertManager: false
+
 forceDeleteWaitTime: 15m0s
 clusterUnhealthyThreshold: 3m0s
 resourceSnapshotCreationMinimumInterval: 30s
 resourceChangesCollectionDuration: 15s
 
-namespace:
-  fleet-system
+namespace: fleet-system
 
 resources:
   limits:
diff --git a/cmd/hubagent/main.go b/cmd/hubagent/main.go
index f896ab71a..1b8a8237e 100644
--- a/cmd/hubagent/main.go
+++ b/cmd/hubagent/main.go
@@ -46,6 +46,7 @@ import (
 	"github.com/kubefleet-dev/kubefleet/cmd/hubagent/options"
 	"github.com/kubefleet-dev/kubefleet/cmd/hubagent/workload"
 	mcv1beta1 "github.com/kubefleet-dev/kubefleet/pkg/controllers/membercluster/v1beta1"
+	"github.com/kubefleet-dev/kubefleet/pkg/utils/validator"
 	"github.com/kubefleet-dev/kubefleet/pkg/webhook"
 	// +kubebuilder:scaffold:imports
 )
@@ -156,7 +157,7 @@ func main() {
 	if opts.EnableWebhook {
 		whiteListedUsers := strings.Split(opts.WhiteListedUsers, ",")
-		if err := SetupWebhook(mgr, options.WebhookClientConnectionType(opts.WebhookClientConnectionType), opts.WebhookServiceName, whiteListedUsers, opts.EnableGuardRail, opts.EnableV1Beta1APIs, opts.DenyModifyMemberClusterLabels, opts.EnableWorkload); err != nil {
+		if err := SetupWebhook(mgr, options.WebhookClientConnectionType(opts.WebhookClientConnectionType), opts.WebhookServiceName, whiteListedUsers, opts.EnableGuardRail, opts.EnableV1Beta1APIs, opts.DenyModifyMemberClusterLabels, opts.EnableWorkload, opts.UseCertManager); err != nil {
 			klog.ErrorS(err, "unable to set up webhook")
 			exitWithErrorFunc()
 		}
@@ -164,10 +165,21 @@ func main() {
 
 	ctx := ctrl.SetupSignalHandler()
 	if err := workload.SetupControllers(ctx, &wg, mgr, config, opts); err != nil {
-		klog.ErrorS(err, "unable to set up ready check")
+		klog.ErrorS(err, "unable to set up controllers")
 		exitWithErrorFunc()
 	}
 
+	// Add the webhook readiness check AFTER the controllers are set up (i.e., once the ResourceInformer has been initialized).
+	// This prevents the webhook from accepting requests before the discovery cache is populated.
+	if opts.EnableWebhook {
+		// AddReadyzCheck registers an additional readiness check rather than replacing the one registered earlier, provided the name is different.
+		// Both registered checks need to pass for the manager to be considered ready.
+		if err := mgr.AddReadyzCheck("webhook-cache", webhook.ResourceInformerReadinessChecker(validator.ResourceInformer)); err != nil {
+			klog.ErrorS(err, "unable to set up webhook readiness check")
+			exitWithErrorFunc()
+		}
+	}
+
 	// +kubebuilder:scaffold:builder
 
 	wg.Add(1)
@@ -188,9 +200,9 @@
 }
 
 // SetupWebhook generates the webhook cert and then set up the webhook configurator.
-func SetupWebhook(mgr manager.Manager, webhookClientConnectionType options.WebhookClientConnectionType, webhookServiceName string, whiteListedUsers []string, enableGuardRail, isFleetV1Beta1API bool, denyModifyMemberClusterLabels bool, enableWorkload bool) error {
+func SetupWebhook(mgr manager.Manager, webhookClientConnectionType options.WebhookClientConnectionType, webhookServiceName string, whiteListedUsers []string, enableGuardRail, isFleetV1Beta1API bool, denyModifyMemberClusterLabels bool, enableWorkload bool, useCertManager bool) error {
 	// Generate self-signed key and crt files in FleetWebhookCertDir for the webhook server to start.
-	w, err := webhook.NewWebhookConfig(mgr, webhookServiceName, FleetWebhookPort, &webhookClientConnectionType, FleetWebhookCertDir, enableGuardRail, denyModifyMemberClusterLabels, enableWorkload)
+	w, err := webhook.NewWebhookConfig(mgr, webhookServiceName, FleetWebhookPort, &webhookClientConnectionType, FleetWebhookCertDir, enableGuardRail, denyModifyMemberClusterLabels, enableWorkload, useCertManager)
 	if err != nil {
 		klog.ErrorS(err, "fail to generate WebhookConfig")
 		return err
diff --git a/cmd/hubagent/options/options.go b/cmd/hubagent/options/options.go
index 6573789c1..7568ae7cc 100644
--- a/cmd/hubagent/options/options.go
+++ b/cmd/hubagent/options/options.go
@@ -110,6 +110,9 @@ type Options struct {
 	// EnableWorkload enables workload resources (pods and replicasets) to be created in the hub cluster.
 	// When set to true, the pod and replicaset validating webhooks are disabled.
 	EnableWorkload bool
+	// UseCertManager indicates whether to use cert-manager for webhook certificate management.
+	// When enabled, webhook certificates are managed by cert-manager instead of self-signed generation.
+	UseCertManager bool
 	// ResourceSnapshotCreationMinimumInterval is the minimum interval at which resource snapshots could be created.
 	// Whether the resource snapshot is created or not depends on the both ResourceSnapshotCreationMinimumInterval and ResourceChangesCollectionDuration.
 	ResourceSnapshotCreationMinimumInterval time.Duration
@@ -185,6 +188,7 @@ func (o *Options) AddFlags(flags *flag.FlagSet) {
 	flags.IntVar(&o.PprofPort, "pprof-port", 6065, "The port for pprof profiling.")
 	flags.BoolVar(&o.DenyModifyMemberClusterLabels, "deny-modify-member-cluster-labels", false, "If set, users not in the system:masters cannot modify member cluster labels.")
 	flags.BoolVar(&o.EnableWorkload, "enable-workload", false, "If set, workloads (pods and replicasets) can be created in the hub cluster. This disables the pod and replicaset validating webhooks.")
+	flags.BoolVar(&o.UseCertManager, "use-cert-manager", false, "If set, cert-manager will be used for webhook certificate management instead of self-signed certificates.")
 	flags.DurationVar(&o.ResourceSnapshotCreationMinimumInterval, "resource-snapshot-creation-minimum-interval", 30*time.Second, "The minimum interval at which resource snapshots could be created.")
 	flags.DurationVar(&o.ResourceChangesCollectionDuration, "resource-changes-collection-duration", 15*time.Second, "The duration for collecting resource changes into one snapshot. The default is 15 seconds, which means that the controller will collect resource changes for 15 seconds before creating a resource snapshot.")
 
diff --git a/pkg/utils/informer/informermanager.go b/pkg/utils/informer/informermanager.go
index 4ea1a5143..53da3343a 100644
--- a/pkg/utils/informer/informermanager.go
+++ b/pkg/utils/informer/informermanager.go
@@ -61,6 +61,9 @@ type Manager interface {
 	// GetNameSpaceScopedResources returns the list of namespace scoped resources we are watching.
 	GetNameSpaceScopedResources() []schema.GroupVersionResource
 
+	// GetAllResources returns the list of all resources (both cluster-scoped and namespace-scoped) we are watching.
+	GetAllResources() []schema.GroupVersionResource
+
 	// IsClusterScopedResources returns if a resource is cluster scoped.
 	IsClusterScopedResources(resource schema.GroupVersionKind) bool
 
@@ -224,6 +227,19 @@ func (s *informerManagerImpl) GetNameSpaceScopedResources() []schema.GroupVersio
 	return res
 }
 
+func (s *informerManagerImpl) GetAllResources() []schema.GroupVersionResource {
+	s.resourcesLock.RLock()
+	defer s.resourcesLock.RUnlock()
+
+	res := make([]schema.GroupVersionResource, 0, len(s.apiResources))
+	for _, resource := range s.apiResources {
+		if resource.isPresent {
+			res = append(res, resource.GroupVersionResource)
+		}
+	}
+	return res
+}
+
 func (s *informerManagerImpl) IsClusterScopedResources(gvk schema.GroupVersionKind) bool {
 	s.resourcesLock.RLock()
 	defer s.resourcesLock.RUnlock()
diff --git a/pkg/utils/informer/informermanager_test.go b/pkg/utils/informer/informermanager_test.go
new file mode 100644
index 000000000..5d0984c87
--- /dev/null
+++ b/pkg/utils/informer/informermanager_test.go
@@ -0,0 +1,270 @@
+/*
+Copyright 2025 The KubeFleet Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/ + +package informer + +import ( + "testing" + + "github.com/stretchr/testify/assert" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/client-go/dynamic/fake" + "k8s.io/client-go/kubernetes/scheme" +) + +func TestGetAllResources(t *testing.T) { + tests := []struct { + name string + namespaceScopedResources []APIResourceMeta + clusterScopedResources []APIResourceMeta + staticResources []APIResourceMeta + expectedResourceCount int + expectedNamespacedCount int + }{ + { + name: "mixed cluster and namespace scoped resources", + namespaceScopedResources: []APIResourceMeta{ + { + GroupVersionKind: schema.GroupVersionKind{ + Group: "", + Version: "v1", + Kind: "ConfigMap", + }, + GroupVersionResource: schema.GroupVersionResource{ + Group: "", + Version: "v1", + Resource: "configmaps", + }, + IsClusterScoped: false, + }, + { + GroupVersionKind: schema.GroupVersionKind{ + Group: "", + Version: "v1", + Kind: "Secret", + }, + GroupVersionResource: schema.GroupVersionResource{ + Group: "", + Version: "v1", + Resource: "secrets", + }, + IsClusterScoped: false, + }, + }, + clusterScopedResources: []APIResourceMeta{ + { + GroupVersionKind: schema.GroupVersionKind{ + Group: "", + Version: "v1", + Kind: "Namespace", + }, + GroupVersionResource: schema.GroupVersionResource{ + Group: "", + Version: "v1", + Resource: "namespaces", + }, + IsClusterScoped: true, + }, + }, + staticResources: []APIResourceMeta{ + { + GroupVersionKind: schema.GroupVersionKind{ + Group: "", + Version: "v1", + Kind: "Node", + }, + GroupVersionResource: schema.GroupVersionResource{ + Group: "", + Version: "v1", + Resource: "nodes", + }, + IsClusterScoped: true, + isStaticResource: true, + }, + }, + expectedResourceCount: 4, // All resources including static + expectedNamespacedCount: 2, // Only namespace-scoped, excluding static + }, + { + name: "no resources", + expectedResourceCount: 0, + expectedNamespacedCount: 0, + }, + { + name: "only namespace scoped resources", + namespaceScopedResources: []APIResourceMeta{ + { + GroupVersionKind: schema.GroupVersionKind{ + Group: "apps", + Version: "v1", + Kind: "Deployment", + }, + GroupVersionResource: schema.GroupVersionResource{ + Group: "apps", + Version: "v1", + Resource: "deployments", + }, + IsClusterScoped: false, + }, + }, + expectedResourceCount: 1, + expectedNamespacedCount: 1, + }, + { + name: "only cluster scoped resources", + clusterScopedResources: []APIResourceMeta{ + { + GroupVersionKind: schema.GroupVersionKind{ + Group: "rbac.authorization.k8s.io", + Version: "v1", + Kind: "ClusterRole", + }, + GroupVersionResource: schema.GroupVersionResource{ + Group: "rbac.authorization.k8s.io", + Version: "v1", + Resource: "clusterroles", + }, + IsClusterScoped: true, + }, + }, + expectedResourceCount: 1, + expectedNamespacedCount: 0, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + // Create a fake dynamic client + fakeClient := fake.NewSimpleDynamicClient(scheme.Scheme) + stopCh := make(chan struct{}) + defer close(stopCh) + + mgr := NewInformerManager(fakeClient, 0, stopCh) + implMgr := mgr.(*informerManagerImpl) + + // Add namespace-scoped resources + for _, res := range tt.namespaceScopedResources { + res.isPresent = true + implMgr.apiResources[res.GroupVersionKind] = &res + } + + // Add cluster-scoped resources + for _, res := range tt.clusterScopedResources { + res.isPresent = true + implMgr.apiResources[res.GroupVersionKind] = &res + } + + // Add static resources + for _, res := range tt.staticResources { + res.isPresent = true + 
+				implMgr.apiResources[res.GroupVersionKind] = &res
+			}
+
+			// Test GetAllResources
+			allResources := mgr.GetAllResources()
+			assert.Equal(t, tt.expectedResourceCount, len(allResources), "GetAllResources should return correct count")
+
+			// Verify all expected resources are present
+			resourceMap := make(map[schema.GroupVersionResource]bool)
+			for _, gvr := range allResources {
+				resourceMap[gvr] = true
+			}
+
+			for _, res := range tt.namespaceScopedResources {
+				assert.True(t, resourceMap[res.GroupVersionResource], "namespace-scoped resource %v should be in GetAllResources", res.GroupVersionResource)
+			}
+
+			for _, res := range tt.clusterScopedResources {
+				assert.True(t, resourceMap[res.GroupVersionResource], "cluster-scoped resource %v should be in GetAllResources", res.GroupVersionResource)
+			}
+
+			for _, res := range tt.staticResources {
+				assert.True(t, resourceMap[res.GroupVersionResource], "static resource %v should be in GetAllResources", res.GroupVersionResource)
+			}
+
+			// Test GetNameSpaceScopedResources
+			namespacedResources := mgr.GetNameSpaceScopedResources()
+			assert.Equal(t, tt.expectedNamespacedCount, len(namespacedResources), "GetNameSpaceScopedResources should return correct count")
+
+			// Verify only namespace-scoped, non-static resources are present
+			namespacedMap := make(map[schema.GroupVersionResource]bool)
+			for _, gvr := range namespacedResources {
+				namespacedMap[gvr] = true
+			}
+
+			for _, res := range tt.namespaceScopedResources {
+				assert.True(t, namespacedMap[res.GroupVersionResource], "namespace-scoped resource %v should be in GetNameSpaceScopedResources", res.GroupVersionResource)
+			}
+
+			// Verify cluster-scoped and static resources are NOT in namespace-scoped list
+			for _, res := range tt.clusterScopedResources {
+				assert.False(t, namespacedMap[res.GroupVersionResource], "cluster-scoped resource %v should NOT be in GetNameSpaceScopedResources", res.GroupVersionResource)
+			}
+
+			for _, res := range tt.staticResources {
+				assert.False(t, namespacedMap[res.GroupVersionResource], "static resource %v should NOT be in GetNameSpaceScopedResources", res.GroupVersionResource)
+			}
+		})
+	}
+}
+
+func TestGetAllResources_NotPresent(t *testing.T) {
+	// Test that resources marked as not present are excluded
+	fakeClient := fake.NewSimpleDynamicClient(scheme.Scheme)
+	stopCh := make(chan struct{})
+	defer close(stopCh)
+
+	mgr := NewInformerManager(fakeClient, 0, stopCh)
+	implMgr := mgr.(*informerManagerImpl)
+
+	// Add a resource that is present
+	presentRes := APIResourceMeta{
+		GroupVersionKind: schema.GroupVersionKind{
+			Group:   "",
+			Version: "v1",
+			Kind:    "ConfigMap",
+		},
+		GroupVersionResource: schema.GroupVersionResource{
+			Group:    "",
+			Version:  "v1",
+			Resource: "configmaps",
+		},
+		IsClusterScoped: false,
+		isPresent:       true,
+	}
+	implMgr.apiResources[presentRes.GroupVersionKind] = &presentRes
+
+	// Add a resource that is NOT present (deleted)
+	notPresentRes := APIResourceMeta{
+		GroupVersionKind: schema.GroupVersionKind{
+			Group:   "",
+			Version: "v1",
+			Kind:    "Secret",
+		},
+		GroupVersionResource: schema.GroupVersionResource{
+			Group:    "",
+			Version:  "v1",
+			Resource: "secrets",
+		},
+		IsClusterScoped: false,
+		isPresent:       false,
+	}
+	implMgr.apiResources[notPresentRes.GroupVersionKind] = &notPresentRes
+
+	allResources := mgr.GetAllResources()
+	assert.Equal(t, 1, len(allResources), "should only return present resources")
+	assert.Equal(t, presentRes.GroupVersionResource, allResources[0], "should return the present resource")
+}
diff --git a/pkg/webhook/readiness.go b/pkg/webhook/readiness.go
new file mode 100644
index 000000000..889841828
--- /dev/null
+++ b/pkg/webhook/readiness.go
@@ -0,0 +1,61 @@
+/*
+Copyright 2025 The KubeFleet Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package webhook
+
+import (
+	"fmt"
+	"net/http"
+
+	"k8s.io/apimachinery/pkg/runtime/schema"
+	"k8s.io/klog/v2"
+
+	"github.com/kubefleet-dev/kubefleet/pkg/utils/informer"
+)
+
+// ResourceInformerReadinessChecker creates a readiness check function that verifies
+// all resource informer caches are synced before marking the pod as ready.
+// This prevents the webhook from accepting requests before the discovery cache is populated.
+func ResourceInformerReadinessChecker(resourceInformer informer.Manager) func(*http.Request) error {
+	return func(_ *http.Request) error {
+		if resourceInformer == nil {
+			return fmt.Errorf("resource informer not initialized")
+		}
+
+		// Require ALL informer caches to be synced before marking ready
+		allResources := resourceInformer.GetAllResources()
+		if len(allResources) == 0 {
+			// This can happen during startup when the ResourceInformer is created but the ChangeDetector
+			// hasn't discovered and registered any resources yet via AddDynamicResources().
+			return fmt.Errorf("resource informer not ready: no resources registered")
+		}
+
+		// Check that ALL informers have synced
+		unsyncedResources := []schema.GroupVersionResource{}
+		for _, gvr := range allResources {
+			if !resourceInformer.IsInformerSynced(gvr) {
+				unsyncedResources = append(unsyncedResources, gvr)
+			}
+		}
+
+		if len(unsyncedResources) > 0 {
+			return fmt.Errorf("resource informer not ready: %d/%d informers not synced yet", len(unsyncedResources), len(allResources))
+		}
+
+		klog.V(5).InfoS("All resource informers synced", "totalInformers", len(allResources))
+		return nil
+	}
+}
diff --git a/pkg/webhook/readiness_test.go b/pkg/webhook/readiness_test.go
new file mode 100644
index 000000000..d93852f05
--- /dev/null
+++ b/pkg/webhook/readiness_test.go
@@ -0,0 +1,140 @@
+/*
+Copyright 2025 The KubeFleet Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/ + +package webhook + +import ( + "testing" + + "github.com/stretchr/testify/assert" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/utils/ptr" + + "github.com/kubefleet-dev/kubefleet/pkg/utils/informer" + testinformer "github.com/kubefleet-dev/kubefleet/test/utils/informer" +) + +func TestResourceInformerReadinessChecker(t *testing.T) { + tests := []struct { + name string + resourceInformer informer.Manager + expectError bool + errorContains string + }{ + { + name: "nil informer", + resourceInformer: nil, + expectError: true, + errorContains: "resource informer not initialized", + }, + { + name: "no resources registered", + resourceInformer: &testinformer.FakeManager{ + APIResources: map[schema.GroupVersionKind]bool{}, + }, + expectError: true, + errorContains: "no resources registered", + }, + { + name: "all informers synced", + resourceInformer: &testinformer.FakeManager{ + APIResources: map[schema.GroupVersionKind]bool{ + {Group: "", Version: "v1", Kind: "ConfigMap"}: false, // namespace-scoped + {Group: "", Version: "v1", Kind: "Secret"}: false, // namespace-scoped + {Group: "", Version: "v1", Kind: "Namespace"}: true, // cluster-scoped + }, + IsClusterScopedResource: true, // true = map stores cluster-scoped resources + InformerSynced: ptr.To(true), + }, + expectError: false, + }, + { + name: "some informers not synced", + resourceInformer: &testinformer.FakeManager{ + APIResources: map[schema.GroupVersionKind]bool{ + {Group: "", Version: "v1", Kind: "ConfigMap"}: false, // namespace-scoped + {Group: "", Version: "v1", Kind: "Secret"}: false, // namespace-scoped + {Group: "", Version: "v1", Kind: "Namespace"}: true, // cluster-scoped + }, + IsClusterScopedResource: true, + InformerSynced: ptr.To(false), + }, + expectError: true, + errorContains: "informers not synced yet", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + checker := ResourceInformerReadinessChecker(tt.resourceInformer) + err := checker(nil) + + if tt.expectError { + assert.Error(t, err) + if tt.errorContains != "" { + assert.Contains(t, err.Error(), tt.errorContains) + } + } else { + assert.NoError(t, err) + } + }) + } +} + +func TestResourceInformerReadinessChecker_PartialSync(t *testing.T) { + // Test the case where we have multiple resources but only some are synced + fakeManager := &testinformer.FakeManager{ + APIResources: map[schema.GroupVersionKind]bool{ + {Group: "", Version: "v1", Kind: "ConfigMap"}: false, // namespace-scoped + {Group: "", Version: "v1", Kind: "Secret"}: false, // namespace-scoped + {Group: "apps", Version: "v1", Kind: "Deployment"}: false, // namespace-scoped + {Group: "", Version: "v1", Kind: "Namespace"}: true, // cluster-scoped + }, + IsClusterScopedResource: true, + InformerSynced: ptr.To(false), + // This will make IsInformerSynced return false for all resources + } + + checker := ResourceInformerReadinessChecker(fakeManager) + err := checker(nil) + + assert.Error(t, err) + assert.Contains(t, err.Error(), "informers not synced yet") + // Should report 4 unsynced (3 namespace-scoped + 1 cluster-scoped) + assert.Contains(t, err.Error(), "4/4") +} + +func TestResourceInformerReadinessChecker_AllSyncedMultipleResources(t *testing.T) { + // Test with many resources all synced + fakeManager := &testinformer.FakeManager{ + APIResources: map[schema.GroupVersionKind]bool{ + {Group: "", Version: "v1", Kind: "ConfigMap"}: false, // namespace-scoped + {Group: "", Version: "v1", Kind: "Secret"}: false, // namespace-scoped + {Group: "", Version: "v1", Kind: 
"Service"}: false, // namespace-scoped + {Group: "apps", Version: "v1", Kind: "Deployment"}: false, // namespace-scoped + {Group: "apps", Version: "v1", Kind: "StatefulSet"}: false, // namespace-scoped + {Group: "", Version: "v1", Kind: "Namespace"}: true, // cluster-scoped + {Group: "rbac.authorization.k8s.io", Version: "v1", Kind: "ClusterRole"}: true, // cluster-scoped + }, + IsClusterScopedResource: true, + InformerSynced: ptr.To(true), + } + + checker := ResourceInformerReadinessChecker(fakeManager) + err := checker(nil) + + assert.NoError(t, err, "Should be ready when all informers are synced") +} diff --git a/pkg/webhook/webhook.go b/pkg/webhook/webhook.go index 781833e3e..9429d4803 100644 --- a/pkg/webhook/webhook.go +++ b/pkg/webhook/webhook.go @@ -70,6 +70,7 @@ import ( const ( fleetWebhookCertFileName = "tls.crt" fleetWebhookKeyFileName = "tls.key" + fleetWebhookCertSecretName = "fleet-webhook-server-cert" //nolint:gosec // This is a Secret name, not a credential fleetValidatingWebhookCfgName = "fleet-validating-webhook-configuration" fleetGuardRailWebhookCfgName = "fleet-guard-rail-webhook-configuration" fleetMutatingWebhookCfgName = "fleet-mutating-webhook-configuration" @@ -160,9 +161,11 @@ type Config struct { denyModifyMemberClusterLabels bool enableWorkload bool + // useCertManager indicates whether cert-manager is used for certificate management + useCertManager bool } -func NewWebhookConfig(mgr manager.Manager, webhookServiceName string, port int32, clientConnectionType *options.WebhookClientConnectionType, certDir string, enableGuardRail bool, denyModifyMemberClusterLabels bool, enableWorkload bool) (*Config, error) { +func NewWebhookConfig(mgr manager.Manager, webhookServiceName string, port int32, clientConnectionType *options.WebhookClientConnectionType, certDir string, enableGuardRail bool, denyModifyMemberClusterLabels bool, enableWorkload bool, useCertManager bool) (*Config, error) { // We assume the Pod namespace should be passed to env through downward API in the Pod spec. namespace := os.Getenv("POD_NAMESPACE") if namespace == "" { @@ -178,13 +181,29 @@ func NewWebhookConfig(mgr manager.Manager, webhookServiceName string, port int32 enableGuardRail: enableGuardRail, denyModifyMemberClusterLabels: denyModifyMemberClusterLabels, enableWorkload: enableWorkload, + useCertManager: useCertManager, } - caPEM, err := w.genCertificate(certDir) - if err != nil { - return nil, err + + var caPEM []byte + var err error + + if useCertManager { + // When using cert-manager, certificates are mounted as files by Kubernetes + // cert-manager creates tls.crt and tls.key, but we need ca.crt for the webhook config + caPEM, err = w.loadCertManagerCA(certDir) + if err != nil { + return nil, fmt.Errorf("failed to load cert-manager CA certificate: %w", err) + } + } else { + // Use self-signed certificate generation (original flow) + caPEM, err = w.genCertificate(certDir) + if err != nil { + return nil, err + } } + w.caPEM = caPEM - return &w, err + return &w, nil } func (w *Config) Start(ctx context.Context) error { @@ -214,12 +233,19 @@ func (w *Config) createFleetWebhookConfiguration(ctx context.Context) error { // createMutatingWebhookConfiguration creates the MutatingWebhookConfiguration object for the webhook. 
 func (w *Config) createMutatingWebhookConfiguration(ctx context.Context, webhooks []admv1.MutatingWebhook, configName string) error {
+	annotations := map[string]string{}
+	if w.useCertManager {
+		// Tell cert-manager's CA injector to inject the CA bundle; inject-ca-from expects <namespace>/<Certificate name>, and the Certificate here shares its name with its Secret.
+		annotations["cert-manager.io/inject-ca-from"] = fmt.Sprintf("%s/%s", w.serviceNamespace, fleetWebhookCertSecretName)
+	}
+
 	mutatingWebhookConfig := admv1.MutatingWebhookConfiguration{
 		ObjectMeta: metav1.ObjectMeta{
 			Name: configName,
 			Labels: map[string]string{
 				"admissions.enforcer/disabled": "true",
 			},
+			Annotations: annotations,
 		},
 		Webhooks: webhooks,
 	}
@@ -267,12 +293,19 @@ func (w *Config) buildFleetMutatingWebhooks() []admv1.MutatingWebhook {
 }
 
 func (w *Config) createValidatingWebhookConfiguration(ctx context.Context, webhooks []admv1.ValidatingWebhook, configName string) error {
+	annotations := map[string]string{}
+	if w.useCertManager {
+		// Tell cert-manager's CA injector to inject the CA bundle; inject-ca-from expects <namespace>/<Certificate name>, and the Certificate here shares its name with its Secret.
+		annotations["cert-manager.io/inject-ca-from"] = fmt.Sprintf("%s/%s", w.serviceNamespace, fleetWebhookCertSecretName)
+	}
+
 	validatingWebhookConfig := admv1.ValidatingWebhookConfiguration{
 		ObjectMeta: metav1.ObjectMeta{
 			Name: configName,
 			Labels: map[string]string{
 				"admissions.enforcer/disabled": "true",
 			},
+			Annotations: annotations,
 		},
 		Webhooks: webhooks,
 	}
@@ -657,6 +690,26 @@ func (w *Config) genCertificate(certDir string) ([]byte, error) {
 	return caPEM, nil
 }
 
+// loadCertManagerCA loads the CA certificate from the mounted cert-manager Secret.
+// When using cert-manager, Kubernetes mounts the Secret as files in the certDir.
+// cert-manager creates: ca.crt, tls.crt, and tls.key.
+// The tls.crt and tls.key are automatically used by the webhook server.
+// We only need to read ca.crt for the webhook configuration's CABundle.
+func (w *Config) loadCertManagerCA(certDir string) ([]byte, error) {
+	caPath := filepath.Join(certDir, "ca.crt")
+	caCert, err := os.ReadFile(caPath)
+	if err != nil {
+		return nil, fmt.Errorf("failed to read ca.crt from %s: %w", caPath, err)
+	}
+
+	if len(caCert) == 0 {
+		return nil, fmt.Errorf("ca.crt is empty at %s", caPath)
+	}
+
+	klog.V(2).InfoS("Successfully loaded CA certificate from cert-manager mounted Secret", "path", caPath)
+	return caCert, nil
+}
+
 // genSelfSignedCert generates the self signed Certificate/Key pair
 func (w *Config) genSelfSignedCert() (caPEMByte, certPEMByte, keyPEMByte []byte, err error) {
 	// CA config
diff --git a/pkg/webhook/webhook_test.go b/pkg/webhook/webhook_test.go
index f1975b30b..1a5619068 100644
--- a/pkg/webhook/webhook_test.go
+++ b/pkg/webhook/webhook_test.go
@@ -1,6 +1,8 @@
 package webhook
 
 import (
+	"os"
+	"path/filepath"
 	"testing"
 
 	"github.com/google/go-cmp/cmp"
@@ -12,6 +14,27 @@ import (
 	"github.com/kubefleet-dev/kubefleet/pkg/utils"
 )
 
+const mockCA = `-----BEGIN CERTIFICATE-----
+MIIBkTCB+wIJAKHHCgVZU7c4MA0GCSqGSIb3DQEBCwUAMBExDzANBgNVBAMMBnRl
+c3RDQTAeFw0yNDAxMDEwMDAwMDBaFw0yNTAxMDEwMDAwMDBaMBExDzANBgNVBAMM
+BnRlc3RDQTCBnzANBgkqhkiG9w0BAQEFAAOBjQAwgYkCgYEAwjBCKwYJKoZIhvcN
+AQkBFhZmb3JtYXRAZXhhbXBsZS5jb20wHhcNMjQwMTAxMDAwMDAwWhcNMjUwMTAx
+MDAwMDAwWjARMQ8wDQYDVQQDDAZ0ZXN0Q0EwXDANBgkqhkiG9w0BAQEFAANLADBI
+AkEAwjBCKwYJKoZIhvcNAQEBBQADQQAwPgJBAMIwQisGCSqGSIb3DQEBAQUAA0EA
+MD4CQQDCMEIrBgkqhkiG9w0BAQEFAANBADANBgkqhkiG9w0BAQUFAANBAMA=
+-----END CERTIFICATE-----`
+
+// setupMockCertManagerFiles creates mock certificate files for testing cert-manager mode.
+func setupMockCertManagerFiles(t *testing.T, certDir string) {
+	t.Helper()
+	if err := os.WriteFile(filepath.Join(certDir, "ca.crt"), []byte(mockCA), 0600); err != nil {
+		t.Fatalf("failed to create mock ca.crt: %v", err)
+	}
+	t.Cleanup(func() {
+		os.RemoveAll(certDir)
+	})
+}
+
 func TestBuildFleetMutatingWebhooks(t *testing.T) {
 	url := options.WebhookClientConnectionType("url")
 	testCases := map[string]struct {
@@ -110,6 +133,7 @@ func TestNewWebhookConfig(t *testing.T) {
 		enableGuardRail               bool
 		denyModifyMemberClusterLabels bool
 		enableWorkload                bool
+		useCertManager                bool
 		want                          *Config
 		wantErr                       bool
 	}{
@@ -119,10 +143,34 @@ func TestNewWebhookConfig(t *testing.T) {
 			webhookServiceName:   "test-webhook",
 			port:                 8080,
 			clientConnectionType: nil,
-			certDir:              "/tmp/cert",
+			certDir:              t.TempDir(),
+			enableGuardRail:               true,
+			denyModifyMemberClusterLabels: true,
+			enableWorkload:                false,
+			useCertManager:                false,
+			want: &Config{
+				serviceNamespace:              "test-namespace",
+				serviceName:                   "test-webhook",
+				servicePort:                   8080,
+				clientConnectionType:          nil,
+				enableGuardRail:               true,
+				denyModifyMemberClusterLabels: true,
+				enableWorkload:                false,
+				useCertManager:                false,
+			},
+			wantErr: false,
+		},
+		{
+			name:                 "valid input with cert-manager",
+			mgr:                  nil,
+			webhookServiceName:   "test-webhook",
+			port:                 8080,
+			clientConnectionType: nil,
+			certDir:              t.TempDir(),
 			enableGuardRail:               true,
 			denyModifyMemberClusterLabels: true,
 			enableWorkload:                false,
+			useCertManager:                true,
 			want: &Config{
 				serviceNamespace: "test-namespace",
 				serviceName:      "test-webhook",
@@ -131,6 +179,7 @@ func TestNewWebhookConfig(t *testing.T) {
 				enableGuardRail:               true,
 				denyModifyMemberClusterLabels: true,
 				enableWorkload:                false,
+				useCertManager:                true,
 			},
 			wantErr: false,
 		},
@@ -138,8 +187,13 @@ func TestNewWebhookConfig(t *testing.T) {
 	for _, tt := range tests {
 		t.Run(tt.name, func(t *testing.T) {
 			t.Setenv("POD_NAMESPACE", "test-namespace")
-			defer t.Setenv("POD_NAMESPACE", "")
-			got, err := NewWebhookConfig(tt.mgr, tt.webhookServiceName, tt.port, tt.clientConnectionType, tt.certDir, tt.enableGuardRail, tt.denyModifyMemberClusterLabels, tt.enableWorkload)
+
+			// Create mock certificate files for cert-manager mode
+			if tt.useCertManager {
+				setupMockCertManagerFiles(t, tt.certDir)
+			}
+
+			got, err := NewWebhookConfig(tt.mgr, tt.webhookServiceName, tt.port, tt.clientConnectionType, tt.certDir, tt.enableGuardRail, tt.denyModifyMemberClusterLabels, tt.enableWorkload, tt.useCertManager)
 			if (err != nil) != tt.wantErr {
 				t.Errorf("NewWebhookConfig() error = %v, wantErr %v", err, tt.wantErr)
 				return
diff --git a/test/e2e/setup.sh b/test/e2e/setup.sh
index d4b6b8a2a..5edc75480 100755
--- a/test/e2e/setup.sh
+++ b/test/e2e/setup.sh
@@ -28,6 +28,7 @@ export PROPERTY_PROVIDER="${PROPERTY_PROVIDER:-azure}"
 export USE_PREDEFINED_REGIONS="${USE_PREDEFINED_REGIONS:-false}"
 export RESOURCE_SNAPSHOT_CREATION_MINIMUM_INTERVAL="${RESOURCE_SNAPSHOT_CREATION_MINIMUM_INTERVAL:-0m}"
 export RESOURCE_CHANGES_COLLECTION_DURATION="${RESOURCE_CHANGES_COLLECTION_DURATION:-0m}"
+export CERT_MANAGER_VERSION="${CERT_MANAGER_VERSION:-v1.16.2}"
 
 # The pre-defined regions; if the AKS property provider is used.
 #
@@ -114,14 +115,31 @@ done
 
 # Install the helm charts
 
-# Install the hub agent to the hub cluster
 kind export kubeconfig --name $HUB_CLUSTER
+
+# Install cert-manager first (required for webhook certificates)
+echo "Installing cert-manager..."
+
+# Install cert-manager using Helm to avoid ownership conflicts with hub-agent chart
+helm repo add jetstack https://charts.jetstack.io --force-update
+helm repo update
+helm install cert-manager jetstack/cert-manager \
+  --namespace cert-manager \
+  --create-namespace \
+  --version $CERT_MANAGER_VERSION \
+  --set crds.enabled=true \
+  --wait \
+  --timeout=300s
+
+# Install the hub agent to the hub cluster
 helm install hub-agent ../../charts/hub-agent/ \
     --set image.pullPolicy=Never \
     --set image.repository=$REGISTRY/$HUB_AGENT_IMAGE \
     --set image.tag=$TAG \
     --set namespace=fleet-system \
     --set logVerbosity=5 \
+    --set replicaCount=3 \
+    --set useCertManager=true \
     --set enableWebhook=true \
     --set enableWorkload=true \
     --set webhookClientConnectionType=service \
diff --git a/test/utils/informer/manager.go b/test/utils/informer/manager.go
index b67c8fc6a..fde757292 100644
--- a/test/utils/informer/manager.go
+++ b/test/utils/informer/manager.go
@@ -160,6 +160,21 @@ func (m *FakeManager) GetNameSpaceScopedResources() []schema.GroupVersionResourc
 	return m.NamespaceScopedResources
 }
 
+func (m *FakeManager) GetAllResources() []schema.GroupVersionResource {
+	allResources := make([]schema.GroupVersionResource, 0, len(m.APIResources))
+	for gvk := range m.APIResources {
+		// Return a GVR with the same Group/Version and Kind as Resource.
+		// The actual resource name doesn't matter since IsInformerSynced ignores the GVR parameter.
+		gvr := schema.GroupVersionResource{
+			Group:    gvk.Group,
+			Version:  gvk.Version,
+			Resource: gvk.Kind,
+		}
+		allResources = append(allResources, gvr)
+	}
+	return allResources
+}
+
 func (m *FakeManager) IsClusterScopedResources(gvk schema.GroupVersionKind) bool {
 	return m.APIResources[gvk] == m.IsClusterScopedResource
 }
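
The unit tests added by this patch (informer manager and webhook readiness) can be run locally with standard Go tooling, using the package paths shown in the diff, from the repository root:

```console
go test ./pkg/utils/informer/... ./pkg/webhook/...
```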