diff --git a/approval-request-metric-collector/Images/approval-controller-metric-collector.png b/approval-request-metric-collector/Images/approval-controller-metric-collector.png
new file mode 100644
index 0000000..65b1fc8
Binary files /dev/null and b/approval-request-metric-collector/Images/approval-controller-metric-collector.png differ
diff --git a/approval-request-metric-collector/Makefile b/approval-request-metric-collector/Makefile
new file mode 100644
index 0000000..fc7adf0
--- /dev/null
+++ b/approval-request-metric-collector/Makefile
@@ -0,0 +1,77 @@
+# Makefile for ApprovalRequest Controller
+
+# Image settings
+IMAGE_NAME ?= approval-request-controller
+IMAGE_TAG ?= latest
+REGISTRY ?=
+
+# Build settings
+GOOS ?= $(shell go env GOOS)
+GOARCH ?= $(shell go env GOARCH)
+
+# Tools
+CONTROLLER_GEN_VERSION ?= v0.16.0
+CONTROLLER_GEN = go run sigs.k8s.io/controller-tools/cmd/controller-gen@$(CONTROLLER_GEN_VERSION)
+
+.PHONY: help
+help: ## Display this help
+	@awk 'BEGIN {FS = ":.*##"; printf "\nUsage:\n  make \033[36m<target>\033[0m\n"} /^[a-zA-Z_-]+:.*?##/ { printf "  \033[36m%-15s\033[0m %s\n", $$1, $$2 } /^##@/ { printf "\n\033[1m%s\033[0m\n", substr($$0, 5) } ' $(MAKEFILE_LIST)
+
+##@ Code Generation
+
+.PHONY: manifests
+manifests: ## Generate CRD manifests
+	$(CONTROLLER_GEN) crd paths="./apis/..." output:crd:artifacts:config=config/crd/bases
+
+.PHONY: generate
+generate: ## Generate DeepCopy code
+	$(CONTROLLER_GEN) object:headerFile="hack/boilerplate.go.txt" paths="./apis/..."
+
+##@ Build
+
+.PHONY: docker-build
+docker-build: ## Build docker image
+	docker buildx build \
+		--file docker/approval-request-controller.Dockerfile \
+		--output=type=docker \
+		--platform=linux/$(GOARCH) \
+		--build-arg GOARCH=$(GOARCH) \
+		--tag $(IMAGE_NAME):$(IMAGE_TAG) \
+		.
+
+.PHONY: docker-push
+docker-push: ## Push docker image
+	docker push $(REGISTRY)$(IMAGE_NAME):$(IMAGE_TAG)
+
+##@ Development
+
+.PHONY: run
+run: ## Run controller locally
+	go run ./cmd/approvalrequestcontroller/main.go
+
+##@ Deployment
+
+.PHONY: install
+install: ## Install helm chart
+	helm install approval-request-controller ./charts/approval-request-controller \
+		--namespace fleet-system \
+		--create-namespace \
+		--set image.repository=$(IMAGE_NAME) \
+		--set image.tag=$(IMAGE_TAG)
+
+.PHONY: upgrade
+upgrade: ## Upgrade helm chart
+	helm upgrade approval-request-controller ./charts/approval-request-controller \
+		--namespace fleet-system \
+		--set image.repository=$(IMAGE_NAME) \
+		--set image.tag=$(IMAGE_TAG)
+
+.PHONY: uninstall
+uninstall: ## Uninstall helm chart
+	helm uninstall approval-request-controller --namespace fleet-system
+
+##@ Kind
+
+.PHONY: kind-load
+kind-load: docker-build ## Build and load image into kind cluster
+	kind load docker-image $(IMAGE_NAME):$(IMAGE_TAG) --name hub
diff --git a/approval-request-metric-collector/README.md b/approval-request-metric-collector/README.md
new file mode 100644
index 0000000..59224a2
--- /dev/null
+++ b/approval-request-metric-collector/README.md
@@ -0,0 +1,720 @@
+# Approval Controller and Metric Collector Tutorial
+
+This tutorial demonstrates how to use the Approval Request Controller and Metric Collector with KubeFleet for automated staged rollout approvals based on workload health metrics.
+
+## Overview
+
+This directory contains two controllers:
+- **approval-request-controller**: Runs on the hub cluster to automate approval decisions for staged updates
+- **metric-collector**: Runs on member clusters to collect and report workload health metrics
+
+![Approval Controller and Metric Collector Architecture](./Images/approval-controller-metric-collector.png)
+
+## How It Works
+
+### Custom Resource Definitions (CRDs)
+
+This solution introduces three new CRDs that work together with KubeFleet's native resources:
+
+#### Hub Cluster CRDs
+
+1. **MetricCollectorReport** (namespaced)
+   - Created by approval-request-controller in `fleet-member-<cluster-name>` namespaces on the hub
+   - Watched and updated by the metric-collector running on member clusters
+   - Contains the Prometheus URL in its spec and the collected `workload_health` metrics in its status
+   - Updated every 30 seconds by the metric collector with the latest health data
+
+2. **ClusterStagedWorkloadTracker** (cluster-scoped)
+   - Defines which workloads to monitor for a ClusterStagedUpdateRun
+   - The name must match the ClusterStagedUpdateRun name
+   - Specifies each workload's name and namespace
+   - Used by approval-request-controller to determine if a stage is ready for approval
+
+3. **StagedWorkloadTracker** (namespaced)
+   - Defines which workloads to monitor for a StagedUpdateRun
+   - The name and namespace must match the StagedUpdateRun name and namespace
+   - Specifies each workload's name and namespace
+   - Used by approval-request-controller to determine if a stage is ready for approval
+
+### Automated Approval Flow
+
+1. **Stage Initialization**
+   - User creates an UpdateRun (`ClusterStagedUpdateRun` or `StagedUpdateRun`) on the hub
+   - KubeFleet creates an ApprovalRequest (`ClusterApprovalRequest` or `ApprovalRequest`) for the first stage
+   - The ApprovalRequest enters "Pending" state, waiting for approval
+
+2. **Metric Collector Report Creation**
+   - Approval-request-controller watches the `ClusterApprovalRequest` and `ApprovalRequest` objects
+   - For each cluster in the current stage:
+     - Creates a `MetricCollectorReport` in the `fleet-member-<cluster-name>` namespace on the hub
+     - Sets `spec.prometheusUrl` to the Prometheus endpoint
+     - Each report is specific to one cluster
+
+3. **Metric Collection on Member Clusters**
+   - The metric-collector controller runs on each member cluster
+   - Watches for the `MetricCollectorReport` in its `fleet-member-<cluster-name>` namespace on the hub
+   - Every 30 seconds, it:
+     - Queries local Prometheus using the URL from the report spec with the PromQL query `workload_health`
+     - Prometheus returns metrics for all pods with the `prometheus.io/scrape: "true"` annotation
+     - Extracts workload health (1.0 = healthy, 0.0 = unhealthy)
+     - Updates the `MetricCollectorReport` status on the hub with the collected metrics
+
+4. **Health Evaluation**
+   - Approval-request-controller monitors `MetricCollectorReports` from all stage clusters
+   - Every 15 seconds, it:
+     - Fetches the appropriate workload tracker:
+       - For cluster-scoped: `ClusterStagedWorkloadTracker` with the same name as the ClusterStagedUpdateRun
+       - For namespace-scoped: `StagedWorkloadTracker` with the same name and namespace as the StagedUpdateRun
+     - For each cluster in the stage:
+       - Reads its `MetricCollectorReport` status from the `fleet-member-<cluster-name>` namespace
+       - Verifies all tracked workloads are present and healthy
+     - If any workload is missing or unhealthy, waits for the next cycle
+     - If ALL workloads across ALL clusters are healthy:
+       - Sets the ApprovalRequest condition `Approved: True`
+       - KubeFleet proceeds to roll out the stage
+
+5. **Stage Progression**
+   - KubeFleet applies the update to the approved stage clusters
+   - Creates a new ApprovalRequest for the next stage (if any)
+   - The cycle repeats for each stage
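+
+To make the flow concrete, here is a sketch of a populated `MetricCollectorReport` as it might appear on the hub once the metric collector has reported. The field names follow this repo's `MetricCollectorReport` API; the object name, timestamp, and metric values are illustrative:
+
+```yaml
+apiVersion: autoapprove.kubernetes-fleet.io/v1alpha1
+kind: MetricCollectorReport
+metadata:
+  name: mc-example-cluster-staged-run-staging   # illustrative; created by the approval controller
+  namespace: fleet-member-kind-cluster-1        # one report per cluster in the stage
+spec:
+  # Written by the approval-request-controller on the hub
+  prometheusUrl: "http://prometheus.prometheus.svc.cluster.local:9090"
+status:
+  # Written by the metric-collector running on the member cluster
+  workloadsMonitored: 1
+  lastCollectionTime: "2025-01-01T12:00:30Z"
+  collectedMetrics:
+    - namespace: test-ns
+      workloadName: sample-metric-app
+      health: true   # workload_health == 1.0
+```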
+
+## Prerequisites
+
+- Docker for building images
+- Azure CLI (`az`) for ACR operations
+- kubectl configured with access to your clusters
+- Helm 3.x
+- KubeFleet installed on hub and member clusters
+- Azure Container Registry (ACR) with anonymous pull enabled
+
+## Building and Pushing Images to ACR
+
+Before installing the controllers, you need to build the Docker images and push them to Azure Container Registry (ACR).
+
+**Critical Note:** Enable anonymous pull on the ACR so that clusters can pull images without authentication. Be sure to disable anonymous pull or delete the ACR after testing.
+
+### 1. Create ACR with Anonymous Pull
+
+Create a resource group and ACR with Standard SKU (Basic SKU doesn't support anonymous pull):
+
+```bash
+# Create resource group
+az group create --name test-kubefleet-rg --location eastus
+
+# Create container registry with Standard SKU
+az acr create --resource-group test-kubefleet-rg --name myfleetacr --sku Standard
+
+# Login to ACR
+az acr login --name myfleetacr
+
+# Enable anonymous pull
+az acr update --name myfleetacr --anonymous-pull-enabled
+```
+
+From the `az acr create` output, note down the login server (e.g., `myfleetacr.azurecr.io`).
+
+### 2. Build and Push Images
+
+Export registry and tag variables:
+
+```bash
+export REGISTRY="myfleetacr.azurecr.io"
+export TAG="latest"
+
+cd approval-request-metric-collector
+```
+
+Build and push the approval-request-controller image:
+
+```bash
+docker buildx build \
+  --file docker/approval-request-controller.Dockerfile \
+  --tag ${REGISTRY}/approval-request-controller:${TAG} \
+  --platform=linux/amd64 \
+  --push \
+  .
+```
+
+Build and push the metric-collector image:
+
+```bash
+docker buildx build \
+  --file docker/metric-collector.Dockerfile \
+  --tag ${REGISTRY}/metric-collector:${TAG} \
+  --platform=linux/amd64 \
+  --push \
+  .
+```
+
+Build and push the metric-app image:
+
+```bash
+docker buildx build \
+  --file docker/metric-app.Dockerfile \
+  --tag ${REGISTRY}/metric-app:${TAG} \
+  --platform=linux/amd64 \
+  --push \
+  .
+```
+
+### 3. Verify Images in ACR
+
+List images in your ACR:
+
+```bash
+az acr repository list --name myfleetacr --output table
+```
+
+Expected output:
+```
+Result
+---------------------------
+approval-request-controller
+metric-app
+metric-collector
+```
+
+Verify tags for a specific image:
+
+```bash
+az acr repository show-tags --name myfleetacr --repository approval-request-controller --output table
+```
+
+Expected output:
+```
+Result
+--------
+latest
+```
+
+**You're now ready to proceed with the setup!** Your ACR contains all three required images that will be pulled by both kind and production clusters.
+
+### 4. Cleanup (After Testing)
+
+When you're done testing, delete the resource group to clean up all resources:
+
+```bash
+az group delete --name test-kubefleet-rg
+```
+
+## Setup Overview
+
+Before diving into the setup steps, here's a bird's-eye view of what you'll be building:
+
+### Architecture Components
+
+**Hub Cluster** - The control plane where you'll register clusters and deploy:
+1. **3 Member Clusters** (kind-cluster-1, kind-cluster-2, kind-cluster-3)
+   - Labeled with `environment=staging` or `environment=prod`
+   - These labels determine which stage each cluster belongs to during rollouts
+
+2. **Prometheus** (propagated to all clusters)
+   - Monitors workload health via `/metrics` endpoints
+   - Scrapes pods with the `prometheus.io/scrape: "true"` annotation
+   - Provides the `workload_health` metric (1.0 = healthy, 0.0 = unhealthy)
+
+3. **Approval Request Controller**
+   - Watches `ClusterApprovalRequest` and `ApprovalRequest` objects
+   - Creates MetricCollectorReports directly in `fleet-member-<cluster-name>` namespaces
+   - Evaluates workload health from the MetricCollectorReport status
+   - Auto-approves stages when all workloads are healthy
+
+4. **Sample Metric App** (will be rolled out to clusters)
+   - Simple Go application exposing a `/metrics` endpoint
+   - Reports `workload_health=1.0` by default
+   - Used to demonstrate health-based approvals
+
+**Member Clusters** - Where workloads run:
+1. **Metric Collector**
+   - Connects to the hub cluster to watch the MetricCollectorReport in its namespace
+   - Queries local Prometheus every 30 seconds using the URL from the MetricCollectorReport spec
+   - Updates the MetricCollectorReport status on the hub with collected health metrics
+
+2. **Prometheus** (received from hub)
+   - Runs on each member cluster
+   - Scrapes local workload metrics
+
+3. **Sample Metric App** (received from hub)
+   - Deployed via staged rollout
+   - Monitored for health during updates
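+
+Prometheus discovers these workloads through pod annotations. Below is a minimal sketch of the annotations a workload's pod template would carry in this setup; only `prometheus.io/scrape` is confirmed by this tutorial, while the port and path annotations are conventional companions and may differ from what `examples/prometheus/` actually expects:
+
+```yaml
+# Pod template metadata for a scrapeable workload (sketch)
+metadata:
+  annotations:
+    prometheus.io/scrape: "true"    # required for discovery in this setup
+    prometheus.io/port: "8080"      # assumption: the port serving /metrics
+    prometheus.io/path: "/metrics"  # assumption: the metrics path
+```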
+
+### WorkloadTracker - The Decision Maker
+
+The **WorkloadTracker** is a critical resource that tells the approval controller which workloads must be healthy before approving a stage. Without it, the controller doesn't know what to monitor.
+
+**Two Types:**
+
+1. **ClusterStagedWorkloadTracker** (for ClusterStagedUpdateRun)
+   - Cluster-scoped resource on the hub
+   - Name must exactly match the ClusterStagedUpdateRun name
+   - Example: If your UpdateRun is named `example-cluster-staged-run`, the tracker must also be named `example-cluster-staged-run`
+   - Contains a list of workloads (name + namespace) to monitor across all clusters in each stage
+
+2. **StagedWorkloadTracker** (for StagedUpdateRun)
+   - Namespace-scoped resource on the hub
+   - Name and namespace must exactly match the StagedUpdateRun
+   - Example: If your UpdateRun is `example-staged-run` in namespace `test-ns`, the tracker must be `example-staged-run` in `test-ns`
+   - Contains a list of workloads to monitor
+
+**How It Works:**
+```yaml
+# ClusterStagedWorkloadTracker example
+workloads:
+  - name: sample-metric-app    # Deployment name
+    namespace: test-ns         # Namespace where it runs
+```
+
+When the approval controller evaluates a stage:
+1. It fetches the WorkloadTracker that matches the UpdateRun name (and namespace)
+2. For each cluster in the stage, it reads the MetricCollectorReport
+3. It verifies that every workload listed in the tracker appears in the report with `health: true` (i.e., `workload_health=1.0`)
+4. Only when ALL workloads in ALL clusters are healthy does it approve the stage
+
+**Critical Rule:** The WorkloadTracker must be created BEFORE starting the UpdateRun. If the controller can't find a matching tracker, it won't approve any stages. A full manifest for the tracker used in this tutorial is sketched below.
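+
+A complete manifest would look roughly like this sketch; see `examples/workloadtracker/clusterstagedworkloadtracker.yaml` for the authoritative version. Note that `workloads` is a top-level field on this CRD, not nested under `spec`:
+
+```yaml
+apiVersion: autoapprove.kubernetes-fleet.io/v1alpha1
+kind: ClusterStagedWorkloadTracker
+metadata:
+  name: example-cluster-staged-run   # must match the ClusterStagedUpdateRun name
+workloads:
+  - name: sample-metric-app
+    namespace: test-ns
+```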
+
+### The Staged Rollout Flow
+
+When you create a **ClusterStagedUpdateRun** or **StagedUpdateRun**, here's what happens:
+
+1. **Stage 1 (staging)**: Rollout starts with `kind-cluster-1`
+   - KubeFleet creates an ApprovalRequest for the staging stage
+   - The approval controller creates a MetricCollectorReport in the `fleet-member-kind-cluster-1` namespace
+   - The metric collector on `kind-cluster-1` watches its report on the hub and updates the status with health metrics
+   - When `sample-metric-app` is healthy, the approval controller auto-approves
+   - KubeFleet proceeds with the rollout to `kind-cluster-1`
+
+2. **Stage 2 (prod)**: After staging succeeds
+   - KubeFleet creates an ApprovalRequest for the prod stage
+   - The approval controller creates MetricCollectorReports in `fleet-member-kind-cluster-2` and `fleet-member-kind-cluster-3`
+   - The metric collectors on both clusters watch their reports and update them with health data
+   - When ALL workloads across BOTH prod clusters are healthy, the stage is auto-approved
+   - KubeFleet completes the rollout to the production clusters
+
+### Key Resources You'll Create
+
+| Resource | Purpose | Where |
+|----------|---------|-------|
+| **MemberCluster** | Register member clusters with hub, apply stage labels | Hub |
+| **ClusterResourcePlacement** | Define what resources to propagate (Prometheus, sample-app) | Hub |
+| **StagedUpdateStrategy** | Define stages with label selectors and approval requirements | Hub |
+| **WorkloadTracker** | Specify which workloads to monitor for health | Hub |
+| **UpdateRun** | Start the staged rollout process | Hub |
+| **MetricCollectorReport** | Created by approval controller, updated by metric collector | Hub (fleet-member-* ns) |
+
+### What the Installation Scripts Do
+
+**`install-on-hub.sh`** (Approval Request Controller):
+- Takes the ACR registry URL and hub cluster name as parameters
+- Pulls the approval-request-controller image from ACR
+- Verifies KubeFleet CRDs are installed
+- Installs the controller via Helm with the custom CRDs (MetricCollectorReport, WorkloadTrackers)
+- Sets up RBAC for managing MetricCollectorReports and reading approval requests
+
+**`install-on-member.sh`** (Metric Collector):
+- Takes the ACR registry URL, hub cluster, and member cluster names as parameters
+- Pulls the metric-collector image from ACR
+- Creates a service account with a hub cluster access token and RBAC for watching/updating MetricCollectorReports
+- Installs the metric-collector via Helm on each member cluster
+- Configures the connection to the hub API server to watch reports and to local Prometheus for metrics
+
+With this understanding, you're ready to start the setup!
+
+## Setup
+
+### Prerequisites
+
+Before starting this tutorial, ensure you have:
+- A KubeFleet hub cluster with fleet controllers installed
+- Three member clusters joined to the hub cluster
+- kubectl configured with access to the hub cluster context
+
+### 1. Label Member Clusters for Staged Rollout
+
+The staged rollout uses labels to determine which clusters belong to each stage. Label your three member clusters appropriately:
+
+```bash
+# Switch to hub cluster context
+kubectl config use-context <hub-context>
+
+# Label the first cluster for staging (Stage 1)
+# Replace <cluster-1> with your actual cluster name (e.g., kind-cluster-1, aks-cluster-1, etc.)
+kubectl label membercluster <cluster-1> environment=staging --overwrite
+kubectl label membercluster <cluster-1> kubernetes-fleet.io/cluster-name=<cluster-1> --overwrite
+
+# Label the second cluster for production (Stage 2)
+# Replace <cluster-2> with your actual cluster name
+kubectl label membercluster <cluster-2> environment=prod --overwrite
+kubectl label membercluster <cluster-2> kubernetes-fleet.io/cluster-name=<cluster-2> --overwrite
+
+# Label the third cluster for production (Stage 2)
+# Replace <cluster-3> with your actual cluster name
+kubectl label membercluster <cluster-3> environment=prod --overwrite
+kubectl label membercluster <cluster-3> kubernetes-fleet.io/cluster-name=<cluster-3> --overwrite
+
+# Verify the labels are applied
+kubectl get membercluster --show-labels
+```
+
+Expected output:
+```bash
+NAME        JOINED   AGE   LABELS
+cluster-1   True     5m    environment=staging,kubernetes-fleet.io/cluster-name=cluster-1,...
+cluster-2   True     5m    environment=prod,kubernetes-fleet.io/cluster-name=cluster-2,...
+cluster-3   True     5m    environment=prod,kubernetes-fleet.io/cluster-name=cluster-3,...
+```
+
+These labels are used by the `StagedUpdateStrategy` to select clusters for each stage:
+- **Stage 1 (staging)**: Selects clusters with `environment=staging` → cluster-1
+- **Stage 2 (prod)**: Selects clusters with `environment=prod` → cluster-2 and cluster-3
+
+### 2. Deploy Prometheus
+
+From the kubefleet-cookbook repo, navigate to the approval-request-metric-collector directory and deploy Prometheus for metrics collection:
+
+```bash
+cd approval-request-metric-collector
+
+# Switch to hub cluster context
+kubectl config use-context <hub-context>
+
+# Create prometheus namespace
+kubectl create ns prometheus
+
+# Deploy Prometheus (ConfigMap, Deployment, Service, RBAC, and CRP)
+# - ConfigMap: Contains Prometheus scrape configuration
+# - Deployment: Runs Prometheus server
+# - Service: Exposes Prometheus on port 9090
+# - RBAC: ServiceAccount, ClusterRole, and ClusterRoleBinding for pod discovery
+# - CRP: ClusterResourcePlacement to propagate Prometheus to all member clusters
+kubectl apply -f ./examples/prometheus/
+```
+
+This deploys Prometheus configured to scrape pods from all namespaces with the proper annotations.
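+
+The ConfigMap applied above presumably contains an annotation-based scrape job along these lines. This is a minimal sketch of the standard Kubernetes pod-discovery pattern, not a copy of the file in `examples/prometheus/`; consult that directory for the real configuration:
+
+```yaml
+# prometheus.yml (sketch): scrape only pods annotated prometheus.io/scrape: "true"
+scrape_configs:
+  - job_name: kubernetes-pods
+    kubernetes_sd_configs:
+      - role: pod
+    relabel_configs:
+      - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
+        action: keep
+        regex: "true"
+```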
+
+### 3. Deploy Sample Metric Application
+
+Create the test namespace and deploy the sample application:
+
+```bash
+# Create test namespace
+kubectl create ns test-ns
+
+# Deploy sample metric app
+# This creates a Deployment with a simple Go app that exposes a /metrics endpoint
+# The app reports workload_health=1.0 (healthy) by default
+# Note: Update the image reference in the YAML to use your ACR registry
+# Change "image: metric-app:local" to "image: ${REGISTRY}/metric-app:latest"
+# You can use sed to update it:
+sed "s|image: metric-app:local|image: ${REGISTRY}/metric-app:latest|" \
+  ./examples/sample-metric-app/sample-metric-app.yaml | kubectl apply -f -
+```
+
+**Alternative:** Manually edit `./examples/sample-metric-app/sample-metric-app.yaml` to change:
+```yaml
+image: metric-app:local
+imagePullPolicy: IfNotPresent
+```
+to:
+```yaml
+image: myfleetacr.azurecr.io/metric-app:latest
+imagePullPolicy: Always
+```
+Then apply: `kubectl apply -f ./examples/sample-metric-app/`
+
+### 4. Install Approval Request Controller (Hub Cluster)
+
+Install the approval request controller on the hub cluster using the ACR registry:
+
+```bash
+# Set your ACR registry name
+export REGISTRY="myfleetacr.azurecr.io"
+
+# Navigate to scripts directory and run the installation script
+cd scripts
+./install-on-hub.sh ${REGISTRY} <hub-cluster-name>
+cd ..
+```
+
+The script performs the following:
+1. Pulls the `approval-request-controller` image from your ACR
+2. Verifies that the required KubeFleet CRDs are installed
+3. Installs the controller via Helm with the custom CRDs (MetricCollectorReport, ClusterStagedWorkloadTracker, StagedWorkloadTracker)
+4. Verifies the installation
+
+### 5. Configure Workload Tracker
+
+Apply the appropriate workload tracker based on which type of staged update you'll use:
+
+#### For Cluster-Scoped Updates (ClusterStagedUpdateRun):
+
+```bash
+# Apply ClusterStagedWorkloadTracker
+# This defines which workloads to monitor for the staged rollout
+# The name "example-cluster-staged-run" must match the ClusterStagedUpdateRun name
+# Tracks: sample-metric-app in test-ns namespace
+kubectl apply -f ./examples/workloadtracker/clusterstagedworkloadtracker.yaml
+```
+
+#### For Namespace-Scoped Updates (StagedUpdateRun):
+
+```bash
+# Apply StagedWorkloadTracker
+# This defines which workloads to monitor for the namespace-scoped staged rollout
+# The name "example-staged-run" and namespace "test-ns" must match the StagedUpdateRun
+# Tracks: sample-metric-app in test-ns namespace
+kubectl apply -f ./examples/workloadtracker/stagedworkloadtracker.yaml
+```
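+
+Analogous to the cluster-scoped tracker shown earlier, the namespace-scoped manifest would look roughly like this sketch; see `examples/workloadtracker/stagedworkloadtracker.yaml` for the authoritative version:
+
+```yaml
+apiVersion: autoapprove.kubernetes-fleet.io/v1alpha1
+kind: StagedWorkloadTracker
+metadata:
+  name: example-staged-run   # must match the StagedUpdateRun name
+  namespace: test-ns         # must match the StagedUpdateRun namespace
+workloads:
+  - name: sample-metric-app
+    namespace: test-ns
+```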
+
+### 6. Install Metric Collector (Member Clusters)
+
+Install the metric collector on all member clusters using the ACR registry:
+
+```bash
+# Navigate to scripts directory
+cd scripts
+
+# Run the installation script for all member clusters
+# Replace <hub-cluster> with your hub cluster name (e.g., kind-hub, hub)
+# Replace <member-1>, <member-2>, <member-3> with your actual cluster names
+./install-on-member.sh ${REGISTRY} <hub-cluster> <member-1> <member-2> <member-3>
+
+# Example:
+# ./install-on-member.sh ${REGISTRY} kind-hub kind-cluster-1 kind-cluster-2 kind-cluster-3
+
+# Return to parent directory
+cd ..
+```
+
+The script performs the following:
+1. Pulls the `metric-collector` image from your ACR
+2. Creates a service account with a hub cluster access token and RBAC for watching/updating MetricCollectorReports
+3. Installs the metric collector via Helm on each member cluster
+4. Configures the connection to the hub API server and to local Prometheus
+
+### 7. Start the Staged Rollout
+
+Choose one of the two options below, matching the workload tracker you applied in step 5.
+
+#### Option A: Cluster-Scoped Staged Update (ClusterStagedUpdateRun)
+
+Switch back to the hub cluster and create a cluster-scoped staged update run:
+
+```bash
+# Switch to hub cluster context
+kubectl config use-context <hub-context>
+
+# Apply ClusterStagedUpdateStrategy
+# Defines the stages for the rollout: staging (cluster-1) -> prod (cluster-2, cluster-3)
+# Each stage requires approval before proceeding
+kubectl apply -f ./examples/updateRun/example-csus.yaml
+
+# Apply ClusterResourcePlacement for sample-metric-app
+# This is the resource that will be updated across stages
+# Selects the sample-metric-app deployment in test-ns namespace
+kubectl apply -f ./examples/updateRun/example-crp.yaml
+
+# Verify CRP is created
+kubectl get crp -A
+```
+
+Output:
+```bash
+NAME             GEN   SCHEDULED   SCHEDULED-GEN   AVAILABLE   AVAILABLE-GEN   AGE
+example-crp      1     True        1                                           4s
+prometheus-crp   1     True        1               True        1               3m1s
+```
+
+```bash
+# Apply ClusterStagedUpdateRun to start the staged rollout
+# This creates the actual update run that progresses through the defined stages
+# Name: example-cluster-staged-run (must match ClusterStagedWorkloadTracker)
+# References the ClusterResourcePlacement (example-crp) and ClusterStagedUpdateStrategy
+kubectl apply -f ./examples/updateRun/example-csur.yaml
+
+# Check the staged update run status
+kubectl get csur -A
+```
+
+#### Option B: Namespace-Scoped Staged Update (StagedUpdateRun)
+
+Alternatively, you can use namespace-scoped resources:
+
+```bash
+# Switch to hub cluster
+kubectl config use-context kind-hub
+```
+
+```bash
+# Apply namespace-scoped ClusterResourcePlacement
+# This CRP is configured to only place resources in the test-ns namespace
+# It is needed because a Namespace is a cluster-scoped resource and cannot be
+# propagated via a namespace-scoped ResourcePlacement (RP)
+kubectl apply -f ./examples/updateRun/example-ns-only-crp.yaml
+
+kubectl get crp -A
+```
+
+Output:
+```bash
+NAME             GEN   SCHEDULED   SCHEDULED-GEN   AVAILABLE   AVAILABLE-GEN   AGE
+ns-only-crp      1     True        1               True        1               5s
+prometheus-crp   1     True        1               True        1               2m34s
+```
+
+```bash
+# Apply StagedUpdateStrategy (namespace-scoped)
+# Defines the stages: staging (cluster-1) -> prod (cluster-2, cluster-3)
+# Each stage requires approval before proceeding
+kubectl apply -f ./examples/updateRun/example-sus.yaml
+
+# Apply ResourcePlacement (namespace-scoped)
+# This is the namespace-scoped version that works with the test-ns namespace
+# References the ns-only-crp ClusterResourcePlacement
+kubectl apply -f ./examples/updateRun/example-rp.yaml
+
+# Verify RP is created
+kubectl get rp -A
+```
+
+Output:
+```bash
+NAMESPACE   NAME         GEN   SCHEDULED   SCHEDULED-GEN   AVAILABLE   AVAILABLE-GEN   AGE
+test-ns     example-rp   1     True        1                                           35s
+```
+
+```bash
+# Apply StagedUpdateRun to start the staged rollout (namespace-scoped)
+# This creates the actual update run that progresses through the defined stages
+# Name: example-staged-run (must match StagedWorkloadTracker)
+# Namespace: test-ns (must match StagedWorkloadTracker namespace)
+# References the ResourcePlacement (example-rp)
+kubectl apply -f ./examples/updateRun/example-sur.yaml
+
+# Check the staged update run status
+kubectl get sur -A
+```
+
+Output:
+```bash
+NAMESPACE   NAME                 PLACEMENT    RESOURCE-SNAPSHOT-INDEX   POLICY-SNAPSHOT-INDEX   INITIALIZED   PROGRESSING   SUCCEEDED   AGE
+test-ns     example-staged-run   example-rp   0                         0                       True          True                      5s
+```
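+
+For orientation, the staged update strategy applied in this step presumably defines two stages selected by the `environment` label, each gated by an approval task. The sketch below follows the general shape of KubeFleet's staged update API; the resource name and exact apiVersion are assumptions, and `examples/updateRun/example-csus.yaml` remains the source of truth:
+
+```yaml
+apiVersion: placement.kubernetes-fleet.io/v1beta1   # may differ by KubeFleet version
+kind: ClusterStagedUpdateStrategy
+metadata:
+  name: example-csus   # assumption: the actual name lives in example-csus.yaml
+spec:
+  stages:
+    - name: staging
+      labelSelector:
+        matchLabels:
+          environment: staging
+      afterStageTasks:
+        - type: Approval   # KubeFleet creates a (Cluster)ApprovalRequest for this stage
+    - name: prod
+      labelSelector:
+        matchLabels:
+          environment: prod
+      afterStageTasks:
+        - type: Approval
+```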
+
+### 8. Monitor the Staged Rollout
+
+Watch the staged update progress:
+
+#### For Cluster-Scoped Updates:
+
+```bash
+# Check the staged update run status
+kubectl get csur -A
+
+# Check approval requests (should be auto-approved based on metrics)
+kubectl get clusterapprovalrequest -A
+```
+
+Output:
+```bash
+NAME                                       UPDATE-RUN                   STAGE     APPROVED   AGE
+example-cluster-staged-run-after-staging   example-cluster-staged-run   staging   True       2m9s
+```
+
+```bash
+# Check metric collector reports
+kubectl get metriccollectorreport -A
+```
+
+Output:
+```bash
+NAMESPACE                     NAME                                    WORKLOADS   LAST-COLLECTION   AGE
+fleet-member-kind-cluster-1   mc-example-cluster-staged-run-staging   1           27s               2m57s
+```
+
+#### For Namespace-Scoped Updates:
+
+```bash
+# Check the staged update run status
+kubectl get sur -A
+
+# Check approval requests (should be auto-approved based on metrics)
+kubectl get approvalrequest -A
+```
+
+Output:
+```bash
+NAMESPACE   NAME                               UPDATE-RUN           STAGE     APPROVED   AGE
+test-ns     example-staged-run-after-staging   example-staged-run   staging   True       64s
+```
+
+```bash
+# Check metric collector reports
+kubectl get metriccollectorreport -A
+```
+
+Output:
+```bash
+NAMESPACE                     NAME                            WORKLOADS   LAST-COLLECTION   AGE
+fleet-member-kind-cluster-1   mc-example-staged-run-staging   1           27s               57s
+```
+
+The approval controller will automatically approve stages when the metric collectors report that the workloads are healthy.
+
+## Verification
+
+### Check Controller Status
+
+On the hub cluster:
+```bash
+kubectl config use-context kind-hub
+kubectl get pods -n fleet-system
+kubectl logs -n fleet-system deployment/approval-request-controller -f
+```
+
+On member clusters:
+```bash
+kubectl config use-context kind-cluster-1
+kubectl get pods -n default
+kubectl logs -n default deployment/metric-collector -f
+```
+
+### Check Metrics Collection
+
+Verify that MetricCollectorReports are being created and updated on the hub:
+```bash
+kubectl config use-context kind-hub
+kubectl get metriccollectorreport -A
+```
+
+## Configuration
+
+### Approval Request Controller
+- Located in `charts/approval-request-controller/values.yaml`
+- Key settings: log level, resource limits, RBAC, CRD installation
+- Default Prometheus URL: `http://prometheus.prometheus.svc.cluster.local:9090`
+- Reconciliation interval: 15 seconds
+
+### Metric Collector
+- Located in `charts/metric-collector/values.yaml`
+- Key settings: hub cluster URL, Prometheus URL, member cluster name
+- Metric collection interval: 30 seconds
+- Connects to the hub using a service account token
+
+## Troubleshooting
+
+### Controller not starting
+- Check that all required CRDs are installed: `kubectl get crds | grep autoapprove.kubernetes-fleet.io`
+- Verify RBAC permissions are configured correctly
+
+### Metrics not being collected
+- Verify Prometheus is accessible: `kubectl port-forward -n prometheus svc/prometheus 9090:9090`
+- Check metric collector logs for connection errors
+- Ensure workloads have the Prometheus scrape annotations
+
+### Approvals not happening
+- Check that the appropriate workload tracker object exists
+- Check that the workload tracker name matches the update run name:
+  - For ClusterStagedUpdateRun: the ClusterStagedWorkloadTracker name must match
+  - For StagedUpdateRun: the StagedWorkloadTracker name and namespace must match
+- Verify the workload tracker lists the correct workload names and namespaces
+- Verify MetricCollectorReports are being created on the hub
+- Review approval-request-controller logs for decision-making details
+
+## Additional Resources
+
+- [Approval Request Controller README](./approval-request-controller/README.md)
+- [Metric Collector README](./metric-collector/README.md)
+- [KubeFleet Documentation](https://github.com/kubefleet-dev/kubefleet)
diff --git a/approval-request-metric-collector/apis/autoapprove/v1alpha1/doc.go b/approval-request-metric-collector/apis/autoapprove/v1alpha1/doc.go
new file mode 100644
index 0000000..9d38394
--- /dev/null
+++ b/approval-request-metric-collector/apis/autoapprove/v1alpha1/doc.go
@@ -0,0 +1,20 @@
+/*
+Copyright 2025 The KubeFleet Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+// Package v1alpha1 contains API Schema definitions for the autoapprove v1alpha1 API group
+// +kubebuilder:object:generate=true
+// +groupName=autoapprove.kubernetes-fleet.io
+package v1alpha1
diff --git a/approval-request-metric-collector/apis/autoapprove/v1alpha1/groupversion_info.go b/approval-request-metric-collector/apis/autoapprove/v1alpha1/groupversion_info.go
new file mode 100644
index 0000000..6f1fbac
--- /dev/null
+++ b/approval-request-metric-collector/apis/autoapprove/v1alpha1/groupversion_info.go
@@ -0,0 +1,35 @@
+/*
+Copyright 2025 The KubeFleet Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+// +kubebuilder:object:generate=true
+// +groupName=autoapprove.kubernetes-fleet.io
+package v1alpha1
+
+import (
+	"k8s.io/apimachinery/pkg/runtime/schema"
+	"sigs.k8s.io/controller-runtime/pkg/scheme"
+)
+
+var (
+	// GroupVersion is group version used to register these objects
+	GroupVersion = schema.GroupVersion{Group: "autoapprove.kubernetes-fleet.io", Version: "v1alpha1"}
+
+	// SchemeBuilder is used to add go types to the GroupVersionKind scheme
+	SchemeBuilder = &scheme.Builder{GroupVersion: GroupVersion}
+
+	// AddToScheme adds the types in this group-version to the given scheme.
+	AddToScheme = SchemeBuilder.AddToScheme
+)
diff --git a/approval-request-metric-collector/apis/autoapprove/v1alpha1/metriccollectorreport_types.go b/approval-request-metric-collector/apis/autoapprove/v1alpha1/metriccollectorreport_types.go
new file mode 100644
index 0000000..d30e06c
--- /dev/null
+++ b/approval-request-metric-collector/apis/autoapprove/v1alpha1/metriccollectorreport_types.go
@@ -0,0 +1,104 @@
+/*
+Copyright 2025 The KubeFleet Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package v1alpha1
+
+import (
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+)
+
+// +genclient
+// +kubebuilder:object:root=true
+// +kubebuilder:subresource:status
+// +kubebuilder:resource:scope="Namespaced",shortName=mcr,categories={fleet,fleet-metrics}
+// +kubebuilder:storageversion
+// +kubebuilder:printcolumn:JSONPath=`.status.workloadsMonitored`,name="Workloads",type=integer
+// +kubebuilder:printcolumn:JSONPath=`.status.lastCollectionTime`,name="Last-Collection",type=date
+// +kubebuilder:printcolumn:JSONPath=`.metadata.creationTimestamp`,name="Age",type=date
+
+// MetricCollectorReport is created by the approval-request-controller on the hub cluster
+// in the fleet-member-{clusterName} namespace. The metric-collector on the member cluster
+// watches these reports and updates their status with collected metrics.
+//
+// Controller workflow:
+// 1. Approval-controller creates MetricCollectorReport with spec on hub
+// 2. Metric-collector watches MetricCollectorReport on hub (in fleet-member-{clusterName} namespace)
+// 3. Metric-collector queries Prometheus on member cluster
+// 4. Metric-collector updates MetricCollectorReport status on hub with collected metrics
+//
+// Namespace: fleet-member-{clusterName}
+// Name: derived from the UpdateRun and stage, e.g. mc-{updateRunName}-{stageName}
+type MetricCollectorReport struct {
+	metav1.TypeMeta   `json:",inline"`
+	metav1.ObjectMeta `json:"metadata,omitempty"`
+
+	Spec   MetricCollectorReportSpec   `json:"spec,omitempty"`
+	Status MetricCollectorReportStatus `json:"status,omitempty"`
+}
+
+// MetricCollectorReportSpec defines the configuration for metric collection.
+type MetricCollectorReportSpec struct {
+	// PrometheusURL is the URL of the Prometheus server on the member cluster
+	// Example: "http://prometheus.fleet-system.svc.cluster.local:9090"
+	PrometheusURL string `json:"prometheusUrl"`
+}
+
+// MetricCollectorReportStatus contains the collected metrics from the member cluster.
+type MetricCollectorReportStatus struct {
+	// Conditions represent the latest available observations of the report's state.
+	// +optional
+	Conditions []metav1.Condition `json:"conditions,omitempty"`
+
+	// WorkloadsMonitored is the count of workloads being monitored.
+	// +optional
+	WorkloadsMonitored int32 `json:"workloadsMonitored,omitempty"`
+
+	// LastCollectionTime is when metrics were last collected on the member cluster.
+	// +optional
+	LastCollectionTime *metav1.Time `json:"lastCollectionTime,omitempty"`
+
+	// CollectedMetrics contains the most recent metrics from each workload.
+	// +optional
+	CollectedMetrics []WorkloadMetrics `json:"collectedMetrics,omitempty"`
+}
+
+// WorkloadMetrics represents metrics collected from a single workload pod.
+type WorkloadMetrics struct {
+	// Namespace of the workload.
+	// +required
+	Namespace string `json:"namespace"`
+
+	// WorkloadName from the workload_health metric label.
+	// +required
+	WorkloadName string `json:"workloadName"`
+
+	// Health indicates if the workload is healthy (true=healthy, false=unhealthy).
+ // +required + Health bool `json:"health"` +} + +// +kubebuilder:object:root=true + +// MetricCollectorReportList contains a list of MetricCollectorReport. +type MetricCollectorReportList struct { + metav1.TypeMeta `json:",inline"` + metav1.ListMeta `json:"metadata,omitempty"` + Items []MetricCollectorReport `json:"items"` +} + +func init() { + SchemeBuilder.Register(&MetricCollectorReport{}, &MetricCollectorReportList{}) +} diff --git a/approval-request-metric-collector/apis/autoapprove/v1alpha1/workloadtracker_types.go b/approval-request-metric-collector/apis/autoapprove/v1alpha1/workloadtracker_types.go new file mode 100644 index 0000000..56925ee --- /dev/null +++ b/approval-request-metric-collector/apis/autoapprove/v1alpha1/workloadtracker_types.go @@ -0,0 +1,100 @@ +/* +Copyright 2025 The KubeFleet Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package v1alpha1 + +import ( + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +// WorkloadReference represents a workload to be tracked +type WorkloadReference struct { + // Name is the name of the workload + // +required + Name string `json:"name"` + + // Namespace is the namespace of the workload + // +required + Namespace string `json:"namespace"` +} + +// +genclient +// +genclient:nonNamespaced +// +kubebuilder:object:root=true +// +kubebuilder:resource:scope="Cluster",categories={fleet,fleet-placement} +// +kubebuilder:storageversion +// +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object + +// ClusterStagedWorkloadTracker expresses user intent to track certain workloads for a ClusterStagedUpdateRun. +// The name of this resource should match the name of the ClusterStagedUpdateRun it is used for. +// For example, if the ClusterStagedUpdateRun is named "example-cluster-staged-run", the +// ClusterStagedWorkloadTracker should also be named "example-cluster-staged-run". +type ClusterStagedWorkloadTracker struct { + metav1.TypeMeta `json:",inline"` + metav1.ObjectMeta `json:"metadata,omitempty"` + + // Workloads is a list of workloads to track + // +optional + Workloads []WorkloadReference `json:"workloads,omitempty"` +} + +// +kubebuilder:object:root=true +// +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object + +// ClusterStagedWorkloadTrackerList contains a list of ClusterStagedWorkloadTracker +type ClusterStagedWorkloadTrackerList struct { + metav1.TypeMeta `json:",inline"` + metav1.ListMeta `json:"metadata,omitempty"` + Items []ClusterStagedWorkloadTracker `json:"items"` +} + +// +genclient +// +kubebuilder:object:root=true +// +kubebuilder:resource:scope="Namespaced",categories={fleet,fleet-placement} +// +kubebuilder:storageversion +// +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object + +// StagedWorkloadTracker expresses user intent to track certain workloads for a StagedUpdateRun. +// The name and namespace of this resource should match the name and namespace of the StagedUpdateRun it is used for. 
+// For example, if the StagedUpdateRun is named "example-staged-run" in namespace "test-ns", the +// StagedWorkloadTracker should also be named "example-staged-run" in namespace "test-ns". +type StagedWorkloadTracker struct { + metav1.TypeMeta `json:",inline"` + metav1.ObjectMeta `json:"metadata,omitempty"` + + // Workloads is a list of workloads to track + // +optional + Workloads []WorkloadReference `json:"workloads,omitempty"` +} + +// +kubebuilder:object:root=true +// +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object + +// StagedWorkloadTrackerList contains a list of StagedWorkloadTracker +type StagedWorkloadTrackerList struct { + metav1.TypeMeta `json:",inline"` + metav1.ListMeta `json:"metadata,omitempty"` + Items []StagedWorkloadTracker `json:"items"` +} + +func init() { + SchemeBuilder.Register( + &ClusterStagedWorkloadTracker{}, + &ClusterStagedWorkloadTrackerList{}, + &StagedWorkloadTracker{}, + &StagedWorkloadTrackerList{}, + ) +} diff --git a/approval-request-metric-collector/apis/autoapprove/v1alpha1/zz_generated.deepcopy.go b/approval-request-metric-collector/apis/autoapprove/v1alpha1/zz_generated.deepcopy.go new file mode 100644 index 0000000..7e3ca6d --- /dev/null +++ b/approval-request-metric-collector/apis/autoapprove/v1alpha1/zz_generated.deepcopy.go @@ -0,0 +1,285 @@ +//go:build !ignore_autogenerated + +/* +Copyright 2025 The KubeFleet Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Code generated by controller-gen. DO NOT EDIT. + +package v1alpha1 + +import ( + "k8s.io/apimachinery/pkg/apis/meta/v1" + runtime "k8s.io/apimachinery/pkg/runtime" +) + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ClusterStagedWorkloadTracker) DeepCopyInto(out *ClusterStagedWorkloadTracker) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) + if in.Workloads != nil { + in, out := &in.Workloads, &out.Workloads + *out = make([]WorkloadReference, len(*in)) + copy(*out, *in) + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ClusterStagedWorkloadTracker. +func (in *ClusterStagedWorkloadTracker) DeepCopy() *ClusterStagedWorkloadTracker { + if in == nil { + return nil + } + out := new(ClusterStagedWorkloadTracker) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *ClusterStagedWorkloadTracker) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
+func (in *ClusterStagedWorkloadTrackerList) DeepCopyInto(out *ClusterStagedWorkloadTrackerList) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ListMeta.DeepCopyInto(&out.ListMeta) + if in.Items != nil { + in, out := &in.Items, &out.Items + *out = make([]ClusterStagedWorkloadTracker, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ClusterStagedWorkloadTrackerList. +func (in *ClusterStagedWorkloadTrackerList) DeepCopy() *ClusterStagedWorkloadTrackerList { + if in == nil { + return nil + } + out := new(ClusterStagedWorkloadTrackerList) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *ClusterStagedWorkloadTrackerList) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *MetricCollectorReport) DeepCopyInto(out *MetricCollectorReport) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) + out.Spec = in.Spec + in.Status.DeepCopyInto(&out.Status) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new MetricCollectorReport. +func (in *MetricCollectorReport) DeepCopy() *MetricCollectorReport { + if in == nil { + return nil + } + out := new(MetricCollectorReport) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *MetricCollectorReport) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *MetricCollectorReportList) DeepCopyInto(out *MetricCollectorReportList) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ListMeta.DeepCopyInto(&out.ListMeta) + if in.Items != nil { + in, out := &in.Items, &out.Items + *out = make([]MetricCollectorReport, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new MetricCollectorReportList. +func (in *MetricCollectorReportList) DeepCopy() *MetricCollectorReportList { + if in == nil { + return nil + } + out := new(MetricCollectorReportList) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *MetricCollectorReportList) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *MetricCollectorReportSpec) DeepCopyInto(out *MetricCollectorReportSpec) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new MetricCollectorReportSpec. +func (in *MetricCollectorReportSpec) DeepCopy() *MetricCollectorReportSpec { + if in == nil { + return nil + } + out := new(MetricCollectorReportSpec) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
+func (in *MetricCollectorReportStatus) DeepCopyInto(out *MetricCollectorReportStatus) { + *out = *in + if in.Conditions != nil { + in, out := &in.Conditions, &out.Conditions + *out = make([]v1.Condition, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } + if in.LastCollectionTime != nil { + in, out := &in.LastCollectionTime, &out.LastCollectionTime + *out = (*in).DeepCopy() + } + if in.CollectedMetrics != nil { + in, out := &in.CollectedMetrics, &out.CollectedMetrics + *out = make([]WorkloadMetrics, len(*in)) + copy(*out, *in) + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new MetricCollectorReportStatus. +func (in *MetricCollectorReportStatus) DeepCopy() *MetricCollectorReportStatus { + if in == nil { + return nil + } + out := new(MetricCollectorReportStatus) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *StagedWorkloadTracker) DeepCopyInto(out *StagedWorkloadTracker) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) + if in.Workloads != nil { + in, out := &in.Workloads, &out.Workloads + *out = make([]WorkloadReference, len(*in)) + copy(*out, *in) + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new StagedWorkloadTracker. +func (in *StagedWorkloadTracker) DeepCopy() *StagedWorkloadTracker { + if in == nil { + return nil + } + out := new(StagedWorkloadTracker) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *StagedWorkloadTracker) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *StagedWorkloadTrackerList) DeepCopyInto(out *StagedWorkloadTrackerList) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ListMeta.DeepCopyInto(&out.ListMeta) + if in.Items != nil { + in, out := &in.Items, &out.Items + *out = make([]StagedWorkloadTracker, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new StagedWorkloadTrackerList. +func (in *StagedWorkloadTrackerList) DeepCopy() *StagedWorkloadTrackerList { + if in == nil { + return nil + } + out := new(StagedWorkloadTrackerList) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *StagedWorkloadTrackerList) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *WorkloadMetrics) DeepCopyInto(out *WorkloadMetrics) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new WorkloadMetrics. +func (in *WorkloadMetrics) DeepCopy() *WorkloadMetrics { + if in == nil { + return nil + } + out := new(WorkloadMetrics) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
+func (in *WorkloadReference) DeepCopyInto(out *WorkloadReference) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new WorkloadReference. +func (in *WorkloadReference) DeepCopy() *WorkloadReference { + if in == nil { + return nil + } + out := new(WorkloadReference) + in.DeepCopyInto(out) + return out +} diff --git a/approval-request-metric-collector/charts/approval-request-controller/Chart.yaml b/approval-request-metric-collector/charts/approval-request-controller/Chart.yaml new file mode 100644 index 0000000..f5e253c --- /dev/null +++ b/approval-request-metric-collector/charts/approval-request-controller/Chart.yaml @@ -0,0 +1,6 @@ +apiVersion: v2 +name: approval-request-controller +description: A Helm chart for ApprovalRequest Controller on Hub Cluster +type: application +version: 0.1.0 +appVersion: "1.0" diff --git a/approval-request-metric-collector/charts/approval-request-controller/templates/_helpers.tpl b/approval-request-metric-collector/charts/approval-request-controller/templates/_helpers.tpl new file mode 100644 index 0000000..a603fac --- /dev/null +++ b/approval-request-metric-collector/charts/approval-request-controller/templates/_helpers.tpl @@ -0,0 +1,60 @@ +{{/* +Expand the name of the chart. +*/}} +{{- define "approval-request-controller.name" -}} +{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Create a default fully qualified app name. +*/}} +{{- define "approval-request-controller.fullname" -}} +{{- if .Values.fullnameOverride }} +{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- $name := default .Chart.Name .Values.nameOverride }} +{{- if contains $name .Release.Name }} +{{- .Release.Name | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }} +{{- end }} +{{- end }} +{{- end }} + +{{/* +Create chart name and version as used by the chart label. +*/}} +{{- define "approval-request-controller.chart" -}} +{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Common labels +*/}} +{{- define "approval-request-controller.labels" -}} +helm.sh/chart: {{ include "approval-request-controller.chart" . }} +{{ include "approval-request-controller.selectorLabels" . }} +{{- if .Chart.AppVersion }} +app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} +{{- end }} +app.kubernetes.io/managed-by: {{ .Release.Service }} +{{- end }} + +{{/* +Selector labels +*/}} +{{- define "approval-request-controller.selectorLabels" -}} +app.kubernetes.io/name: {{ include "approval-request-controller.name" . }} +app.kubernetes.io/instance: {{ .Release.Name }} +{{- end }} + +{{/* +Create the name of the service account to use +*/}} +{{- define "approval-request-controller.serviceAccountName" -}} +{{- if .Values.serviceAccount.create }} +{{- default (include "approval-request-controller.fullname" .) 
.Values.serviceAccount.name }} +{{- else }} +{{- default "default" .Values.serviceAccount.name }} +{{- end }} +{{- end }} diff --git a/approval-request-metric-collector/charts/approval-request-controller/templates/crds/autoapprove.kubernetes-fleet.io_clusterstagedworkloadtrackers.yaml b/approval-request-metric-collector/charts/approval-request-controller/templates/crds/autoapprove.kubernetes-fleet.io_clusterstagedworkloadtrackers.yaml new file mode 120000 index 0000000..89ed678 --- /dev/null +++ b/approval-request-metric-collector/charts/approval-request-controller/templates/crds/autoapprove.kubernetes-fleet.io_clusterstagedworkloadtrackers.yaml @@ -0,0 +1 @@ +../../../../config/crd/bases/autoapprove.kubernetes-fleet.io_clusterstagedworkloadtrackers.yaml \ No newline at end of file diff --git a/approval-request-metric-collector/charts/approval-request-controller/templates/crds/autoapprove.kubernetes-fleet.io_metriccollectorreports.yaml b/approval-request-metric-collector/charts/approval-request-controller/templates/crds/autoapprove.kubernetes-fleet.io_metriccollectorreports.yaml new file mode 120000 index 0000000..32b1524 --- /dev/null +++ b/approval-request-metric-collector/charts/approval-request-controller/templates/crds/autoapprove.kubernetes-fleet.io_metriccollectorreports.yaml @@ -0,0 +1 @@ +../../../../config/crd/bases/autoapprove.kubernetes-fleet.io_metriccollectorreports.yaml \ No newline at end of file diff --git a/approval-request-metric-collector/charts/approval-request-controller/templates/crds/autoapprove.kubernetes-fleet.io_stagedworkloadtrackers.yaml b/approval-request-metric-collector/charts/approval-request-controller/templates/crds/autoapprove.kubernetes-fleet.io_stagedworkloadtrackers.yaml new file mode 120000 index 0000000..db857c7 --- /dev/null +++ b/approval-request-metric-collector/charts/approval-request-controller/templates/crds/autoapprove.kubernetes-fleet.io_stagedworkloadtrackers.yaml @@ -0,0 +1 @@ +../../../../config/crd/bases/autoapprove.kubernetes-fleet.io_stagedworkloadtrackers.yaml \ No newline at end of file diff --git a/approval-request-metric-collector/charts/approval-request-controller/templates/deployment.yaml b/approval-request-metric-collector/charts/approval-request-controller/templates/deployment.yaml new file mode 100644 index 0000000..82d7905 --- /dev/null +++ b/approval-request-metric-collector/charts/approval-request-controller/templates/deployment.yaml @@ -0,0 +1,83 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ include "approval-request-controller.fullname" . }} + namespace: {{ .Release.Namespace }} + labels: + {{- include "approval-request-controller.labels" . | nindent 4 }} +spec: + replicas: {{ .Values.controller.replicas }} + selector: + matchLabels: + {{- include "approval-request-controller.selectorLabels" . | nindent 6 }} + template: + metadata: + {{- with .Values.podAnnotations }} + annotations: + {{- toYaml . | nindent 8 }} + {{- end }} + labels: + {{- include "approval-request-controller.selectorLabels" . | nindent 8 }} + spec: + {{- with .Values.imagePullSecrets }} + imagePullSecrets: + {{- toYaml . | nindent 8 }} + {{- end }} + serviceAccountName: {{ include "approval-request-controller.serviceAccountName" . 
}} + securityContext: + {{- toYaml .Values.podSecurityContext | nindent 8 }} + containers: + - name: controller + securityContext: + {{- toYaml .Values.securityContext | nindent 10 }} + image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}" + imagePullPolicy: {{ .Values.image.pullPolicy }} + command: + - /approval-request-controller + args: + - --metrics-bind-address=:{{ .Values.metrics.port }} + - --health-probe-bind-address=:{{ .Values.healthProbe.port }} + + ports: + {{- if .Values.metrics.enabled }} + - name: metrics + containerPort: {{ .Values.metrics.port }} + protocol: TCP + {{- end }} + {{- if .Values.healthProbe.enabled }} + - name: health + containerPort: {{ .Values.healthProbe.port }} + protocol: TCP + {{- end }} + + {{- if .Values.healthProbe.enabled }} + livenessProbe: + httpGet: + path: /healthz + port: health + initialDelaySeconds: 15 + periodSeconds: 20 + + readinessProbe: + httpGet: + path: /readyz + port: health + initialDelaySeconds: 5 + periodSeconds: 10 + {{- end }} + + resources: + {{- toYaml .Values.controller.resources | nindent 10 }} + + {{- with .Values.controller.nodeSelector }} + nodeSelector: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.controller.affinity }} + affinity: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.controller.tolerations }} + tolerations: + {{- toYaml . | nindent 8 }} + {{- end }} diff --git a/approval-request-metric-collector/charts/approval-request-controller/templates/rbac.yaml b/approval-request-metric-collector/charts/approval-request-controller/templates/rbac.yaml new file mode 100644 index 0000000..c7ff6f4 --- /dev/null +++ b/approval-request-metric-collector/charts/approval-request-controller/templates/rbac.yaml @@ -0,0 +1,72 @@ +{{- if .Values.rbac.create }} +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: {{ include "approval-request-controller.fullname" . }} + labels: + {{- include "approval-request-controller.labels" . 
| nindent 4 }} +rules: + # CRD access for checking prerequisites + - apiGroups: ["apiextensions.k8s.io"] + resources: ["customresourcedefinitions"] + verbs: ["get", "list"] + + # ApprovalRequest and ClusterApprovalRequest (KubeFleet resources) + - apiGroups: ["placement.kubernetes-fleet.io"] + resources: ["approvalrequests", "clusterapprovalrequests"] + verbs: ["get", "list", "watch", "update", "patch"] + - apiGroups: ["placement.kubernetes-fleet.io"] + resources: ["approvalrequests/status", "clusterapprovalrequests/status"] + verbs: ["update", "patch"] + - apiGroups: ["placement.kubernetes-fleet.io"] + resources: ["approvalrequests/finalizers", "clusterapprovalrequests/finalizers"] + verbs: ["update"] + + # MetricCollector and MetricCollectorReport (our custom resources) + - apiGroups: ["autoapprove.kubernetes-fleet.io"] + resources: ["metriccollectors", "metriccollectorreports"] + verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] + - apiGroups: ["autoapprove.kubernetes-fleet.io"] + resources: ["metriccollectors/status", "metriccollectorreports/status"] + verbs: ["update", "patch"] + + # ClusterResourcePlacement and ClusterResourceOverride (KubeFleet resources) + - apiGroups: ["placement.kubernetes-fleet.io"] + resources: ["clusterresourceplacements", "clusterresourceoverrides"] + verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] + + # UpdateRuns (KubeFleet resources) + - apiGroups: ["placement.kubernetes-fleet.io"] + resources: ["stagedupdateruns", "clusterstagedupdateruns"] + verbs: ["get", "list", "watch"] + + # WorkloadTracker (our custom resource) + - apiGroups: ["autoapprove.kubernetes-fleet.io"] + resources: ["clusterstagedworkloadtrackers", "stagedworkloadtrackers"] + verbs: ["get", "list", "watch"] + + # Events + - apiGroups: [""] + resources: ["events"] + verbs: ["create", "patch"] + + # Leader election + - apiGroups: ["coordination.k8s.io"] + resources: ["leases"] + verbs: ["get", "create", "update", "delete"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: {{ include "approval-request-controller.fullname" . }} + labels: + {{- include "approval-request-controller.labels" . | nindent 4 }} +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: {{ include "approval-request-controller.fullname" . }} +subjects: + - kind: ServiceAccount + name: {{ include "approval-request-controller.serviceAccountName" . }} + namespace: {{ .Release.Namespace }} +{{- end }} diff --git a/approval-request-metric-collector/charts/approval-request-controller/templates/serviceaccount.yaml b/approval-request-metric-collector/charts/approval-request-controller/templates/serviceaccount.yaml new file mode 100644 index 0000000..ba3fdd1 --- /dev/null +++ b/approval-request-metric-collector/charts/approval-request-controller/templates/serviceaccount.yaml @@ -0,0 +1,13 @@ +{{- if .Values.serviceAccount.create -}} +apiVersion: v1 +kind: ServiceAccount +metadata: + name: {{ include "approval-request-controller.serviceAccountName" . }} + namespace: {{ .Release.Namespace }} + labels: + {{- include "approval-request-controller.labels" . | nindent 4 }} + {{- with .Values.serviceAccount.annotations }} + annotations: + {{- toYaml . 
| nindent 4 }} + {{- end }} +{{- end }} diff --git a/approval-request-metric-collector/charts/approval-request-controller/values.yaml b/approval-request-metric-collector/charts/approval-request-controller/values.yaml new file mode 100644 index 0000000..89713c0 --- /dev/null +++ b/approval-request-metric-collector/charts/approval-request-controller/values.yaml @@ -0,0 +1,84 @@ +# Default values for approval-request-controller +# This is a YAML-formatted file. + +# Controller image configuration +image: + repository: approval-request-controller + pullPolicy: IfNotPresent + tag: "latest" + +imagePullSecrets: [] +nameOverride: "" +fullnameOverride: "" + +# Controller configuration +controller: + # Number of replicas + replicas: 1 + + # Log verbosity level (0-10) + logLevel: 2 + + # Resource requests and limits + resources: + limits: + cpu: 500m + memory: 512Mi + requests: + cpu: 100m + memory: 128Mi + + # Node selector + nodeSelector: {} + + # Tolerations + tolerations: [] + + # Affinity + affinity: {} + +# RBAC configuration +rbac: + create: true + +# ServiceAccount configuration +serviceAccount: + # Specifies whether a service account should be created + create: true + # Annotations to add to the service account + annotations: {} + # The name of the service account to use. + # If not set and create is true, a name is generated using the fullname template + name: "" + +# Pod annotations +podAnnotations: {} + +# Pod security context +podSecurityContext: + runAsNonRoot: true + runAsUser: 65532 + fsGroup: 65532 + +# Container security context +securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + readOnlyRootFilesystem: true + +# Metrics server configuration +metrics: + enabled: true + port: 8080 + +# Health probe configuration +healthProbe: + enabled: true + port: 8081 + +# CRD installation +crds: + # Install MetricCollectorReport CRD + install: true diff --git a/approval-request-metric-collector/charts/metric-collector/Chart.yaml b/approval-request-metric-collector/charts/metric-collector/Chart.yaml new file mode 100644 index 0000000..2ea221d --- /dev/null +++ b/approval-request-metric-collector/charts/metric-collector/Chart.yaml @@ -0,0 +1,16 @@ +apiVersion: v2 +name: metric-collector +description: MetricCollector for Kubernetes Fleet - Collects workload health metrics and reports to hub cluster +type: application +version: 0.1.0 +appVersion: "latest" +keywords: + - kubernetes + - fleet + - metrics + - monitoring +maintainers: + - name: KubeFleet Team +home: https://github.com/kubefleet-dev/kubefleet +sources: + - https://github.com/kubefleet-dev/kubefleet/tree/main/standalone-metric-collector diff --git a/approval-request-metric-collector/charts/metric-collector/templates/_helpers.tpl b/approval-request-metric-collector/charts/metric-collector/templates/_helpers.tpl new file mode 100644 index 0000000..653f3de --- /dev/null +++ b/approval-request-metric-collector/charts/metric-collector/templates/_helpers.tpl @@ -0,0 +1,60 @@ +{{/* +Expand the name of the chart. +*/}} +{{- define "metric-collector.name" -}} +{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Create a default fully qualified app name. 
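The approval-request-controller chart's values (shown earlier) are its full configuration surface; the templates derive everything else from them. As a quick illustration, a minimal override file for a local kind-based install might look like the sketch below (the file name is hypothetical, and the image coordinates assume an image already built and loaded into the cluster):

```yaml
# my-values.yaml (hypothetical); pass with: helm install -f my-values.yaml ...
image:
  repository: approval-request-controller  # assumes a locally built/loaded image
  tag: latest
  pullPolicy: IfNotPresent
controller:
  replicas: 1
  resources:
    requests:
      cpu: 100m
      memory: 128Mi
```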
+*/}} +{{- define "metric-collector.fullname" -}} +{{- if .Values.fullnameOverride }} +{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- $name := default .Chart.Name .Values.nameOverride }} +{{- if contains $name .Release.Name }} +{{- .Release.Name | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }} +{{- end }} +{{- end }} +{{- end }} + +{{/* +Create chart name and version as used by the chart label. +*/}} +{{- define "metric-collector.chart" -}} +{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Common labels +*/}} +{{- define "metric-collector.labels" -}} +helm.sh/chart: {{ include "metric-collector.chart" . }} +{{ include "metric-collector.selectorLabels" . }} +{{- if .Chart.AppVersion }} +app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} +{{- end }} +app.kubernetes.io/managed-by: {{ .Release.Service }} +{{- end }} + +{{/* +Selector labels +*/}} +{{- define "metric-collector.selectorLabels" -}} +app.kubernetes.io/name: {{ include "metric-collector.name" . }} +app.kubernetes.io/instance: {{ .Release.Name }} +{{- end }} + +{{/* +Create the name of the service account to use +*/}} +{{- define "metric-collector.serviceAccountName" -}} +{{- if .Values.serviceAccount.create }} +{{- default (include "metric-collector.fullname" .) .Values.serviceAccount.name }} +{{- else }} +{{- default "default" .Values.serviceAccount.name }} +{{- end }} +{{- end }} diff --git a/approval-request-metric-collector/charts/metric-collector/templates/crds/autoapprove.kubernetes-fleet.io_metriccollectorreports.yaml b/approval-request-metric-collector/charts/metric-collector/templates/crds/autoapprove.kubernetes-fleet.io_metriccollectorreports.yaml new file mode 120000 index 0000000..32b1524 --- /dev/null +++ b/approval-request-metric-collector/charts/metric-collector/templates/crds/autoapprove.kubernetes-fleet.io_metriccollectorreports.yaml @@ -0,0 +1 @@ +../../../../config/crd/bases/autoapprove.kubernetes-fleet.io_metriccollectorreports.yaml \ No newline at end of file diff --git a/approval-request-metric-collector/charts/metric-collector/templates/deployment.yaml b/approval-request-metric-collector/charts/metric-collector/templates/deployment.yaml new file mode 100644 index 0000000..1bff73a --- /dev/null +++ b/approval-request-metric-collector/charts/metric-collector/templates/deployment.yaml @@ -0,0 +1,158 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ include "metric-collector.fullname" . }} + namespace: {{ .Release.Namespace }} + labels: + {{- include "metric-collector.labels" . | nindent 4 }} +spec: + replicas: {{ .Values.controller.replicas }} + selector: + matchLabels: + {{- include "metric-collector.selectorLabels" . | nindent 6 }} + template: + metadata: + {{- with .Values.podAnnotations }} + annotations: + {{- toYaml . | nindent 8 }} + {{- end }} + labels: + {{- include "metric-collector.selectorLabels" . | nindent 8 }} + spec: + {{- with .Values.imagePullSecrets }} + imagePullSecrets: + {{- toYaml . | nindent 8 }} + {{- end }} + serviceAccountName: {{ include "metric-collector.serviceAccountName" . 
}} + securityContext: + {{- toYaml .Values.podSecurityContext | nindent 8 }} + containers: + - name: controller + securityContext: + {{- toYaml .Values.securityContext | nindent 10 }} + image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}" + imagePullPolicy: {{ .Values.image.pullPolicy }} + command: + - /metric-collector + args: + - --v={{ .Values.controller.logLevel }} + - --hub-qps=100 + - --hub-burst=200 + - --metrics-bind-address=:{{ .Values.metrics.port }} + - --health-probe-bind-address=:{{ .Values.healthProbe.port }} + - --leader-elect=false + env: + # Member cluster identity + - name: MEMBER_CLUSTER_NAME + value: {{ .Values.memberCluster.name | quote }} + + # Hub cluster connection + - name: HUB_SERVER_URL + value: {{ .Values.hubCluster.url | quote }} + + # Prometheus URL + - name: PROMETHEUS_URL + value: {{ .Values.prometheus.url | quote }} + + {{- if .Values.hubCluster.customHeader }} + - name: HUB_KUBE_HEADER + value: {{ .Values.hubCluster.customHeader | quote }} + {{- end }} + + {{- if .Values.hubCluster.auth.useCertificateAuth }} + # Certificate-based authentication + - name: IDENTITY_CERT + value: /etc/hub-certs/{{ .Values.hubCluster.auth.certSecretKey }} + - name: IDENTITY_KEY + value: /etc/hub-certs/{{ .Values.hubCluster.auth.keySecretKey }} + {{- else }} + # Token-based authentication + - name: CONFIG_PATH + value: /var/run/secrets/hub/{{ .Values.hubCluster.auth.tokenSecretKey }} + {{- end }} + + {{- if .Values.hubCluster.tls.insecure }} + - name: TLS_INSECURE + value: "true" + {{- else if .Values.hubCluster.tls.caSecretName }} + - name: HUB_CERTIFICATE_AUTHORITY + value: /etc/hub-ca/{{ .Values.hubCluster.tls.caSecretKey }} + {{- end }} + + volumeMounts: + {{- if .Values.hubCluster.auth.useCertificateAuth }} + - name: hub-certs + mountPath: /etc/hub-certs + readOnly: true + {{- else }} + - name: hub-token + mountPath: /var/run/secrets/hub + readOnly: true + {{- end }} + + {{- if and (not .Values.hubCluster.tls.insecure) .Values.hubCluster.tls.caSecretName }} + - name: hub-ca + mountPath: /etc/hub-ca + readOnly: true + {{- end }} + + ports: + {{- if .Values.metrics.enabled }} + - name: metrics + containerPort: {{ .Values.metrics.port }} + protocol: TCP + {{- end }} + {{- if .Values.healthProbe.enabled }} + - name: health + containerPort: {{ .Values.healthProbe.port }} + protocol: TCP + {{- end }} + + {{- if .Values.healthProbe.enabled }} + livenessProbe: + httpGet: + path: /healthz + port: health + initialDelaySeconds: 15 + periodSeconds: 20 + + readinessProbe: + httpGet: + path: /readyz + port: health + initialDelaySeconds: 5 + periodSeconds: 10 + {{- end }} + + resources: + {{- toYaml .Values.controller.resources | nindent 10 }} + + volumes: + {{- if .Values.hubCluster.auth.useCertificateAuth }} + - name: hub-certs + secret: + secretName: {{ .Values.hubCluster.auth.certSecretName }} + {{- else }} + - name: hub-token + secret: + secretName: {{ .Values.hubCluster.auth.tokenSecretName }} + {{- end }} + + {{- if and (not .Values.hubCluster.tls.insecure) .Values.hubCluster.tls.caSecretName }} + - name: hub-ca + secret: + secretName: {{ .Values.hubCluster.tls.caSecretName }} + {{- end }} + + {{- with .Values.controller.nodeSelector }} + nodeSelector: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.controller.affinity }} + affinity: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.controller.tolerations }} + tolerations: + {{- toYaml . 
| nindent 8 }}
+      {{- end }}
diff --git a/approval-request-metric-collector/charts/metric-collector/templates/hub-rbac.yaml b/approval-request-metric-collector/charts/metric-collector/templates/hub-rbac.yaml
new file mode 100644
index 0000000..5df3812
--- /dev/null
+++ b/approval-request-metric-collector/charts/metric-collector/templates/hub-rbac.yaml
@@ -0,0 +1,85 @@
+{{- if .Values.hubCluster.createRBAC }}
+# This template generates RBAC resources for the hub cluster
+# Apply this on the HUB cluster to grant the metric-collector permissions
+# to watch/update MetricCollectorReport resources in the fleet-member-<clusterName> namespace
+#
+# Usage:
+#   helm template metric-collector ./charts/metric-collector \
+#     --set hubCluster.createRBAC=true \
+#     --show-only templates/hub-rbac.yaml | kubectl apply -f - --context=hub-cluster
+#
+---
+# Role for MetricCollectorReport access in the fleet-member-<clusterName> namespace
+apiVersion: rbac.authorization.k8s.io/v1
+kind: Role
+metadata:
+  name: {{ include "metric-collector.fullname" . }}-report-access
+  namespace: fleet-member-{{ .Values.memberCluster.name }}
+  labels:
+    {{- include "metric-collector.labels" . | nindent 4 }}
+    app.kubernetes.io/component: hub-rbac
+  annotations:
+    helm.sh/resource-policy: keep
+rules:
+  # MetricCollectorReport access
+  - apiGroups: ["autoapprove.kubernetes-fleet.io"]
+    resources: ["metriccollectorreports"]
+    verbs: ["get", "list", "watch", "update", "patch"]
+  - apiGroups: ["autoapprove.kubernetes-fleet.io"]
+    resources: ["metriccollectorreports/status"]
+    verbs: ["update", "patch"]
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: RoleBinding
+metadata:
+  name: {{ include "metric-collector.fullname" . }}-report-access
+  namespace: fleet-member-{{ .Values.memberCluster.name }}
+  labels:
+    {{- include "metric-collector.labels" . | nindent 4 }}
+    app.kubernetes.io/component: hub-rbac
+    fleet.kubernetes.io/member-cluster: {{ .Values.memberCluster.name }}
+  annotations:
+    helm.sh/resource-policy: keep
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: Role
+  name: {{ include "metric-collector.fullname" . }}-report-access
+subjects:
+  - kind: ServiceAccount
+    name: {{ .Values.hubCluster.auth.serviceAccountName | default (include "metric-collector.serviceAccountName" .) }}
+    namespace: fleet-member-{{ .Values.memberCluster.name }}
+---
+# ClusterRole for reading ClusterStagedWorkloadTracker (cluster-scoped)
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  name: {{ include "metric-collector.fullname" . }}-workloadtracker-reader
+  labels:
+    {{- include "metric-collector.labels" . | nindent 4 }}
+    app.kubernetes.io/component: hub-rbac
+  annotations:
+    helm.sh/resource-policy: keep
+rules:
+  - apiGroups: ["autoapprove.kubernetes-fleet.io"]
+    resources: ["clusterstagedworkloadtrackers"]
+    verbs: ["get", "list", "watch"]
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+  name: {{ include "metric-collector.fullname" . }}-{{ .Values.memberCluster.name }}-workloadtracker
+  labels:
+    {{- include "metric-collector.labels" . | nindent 4 }}
+    app.kubernetes.io/component: hub-rbac
+    fleet.kubernetes.io/member-cluster: {{ .Values.memberCluster.name }}
+  annotations:
+    helm.sh/resource-policy: keep
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: {{ include "metric-collector.fullname" . }}-workloadtracker-reader
+subjects:
+  - kind: ServiceAccount
+    name: {{ .Values.hubCluster.auth.serviceAccountName | default (include "metric-collector.serviceAccountName" .)
}} + namespace: fleet-member-{{ .Values.memberCluster.name }} +{{- end }} diff --git a/approval-request-metric-collector/charts/metric-collector/templates/rbac-member.yaml b/approval-request-metric-collector/charts/metric-collector/templates/rbac-member.yaml new file mode 100644 index 0000000..3bd3b5c --- /dev/null +++ b/approval-request-metric-collector/charts/metric-collector/templates/rbac-member.yaml @@ -0,0 +1,44 @@ +{{- if .Values.rbac.create }} +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: {{ include "metric-collector.fullname" . }} + labels: + {{- include "metric-collector.labels" . | nindent 4 }} +rules: + # MetricCollector CRD access on member cluster + - apiGroups: ["autoapprove.kubernetes-fleet.io"] + resources: ["metriccollectors"] + verbs: ["get", "list", "watch", "update", "patch"] + - apiGroups: ["autoapprove.kubernetes-fleet.io"] + resources: ["metriccollectors/status"] + verbs: ["update", "patch"] + - apiGroups: ["autoapprove.kubernetes-fleet.io"] + resources: ["metriccollectors/finalizers"] + verbs: ["update"] + + # Events + - apiGroups: [""] + resources: ["events"] + verbs: ["create", "patch"] + + # Leader election + - apiGroups: ["coordination.k8s.io"] + resources: ["leases"] + verbs: ["get", "create", "update", "delete"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: {{ include "metric-collector.fullname" . }} + labels: + {{- include "metric-collector.labels" . | nindent 4 }} +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: {{ include "metric-collector.fullname" . }} +subjects: + - kind: ServiceAccount + name: {{ include "metric-collector.serviceAccountName" . }} + namespace: {{ .Release.Namespace }} +{{- end }} diff --git a/approval-request-metric-collector/charts/metric-collector/templates/serviceaccount.yaml b/approval-request-metric-collector/charts/metric-collector/templates/serviceaccount.yaml new file mode 100644 index 0000000..b5d081d --- /dev/null +++ b/approval-request-metric-collector/charts/metric-collector/templates/serviceaccount.yaml @@ -0,0 +1,13 @@ +{{- if .Values.serviceAccount.create -}} +apiVersion: v1 +kind: ServiceAccount +metadata: + name: {{ include "metric-collector.serviceAccountName" . }} + namespace: {{ .Release.Namespace }} + labels: + {{- include "metric-collector.labels" . | nindent 4 }} + {{- with .Values.serviceAccount.annotations }} + annotations: + {{- toYaml . | nindent 4 }} + {{- end }} +{{- end }} diff --git a/approval-request-metric-collector/charts/metric-collector/values.yaml b/approval-request-metric-collector/charts/metric-collector/values.yaml new file mode 100644 index 0000000..8af6dd9 --- /dev/null +++ b/approval-request-metric-collector/charts/metric-collector/values.yaml @@ -0,0 +1,140 @@ +# Default values for metric-collector +# This is a YAML-formatted file. +# Declare variables to be passed into your templates. 
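Note that the defaults that follow leave three required inputs empty: `memberCluster.name`, `hubCluster.url`, and `prometheus.url`. A minimal per-cluster override might look like this sketch (the cluster name, hub endpoint, and Prometheus address are assumptions drawn from this tutorial's examples):

```yaml
# member-values.yaml (hypothetical); pass with: helm install -f member-values.yaml ...
memberCluster:
  name: kind-cluster-1  # must match the MemberCluster name registered on the hub
hubCluster:
  url: https://hub-control-plane:6443  # assumed hub API server endpoint
  auth:
    useTokenAuth: true  # chart default; see the auth section below
prometheus:
  url: http://prometheus.prometheus.svc.cluster.local:9090  # service from examples/prometheus
```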
+ +# Controller image configuration +image: + repository: metric-collector + pullPolicy: IfNotPresent + tag: "latest" + +# Metric app image configuration (used in sample deployments) +metricApp: + image: + repository: metric-app + tag: "latest" + +imagePullSecrets: [] +nameOverride: "" +fullnameOverride: "" + +# Member cluster configuration +memberCluster: + # Name of the member cluster (required) + # This should match the cluster name in the fleet + name: "" + +# Hub cluster connection configuration +hubCluster: + # Hub API server URL (required) + # Example: https://hub-cluster.example.com:6443 + url: "" + + # Set to true to generate hub RBAC resources + # These resources must be applied on the hub cluster + createRBAC: false + + # Authentication configuration + auth: + # Token-based authentication (default) + useTokenAuth: true + tokenSecretName: "hub-token" + tokenSecretKey: "token" + + # Certificate-based authentication + useCertificateAuth: false + certSecretName: "" + certSecretKey: "tls.crt" + keySecretKey: "tls.key" + + # ServiceAccount details for RBAC binding on hub cluster + # Leave empty to use the default serviceAccount from this chart + serviceAccountName: "" + serviceAccountNamespace: "" + + # TLS configuration + tls: + # Skip TLS verification (not recommended for production) + insecure: false + # CA certificate for hub cluster + caSecretName: "" + caSecretKey: "ca.crt" + + # Custom header for hub requests (optional) + customHeader: "" + +# Prometheus configuration +prometheus: + # Prometheus URL (required) + # Example: http://prometheus.monitoring.svc.cluster.local:9090 + url: "" + +# Controller configuration +controller: + # Number of replicas + replicas: 1 + + # Collection interval (how often to scrape metrics) + collectionInterval: "30s" + + # Log verbosity level (0-10) + logLevel: 2 + + # Resource requests and limits + resources: + limits: + cpu: 200m + memory: 256Mi + requests: + cpu: 100m + memory: 128Mi + + # Node selector + nodeSelector: {} + + # Tolerations + tolerations: [] + + # Affinity + affinity: {} + +# RBAC configuration +rbac: + create: true + +# ServiceAccount configuration +serviceAccount: + # Specifies whether a service account should be created + create: true + # Annotations to add to the service account + annotations: {} + # The name of the service account to use. + # If not set and create is true, a name is generated using the fullname template + name: "" + +# Pod annotations +podAnnotations: {} + +# Pod security context +podSecurityContext: + runAsNonRoot: true + runAsUser: 65532 + fsGroup: 65532 + +# Container security context +securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + readOnlyRootFilesystem: true + +# Metrics server configuration +metrics: + enabled: true + port: 8080 + +# Health probe configuration +healthProbe: + enabled: true + port: 8081 diff --git a/approval-request-metric-collector/cmd/approvalrequestcontroller/main.go b/approval-request-metric-collector/cmd/approvalrequestcontroller/main.go new file mode 100644 index 0000000..f35bba1 --- /dev/null +++ b/approval-request-metric-collector/cmd/approvalrequestcontroller/main.go @@ -0,0 +1,164 @@ +/* +Copyright 2025 The KubeFleet Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
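With the default token authentication configured above, the deployment mounts a Secret named by `hubCluster.auth.tokenSecretName` (default `hub-token`, key `token`) at `/var/run/secrets/hub`. A minimal sketch of that Secret, assuming the chart is released into `fleet-system` and the token value is supplied by you:

```yaml
apiVersion: v1
kind: Secret
metadata:
  name: hub-token          # matches the hubCluster.auth.tokenSecretName default
  namespace: fleet-system  # assumed release namespace on the member cluster
type: Opaque
stringData:
  token: "<hub-serviceaccount-token>"  # placeholder: a token the hub API server accepts
```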
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package main + +import ( + "context" + "flag" + "fmt" + "os" + + apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1" + "k8s.io/apimachinery/pkg/runtime" + utilruntime "k8s.io/apimachinery/pkg/util/runtime" + clientgoscheme "k8s.io/client-go/kubernetes/scheme" + "k8s.io/client-go/rest" + "k8s.io/klog/v2" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/healthz" + "sigs.k8s.io/controller-runtime/pkg/log/zap" + metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server" + + localv1alpha1 "github.com/kubefleet-dev/kubefleet-cookbook/approval-request-metric-collector/apis/autoapprove/v1alpha1" + approvalcontroller "github.com/kubefleet-dev/kubefleet-cookbook/approval-request-metric-collector/pkg/controllers/approvalrequest" + placementv1beta1 "github.com/kubefleet-dev/kubefleet/apis/placement/v1beta1" +) + +var ( + scheme = runtime.NewScheme() +) + +func init() { + utilruntime.Must(clientgoscheme.AddToScheme(scheme)) + utilruntime.Must(placementv1beta1.AddToScheme(scheme)) + utilruntime.Must(localv1alpha1.AddToScheme(scheme)) + utilruntime.Must(apiextensionsv1.AddToScheme(scheme)) +} + +func main() { + var metricsAddr string + var probeAddr string + + flag.StringVar(&metricsAddr, "metrics-bind-address", ":8080", "The address the metric endpoint binds to.") + flag.StringVar(&probeAddr, "health-probe-bind-address", ":8081", "The address the probe endpoint binds to.") + + opts := zap.Options{ + Development: true, + } + opts.BindFlags(flag.CommandLine) + flag.Parse() + + ctrl.SetLogger(zap.New(zap.UseFlagOptions(&opts))) + + klog.InfoS("Starting ApprovalRequest Controller") + + config := ctrl.GetConfigOrDie() + + // Check required CRDs are installed before starting + if err := checkRequiredCRDs(config); err != nil { + klog.ErrorS(err, "Required CRDs not found") + os.Exit(1) + } + + mgr, err := ctrl.NewManager(config, ctrl.Options{ + Scheme: scheme, + Metrics: metricsserver.Options{ + BindAddress: metricsAddr, + }, + HealthProbeBindAddress: probeAddr, + }) + if err != nil { + klog.ErrorS(err, "Unable to create manager") + os.Exit(1) + } + + // Setup ApprovalRequest controller + approvalRequestReconciler := &approvalcontroller.Reconciler{ + Client: mgr.GetClient(), + } + if err = approvalRequestReconciler.SetupWithManagerForApprovalRequest(mgr); err != nil { + klog.ErrorS(err, "Unable to create controller", "controller", "ApprovalRequest") + os.Exit(1) + } + + // Setup ClusterApprovalRequest controller + clusterApprovalRequestReconciler := &approvalcontroller.Reconciler{ + Client: mgr.GetClient(), + } + if err = clusterApprovalRequestReconciler.SetupWithManagerForClusterApprovalRequest(mgr); err != nil { + klog.ErrorS(err, "Unable to create controller", "controller", "ClusterApprovalRequest") + os.Exit(1) + } + + if err := mgr.AddHealthzCheck("healthz", healthz.Ping); err != nil { + klog.ErrorS(err, "Unable to set up health check") + os.Exit(1) + } + + if err := mgr.AddReadyzCheck("readyz", healthz.Ping); err != nil { + klog.ErrorS(err, "Unable to set up ready check") + os.Exit(1) + } + + 
klog.InfoS("Starting manager") + if err := mgr.Start(ctrl.SetupSignalHandler()); err != nil { + klog.ErrorS(err, "Problem running manager") + os.Exit(1) + } +} + +// checkRequiredCRDs checks that all required CRDs are installed +func checkRequiredCRDs(config *rest.Config) error { + requiredCRDs := []string{ + "approvalrequests.placement.kubernetes-fleet.io", + "clusterapprovalrequests.placement.kubernetes-fleet.io", + "metriccollectorreports.autoapprove.kubernetes-fleet.io", + "clusterstagedworkloadtrackers.autoapprove.kubernetes-fleet.io", + "stagedworkloadtrackers.autoapprove.kubernetes-fleet.io", + "clusterstagedupdateruns.placement.kubernetes-fleet.io", + "stagedupdateruns.placement.kubernetes-fleet.io", + } + + klog.InfoS("Checking for required CRDs", "count", len(requiredCRDs)) + + c, err := client.New(config, client.Options{Scheme: scheme}) + if err != nil { + return err + } + + ctx := context.Background() + missingCRDs := []string{} + + for _, crdName := range requiredCRDs { + crd := &apiextensionsv1.CustomResourceDefinition{} + err := c.Get(ctx, client.ObjectKey{Name: crdName}, crd) + if err != nil { + klog.ErrorS(err, "CRD not found", "crd", crdName) + missingCRDs = append(missingCRDs, crdName) + } else { + klog.V(3).InfoS("CRD found", "crd", crdName) + } + } + + if len(missingCRDs) > 0 { + return fmt.Errorf("missing required CRDs: %v", missingCRDs) + } + + klog.InfoS("All required CRDs are installed") + return nil +} diff --git a/approval-request-metric-collector/cmd/metricapp/main.go b/approval-request-metric-collector/cmd/metricapp/main.go new file mode 100644 index 0000000..17dc094 --- /dev/null +++ b/approval-request-metric-collector/cmd/metricapp/main.go @@ -0,0 +1,30 @@ +package main + +import ( + "net/http" + + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promhttp" +) + +func main() { + // Define a simple gauge metric for health with labels + workloadHealth := prometheus.NewGauge( + prometheus.GaugeOpts{ + Name: "workload_health", + Help: "Indicates if the workload is healthy (1=healthy, 0=unhealthy)", + }, + ) + + // Set it to 1 (healthy) with labels + workloadHealth.Set(1) + + // Register metric with Prometheus default registry + prometheus.MustRegister(workloadHealth) + + // Expose metrics endpoint + http.Handle("/metrics", promhttp.Handler()) + + // Start HTTP server + http.ListenAndServe(":8080", nil) +} diff --git a/approval-request-metric-collector/cmd/metriccollector/main.go b/approval-request-metric-collector/cmd/metriccollector/main.go new file mode 100644 index 0000000..2a8020f --- /dev/null +++ b/approval-request-metric-collector/cmd/metriccollector/main.go @@ -0,0 +1,237 @@ +/* +Copyright 2025 The KubeFleet Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package main + +import ( + "context" + "flag" + "fmt" + "net/http" + "os" + + "k8s.io/apimachinery/pkg/runtime" + clientgoscheme "k8s.io/client-go/kubernetes/scheme" + "k8s.io/client-go/rest" + "k8s.io/klog/v2" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/cache" + "sigs.k8s.io/controller-runtime/pkg/healthz" + metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server" + + placementv1alpha1 "github.com/kubefleet-dev/kubefleet-cookbook/approval-request-metric-collector/apis/autoapprove/v1alpha1" + metriccollector "github.com/kubefleet-dev/kubefleet-cookbook/approval-request-metric-collector/pkg/controllers/metriccollector" + placementv1beta1 "github.com/kubefleet-dev/kubefleet/apis/placement/v1beta1" +) + +var ( + hubQPS = flag.Int("hub-qps", 100, "QPS for hub cluster client") + hubBurst = flag.Int("hub-burst", 200, "Burst for hub cluster client") + metricsAddr = flag.String("metrics-bind-address", ":8080", "The address the metric endpoint binds to.") + probeAddr = flag.String("health-probe-bind-address", ":8081", "The address the probe endpoint binds to.") + leaderElectionID = flag.String("leader-election-id", "metric-collector-leader", "The leader election ID.") + enableLeaderElect = flag.Bool("leader-elect", true, "Enable leader election for controller manager.") +) + +func main() { + klog.InitFlags(nil) + flag.Parse() + + klog.InfoS("Starting MetricCollector Controller") + + // Get member cluster identity + memberClusterName := os.Getenv("MEMBER_CLUSTER_NAME") + if memberClusterName == "" { + klog.ErrorS(nil, "MEMBER_CLUSTER_NAME environment variable not set") + os.Exit(1) + } + + // Construct hub namespace + hubNamespace := fmt.Sprintf("fleet-member-%s", memberClusterName) + klog.InfoS("Using hub namespace", "namespace", hubNamespace, "memberCluster", memberClusterName) + + // Build hub cluster config + hubConfig, err := buildHubConfig() + if err != nil { + klog.ErrorS(err, "Failed to build hub cluster config") + os.Exit(1) + } + hubConfig.QPS = float32(*hubQPS) + hubConfig.Burst = *hubBurst + + // Start controller + if err := Start(ctrl.SetupSignalHandler(), hubConfig, memberClusterName, hubNamespace); err != nil { + klog.ErrorS(err, "Failed to start controller") + os.Exit(1) + } +} + +// buildHubConfig creates hub cluster config from environment variables +// following the same pattern as member-agent +func buildHubConfig() (*rest.Config, error) { + hubURL := os.Getenv("HUB_SERVER_URL") + if hubURL == "" { + return nil, fmt.Errorf("HUB_SERVER_URL environment variable not set") + } + + // Check for custom headers + customHeader := os.Getenv("HUB_KUBE_HEADER") + + // Check TLS insecure flag + tlsInsecure := os.Getenv("TLS_INSECURE") == "true" + + // Initialize hub config + hubConfig := &rest.Config{ + Host: hubURL, + TLSClientConfig: rest.TLSClientConfig{ + Insecure: tlsInsecure, + }, + WrapTransport: func(rt http.RoundTripper) http.RoundTripper { + if customHeader != "" { + return &customHeaderTransport{ + Base: rt, + Header: customHeader, + } + } + return rt + }, + } + + // Check for certificate-based authentication + identityKey := os.Getenv("IDENTITY_KEY") + identityCert := os.Getenv("IDENTITY_CERT") + if identityKey != "" && identityCert != "" { + klog.InfoS("Using certificate-based authentication for hub cluster") + // Read certificate files + certData, err := os.ReadFile(identityCert) + if err != nil { + return nil, fmt.Errorf("failed to read identity cert: %w", err) + } + keyData, err := os.ReadFile(identityKey) + if err != nil { + 
return nil, fmt.Errorf("failed to read identity key: %w", err) + } + hubConfig.CertData = certData + hubConfig.KeyData = keyData + } else { + // Token-based authentication + klog.InfoS("Using token-based authentication for hub cluster") + configPath := os.Getenv("CONFIG_PATH") + if configPath == "" { + configPath = "/var/run/secrets/hub/token" + } + tokenData, err := os.ReadFile(configPath) + if err != nil { + return nil, fmt.Errorf("failed to read hub token from %s: %w", configPath, err) + } + hubConfig.BearerToken = string(tokenData) + } + + // Handle CA certificate + caBundle := os.Getenv("CA_BUNDLE") + hubCA := os.Getenv("HUB_CERTIFICATE_AUTHORITY") + if caBundle != "" { + klog.InfoS("Using CA bundle for hub cluster TLS") + caData, err := os.ReadFile(caBundle) + if err != nil { + return nil, fmt.Errorf("failed to read CA bundle: %w", err) + } + hubConfig.CAData = caData + } else if hubCA != "" { + klog.InfoS("Using hub certificate authority for hub cluster TLS") + caData, err := os.ReadFile(hubCA) + if err != nil { + return nil, fmt.Errorf("failed to read hub CA: %w", err) + } + hubConfig.CAData = caData + } else { + // If no CA specified, try to load system CA pool + klog.InfoS("No CA specified, using insecure connection or system CA pool") + } + + return hubConfig, nil +} + +// customHeaderTransport adds custom headers to requests +type customHeaderTransport struct { + Base http.RoundTripper + Header string +} + +func (t *customHeaderTransport) RoundTrip(req *http.Request) (*http.Response, error) { + req.Header.Add("X-Custom-Header", t.Header) + return t.Base.RoundTrip(req) +} + +// Start starts the controller with hub cluster connection +func Start(ctx context.Context, hubCfg *rest.Config, memberClusterName, hubNamespace string) error { + // Create scheme with required APIs + scheme := runtime.NewScheme() + if err := clientgoscheme.AddToScheme(scheme); err != nil { + return fmt.Errorf("failed to add client-go scheme: %w", err) + } + if err := placementv1alpha1.AddToScheme(scheme); err != nil { + return fmt.Errorf("failed to add placement v1alpha1 API to scheme: %w", err) + } + if err := placementv1beta1.AddToScheme(scheme); err != nil { + return fmt.Errorf("failed to add placement v1beta1 API to scheme: %w", err) + } + + // Create hub cluster manager - watches MetricCollectorReport in hub namespace + hubMgr, err := ctrl.NewManager(hubCfg, ctrl.Options{ + Scheme: scheme, + Cache: cache.Options{ + DefaultNamespaces: map[string]cache.Config{ + hubNamespace: {}, // Only watch fleet-member- + }, + }, + Metrics: metricsserver.Options{ + BindAddress: *metricsAddr, + }, + HealthProbeBindAddress: *probeAddr, + LeaderElection: *enableLeaderElect, + LeaderElectionID: *leaderElectionID, + }) + if err != nil { + return fmt.Errorf("failed to create hub manager: %w", err) + } + + // Setup MetricCollectorReport controller (watches hub, queries member Prometheus) + if err := (&metriccollector.Reconciler{ + HubClient: hubMgr.GetClient(), + }).SetupWithManager(hubMgr); err != nil { + return fmt.Errorf("failed to setup controller: %w", err) + } + + // Add health checks + if err := hubMgr.AddHealthzCheck("healthz", healthz.Ping); err != nil { + return fmt.Errorf("failed to add healthz check: %w", err) + } + if err := hubMgr.AddReadyzCheck("readyz", healthz.Ping); err != nil { + return fmt.Errorf("failed to add readyz check: %w", err) + } + + klog.InfoS("Starting MetricCollector controller", + "hubUrl", hubCfg.Host, + "hubNamespace", hubNamespace, + "memberCluster", memberClusterName, + "metricsAddr", 
*metricsAddr, + "probeAddr", *probeAddr) + + // Start hub manager (watches MetricCollectorReport on hub, queries Prometheus on member) + klog.InfoS("Starting hub manager", "namespace", hubNamespace) + return hubMgr.Start(ctx) +} diff --git a/approval-request-metric-collector/config/crd/bases/autoapprove.kubernetes-fleet.io_clusterstagedworkloadtrackers.yaml b/approval-request-metric-collector/config/crd/bases/autoapprove.kubernetes-fleet.io_clusterstagedworkloadtrackers.yaml new file mode 100644 index 0000000..79e83cf --- /dev/null +++ b/approval-request-metric-collector/config/crd/bases/autoapprove.kubernetes-fleet.io_clusterstagedworkloadtrackers.yaml @@ -0,0 +1,64 @@ +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.16.0 + name: clusterstagedworkloadtrackers.autoapprove.kubernetes-fleet.io +spec: + group: autoapprove.kubernetes-fleet.io + names: + categories: + - fleet + - fleet-placement + kind: ClusterStagedWorkloadTracker + listKind: ClusterStagedWorkloadTrackerList + plural: clusterstagedworkloadtrackers + singular: clusterstagedworkloadtracker + scope: Cluster + versions: + - name: v1alpha1 + schema: + openAPIV3Schema: + description: |- + ClusterStagedWorkloadTracker expresses user intent to track certain workloads for a ClusterStagedUpdateRun. + The name of this resource should match the name of the ClusterStagedUpdateRun it is used for. + For example, if the ClusterStagedUpdateRun is named "example-cluster-staged-run", the + ClusterStagedWorkloadTracker should also be named "example-cluster-staged-run". + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. 
+ More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + workloads: + description: Workloads is a list of workloads to track + items: + description: WorkloadReference represents a workload to be tracked + properties: + name: + description: Name is the name of the workload + type: string + namespace: + description: Namespace is the namespace of the workload + type: string + required: + - name + - namespace + type: object + type: array + type: object + served: true + storage: true diff --git a/approval-request-metric-collector/config/crd/bases/autoapprove.kubernetes-fleet.io_metriccollectorreports.yaml b/approval-request-metric-collector/config/crd/bases/autoapprove.kubernetes-fleet.io_metriccollectorreports.yaml new file mode 100644 index 0000000..9548c23 --- /dev/null +++ b/approval-request-metric-collector/config/crd/bases/autoapprove.kubernetes-fleet.io_metriccollectorreports.yaml @@ -0,0 +1,177 @@ +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.16.0 + name: metriccollectorreports.autoapprove.kubernetes-fleet.io +spec: + group: autoapprove.kubernetes-fleet.io + names: + categories: + - fleet + - fleet-metrics + kind: MetricCollectorReport + listKind: MetricCollectorReportList + plural: metriccollectorreports + shortNames: + - mcr + singular: metriccollectorreport + scope: Namespaced + versions: + - additionalPrinterColumns: + - jsonPath: .status.workloadsMonitored + name: Workloads + type: integer + - jsonPath: .status.lastCollectionTime + name: Last-Collection + type: date + - jsonPath: .metadata.creationTimestamp + name: Age + type: date + name: v1alpha1 + schema: + openAPIV3Schema: + description: |- + MetricCollectorReport is created by the approval-request-controller on the hub cluster + in the fleet-member-{clusterName} namespace. The metric-collector on the member cluster + watches these reports and updates their status with collected metrics. + + Controller workflow: + 1. Approval-controller creates MetricCollectorReport with spec on hub + 2. Metric-collector watches MetricCollectorReport on hub (in fleet-member-{clusterName} namespace) + 3. Metric-collector queries Prometheus on member cluster + 4. Metric-collector updates MetricCollectorReport status on hub with collected metrics + + Namespace: fleet-member-{clusterName} + Name: Matches the UpdateRun name + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: MetricCollectorReportSpec defines the configuration for metric + collection. 
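To make this workflow concrete, a report might look like the following after one collection cycle (a hedged example; names and values are illustrative, with the namespace assuming a member cluster called kind-cluster-1):

```yaml
apiVersion: autoapprove.kubernetes-fleet.io/v1alpha1
kind: MetricCollectorReport
metadata:
  name: example-staged-run                # matches the UpdateRun name
  namespace: fleet-member-kind-cluster-1  # the member cluster's hub namespace
spec:
  prometheusUrl: http://prometheus.prometheus.svc.cluster.local:9090
status:
  workloadsMonitored: 1
  lastCollectionTime: "2025-01-01T00:00:00Z"
  collectedMetrics:
    - workloadName: sample-metric-app
      namespace: test-ns
      health: true
```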
+ properties: + prometheusUrl: + description: |- + PrometheusURL is the URL of the Prometheus server on the member cluster + Example: "http://prometheus.fleet-system.svc.cluster.local:9090" + type: string + required: + - prometheusUrl + type: object + status: + description: MetricCollectorReportStatus contains the collected metrics + from the member cluster. + properties: + collectedMetrics: + description: CollectedMetrics contains the most recent metrics from + each workload. + items: + description: WorkloadMetrics represents metrics collected from a + single workload pod. + properties: + health: + description: Health indicates if the workload is healthy (true=healthy, + false=unhealthy). + type: boolean + namespace: + description: Namespace of the workload. + type: string + workloadName: + description: WorkloadName from the workload_health metric label. + type: string + required: + - health + - namespace + - workloadName + type: object + type: array + conditions: + description: Conditions represent the latest available observations + of the report's state. + items: + description: Condition contains details for one aspect of the current + state of this API Resource. + properties: + lastTransitionTime: + description: |- + lastTransitionTime is the last time the condition transitioned from one status to another. + This should be when the underlying condition changed. If that is not known, then using the time when the API field changed is acceptable. + format: date-time + type: string + message: + description: |- + message is a human readable message indicating details about the transition. + This may be an empty string. + maxLength: 32768 + type: string + observedGeneration: + description: |- + observedGeneration represents the .metadata.generation that the condition was set based upon. + For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date + with respect to the current state of the instance. + format: int64 + minimum: 0 + type: integer + reason: + description: |- + reason contains a programmatic identifier indicating the reason for the condition's last transition. + Producers of specific condition types may define expected values and meanings for this field, + and whether the values are considered a guaranteed API. + The value should be a CamelCase string. + This field may not be empty. + maxLength: 1024 + minLength: 1 + pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$ + type: string + status: + description: status of the condition, one of True, False, Unknown. + enum: + - "True" + - "False" + - Unknown + type: string + type: + description: type of condition in CamelCase or in foo.example.com/CamelCase. + maxLength: 316 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ + type: string + required: + - lastTransitionTime + - message + - reason + - status + - type + type: object + type: array + lastCollectionTime: + description: LastCollectionTime is when metrics were last collected + on the member cluster. + format: date-time + type: string + workloadsMonitored: + description: WorkloadsMonitored is the count of workloads being monitored. 
+ format: int32 + type: integer + type: object + type: object + served: true + storage: true + subresources: + status: {} diff --git a/approval-request-metric-collector/config/crd/bases/autoapprove.kubernetes-fleet.io_stagedworkloadtrackers.yaml b/approval-request-metric-collector/config/crd/bases/autoapprove.kubernetes-fleet.io_stagedworkloadtrackers.yaml new file mode 100644 index 0000000..ef221cd --- /dev/null +++ b/approval-request-metric-collector/config/crd/bases/autoapprove.kubernetes-fleet.io_stagedworkloadtrackers.yaml @@ -0,0 +1,64 @@ +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.16.0 + name: stagedworkloadtrackers.autoapprove.kubernetes-fleet.io +spec: + group: autoapprove.kubernetes-fleet.io + names: + categories: + - fleet + - fleet-placement + kind: StagedWorkloadTracker + listKind: StagedWorkloadTrackerList + plural: stagedworkloadtrackers + singular: stagedworkloadtracker + scope: Namespaced + versions: + - name: v1alpha1 + schema: + openAPIV3Schema: + description: |- + StagedWorkloadTracker expresses user intent to track certain workloads for a StagedUpdateRun. + The name and namespace of this resource should match the name and namespace of the StagedUpdateRun it is used for. + For example, if the StagedUpdateRun is named "example-staged-run" in namespace "test-ns", the + StagedWorkloadTracker should also be named "example-staged-run" in namespace "test-ns". + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + workloads: + description: Workloads is a list of workloads to track + items: + description: WorkloadReference represents a workload to be tracked + properties: + name: + description: Name is the name of the workload + type: string + namespace: + description: Namespace is the namespace of the workload + type: string + required: + - name + - namespace + type: object + type: array + type: object + served: true + storage: true diff --git a/approval-request-metric-collector/docker/approval-request-controller.Dockerfile b/approval-request-metric-collector/docker/approval-request-controller.Dockerfile new file mode 100644 index 0000000..7775210 --- /dev/null +++ b/approval-request-metric-collector/docker/approval-request-controller.Dockerfile @@ -0,0 +1,27 @@ +# Build stage +FROM golang:1.24 AS builder + +WORKDIR /workspace + +# Copy go mod files +COPY go.mod go.sum* ./ +RUN go mod download + +# Copy source code +COPY apis/ apis/ +COPY pkg/ pkg/ +COPY cmd/ cmd/ + +# Build the controller +ARG GOARCH=amd64 +RUN CGO_ENABLED=0 GOOS=linux GOARCH=${GOARCH} go build \ + -a -o approval-request-controller \ + ./cmd/approvalrequestcontroller + +# Runtime stage +FROM gcr.io/distroless/static:nonroot +WORKDIR / +COPY --from=builder /workspace/approval-request-controller . 
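To make the tracker contract concrete, here is a sketch pairing a StagedWorkloadTracker with the `example-staged-run` StagedUpdateRun used later in this tutorial (the tracked workload is an assumption based on the sample app):

```yaml
apiVersion: autoapprove.kubernetes-fleet.io/v1alpha1
kind: StagedWorkloadTracker
metadata:
  name: example-staged-run   # must match the StagedUpdateRun name
  namespace: test-ns         # must match the StagedUpdateRun namespace
workloads:
  - name: sample-metric-app  # workload whose workload_health metric gates approval
    namespace: test-ns
```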
+USER 65532:65532 + +ENTRYPOINT ["/approval-request-controller"] diff --git a/approval-request-metric-collector/docker/metric-app.Dockerfile b/approval-request-metric-collector/docker/metric-app.Dockerfile new file mode 100644 index 0000000..86100e3 --- /dev/null +++ b/approval-request-metric-collector/docker/metric-app.Dockerfile @@ -0,0 +1,27 @@ +# Build stage +FROM golang:1.24 AS builder + +WORKDIR /workspace + +# Copy go mod files +COPY go.mod go.sum* ./ +RUN go mod download + +# Copy source code +COPY apis/ apis/ +COPY pkg/ pkg/ +COPY cmd/ cmd/ + +# Build the application +ARG GOARCH=amd64 +RUN CGO_ENABLED=0 GOOS=linux GOARCH=${GOARCH} go build \ + -a -o metric-app \ + ./cmd/metricapp + +# Runtime stage +FROM gcr.io/distroless/static:nonroot +WORKDIR / +COPY --from=builder /workspace/metric-app . +USER 65532:65532 + +ENTRYPOINT ["/metric-app"] diff --git a/approval-request-metric-collector/docker/metric-collector.Dockerfile b/approval-request-metric-collector/docker/metric-collector.Dockerfile new file mode 100644 index 0000000..1ebff59 --- /dev/null +++ b/approval-request-metric-collector/docker/metric-collector.Dockerfile @@ -0,0 +1,27 @@ +# Build stage +FROM golang:1.24 AS builder + +WORKDIR /workspace + +# Copy go mod files +COPY go.mod go.sum* ./ +RUN go mod download + +# Copy source code +COPY apis/ apis/ +COPY pkg/ pkg/ +COPY cmd/ cmd/ + +# Build the collector +ARG GOARCH=amd64 +RUN CGO_ENABLED=0 GOOS=linux GOARCH=${GOARCH} go build \ + -a -o metric-collector \ + ./cmd/metriccollector + +# Runtime stage +FROM gcr.io/distroless/static:nonroot +WORKDIR / +COPY --from=builder /workspace/metric-collector . +USER 65532:65532 + +ENTRYPOINT ["/metric-collector"] diff --git a/approval-request-metric-collector/examples/membercluster/fleet_v1beta1_membercluster.yaml b/approval-request-metric-collector/examples/membercluster/fleet_v1beta1_membercluster.yaml new file mode 100644 index 0000000..ceb7d7b --- /dev/null +++ b/approval-request-metric-collector/examples/membercluster/fleet_v1beta1_membercluster.yaml @@ -0,0 +1,41 @@ +apiVersion: cluster.kubernetes-fleet.io/v1beta1 +kind: MemberCluster +metadata: + name: kind-cluster-1 + labels: + environment: staging + kubernetes-fleet.io/cluster-name: kind-cluster-1 +spec: + identity: + name: fleet-member-agent-cluster-1 + kind: ServiceAccount + namespace: fleet-system + apiGroup: "" +--- +apiVersion: cluster.kubernetes-fleet.io/v1beta1 +kind: MemberCluster +metadata: + name: kind-cluster-2 + labels: + environment: prod + kubernetes-fleet.io/cluster-name: kind-cluster-2 +spec: + identity: + name: fleet-member-agent-cluster-2 + kind: ServiceAccount + namespace: fleet-system + apiGroup: "" +--- +apiVersion: cluster.kubernetes-fleet.io/v1beta1 +kind: MemberCluster +metadata: + name: kind-cluster-3 + labels: + environment: prod + kubernetes-fleet.io/cluster-name: kind-cluster-3 +spec: + identity: + name: fleet-member-agent-cluster-3 + kind: ServiceAccount + namespace: fleet-system + apiGroup: "" \ No newline at end of file diff --git a/approval-request-metric-collector/examples/prometheus/configmap.yaml b/approval-request-metric-collector/examples/prometheus/configmap.yaml new file mode 100644 index 0000000..e0d33b7 --- /dev/null +++ b/approval-request-metric-collector/examples/prometheus/configmap.yaml @@ -0,0 +1,42 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: prometheus-config + namespace: prometheus +data: + prometheus.yml: | + global: + scrape_interval: 15s + evaluation_interval: 15s + + scrape_configs: + - job_name: 'kubernetes-pods' + 
kubernetes_sd_configs:
+          - role: pod
+        # Scrape pods from all namespaces
+        relabel_configs:
+          # Only scrape pods with prometheus.io/scrape annotation
+          - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
+            action: keep
+            regex: true
+          # Rewrite the target address to use the port from the prometheus.io/port annotation
+          - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
+            action: replace
+            regex: ([^:]+)(?::\d+)?;(\d+)
+            replacement: $1:$2
+            target_label: __address__
+          # Use the path from prometheus.io/path annotation or default /metrics
+          - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
+            action: replace
+            target_label: __metrics_path__
+            regex: (.+)
+          # Add pod metadata as labels
+          - source_labels: [__meta_kubernetes_namespace]
+            target_label: namespace
+          - source_labels: [__meta_kubernetes_pod_name]
+            target_label: pod
+          - source_labels: [__meta_kubernetes_pod_label_app]
+            target_label: app
+          # Map all remaining pod labels onto the scraped metrics
+          - action: labelmap
+            regex: __meta_kubernetes_pod_label_(.+)
diff --git a/approval-request-metric-collector/examples/prometheus/deployment.yaml b/approval-request-metric-collector/examples/prometheus/deployment.yaml
new file mode 100644
index 0000000..a922073
--- /dev/null
+++ b/approval-request-metric-collector/examples/prometheus/deployment.yaml
@@ -0,0 +1,48 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: prometheus
+  namespace: prometheus
+  labels:
+    app: prometheus
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: prometheus
+  template:
+    metadata:
+      labels:
+        app: prometheus
+    spec:
+      serviceAccountName: prometheus
+      containers:
+        - name: prometheus
+          image: prom/prometheus:v2.47.0
+          args:
+            - '--config.file=/etc/prometheus/prometheus.yml'
+            - '--storage.tsdb.path=/prometheus'
+            - '--web.console.libraries=/usr/share/prometheus/console_libraries'
+            - '--web.console.templates=/usr/share/prometheus/consoles'
+            - '--web.enable-lifecycle'
+          ports:
+            - name: web
+              containerPort: 9090
+          volumeMounts:
+            - name: prometheus-config
+              mountPath: /etc/prometheus
+            - name: prometheus-storage
+              mountPath: /prometheus
+          resources:
+            requests:
+              cpu: 100m
+              memory: 256Mi
+            limits:
+              cpu: 500m
+              memory: 512Mi
+      volumes:
+        - name: prometheus-config
+          configMap:
+            name: prometheus-config
+        - name: prometheus-storage
+          emptyDir: {}
diff --git a/approval-request-metric-collector/examples/prometheus/prometheus-crp.yaml b/approval-request-metric-collector/examples/prometheus/prometheus-crp.yaml
new file mode 100644
index 0000000..8243057
--- /dev/null
+++ b/approval-request-metric-collector/examples/prometheus/prometheus-crp.yaml
@@ -0,0 +1,22 @@
+apiVersion: placement.kubernetes-fleet.io/v1beta1
+kind: ClusterResourcePlacement
+metadata:
+  name: prometheus-crp
+spec:
+  resourceSelectors:
+    - group: ""
+      version: v1
+      kind: Namespace
+      name: prometheus
+    - group: "rbac.authorization.k8s.io"
+      version: v1
+      kind: ClusterRole
+      name: prometheus
+    - group: "rbac.authorization.k8s.io"
+      version: v1
+      kind: ClusterRoleBinding
+      name: prometheus
+  policy:
+    placementType: PickAll
+  strategy:
+    type: RollingUpdate
\ No newline at end of file
diff --git a/approval-request-metric-collector/examples/prometheus/rbac.yaml b/approval-request-metric-collector/examples/prometheus/rbac.yaml
new file mode 100644
index 0000000..4dd638d
--- /dev/null
+++ b/approval-request-metric-collector/examples/prometheus/rbac.yaml
@@ -0,0 +1,39 @@
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: 
prometheus + namespace: prometheus +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: prometheus +rules: + - apiGroups: [""] + resources: + - nodes + - nodes/proxy + - services + - endpoints + - pods + verbs: ["get", "list", "watch"] + - apiGroups: + - extensions + resources: + - ingresses + verbs: ["get", "list", "watch"] + - nonResourceURLs: ["/metrics"] + verbs: ["get"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: prometheus +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: prometheus +subjects: + - kind: ServiceAccount + name: prometheus + namespace: prometheus diff --git a/approval-request-metric-collector/examples/prometheus/service.yaml b/approval-request-metric-collector/examples/prometheus/service.yaml new file mode 100644 index 0000000..ff61964 --- /dev/null +++ b/approval-request-metric-collector/examples/prometheus/service.yaml @@ -0,0 +1,16 @@ +apiVersion: v1 +kind: Service +metadata: + name: prometheus + namespace: prometheus + labels: + app: prometheus +spec: + type: ClusterIP + ports: + - name: web + port: 9090 + targetPort: 9090 + protocol: TCP + selector: + app: prometheus diff --git a/approval-request-metric-collector/examples/sample-metric-app/sample-metric-app.yaml b/approval-request-metric-collector/examples/sample-metric-app/sample-metric-app.yaml new file mode 100644 index 0000000..5deb993 --- /dev/null +++ b/approval-request-metric-collector/examples/sample-metric-app/sample-metric-app.yaml @@ -0,0 +1,27 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: sample-metric-app + namespace: test-ns + labels: + app: sample-metric-app +spec: + replicas: 1 + selector: + matchLabels: + app: sample-metric-app + template: + metadata: + labels: + app: sample-metric-app + annotations: + prometheus.io/scrape: "true" + prometheus.io/port: "8080" + prometheus.io/path: "/metrics" + spec: + containers: + - name: metric-app + image: metric-app:local + imagePullPolicy: IfNotPresent + ports: + - containerPort: 8080 diff --git a/approval-request-metric-collector/examples/updateRun/example-crp.yaml b/approval-request-metric-collector/examples/updateRun/example-crp.yaml new file mode 100644 index 0000000..21a0827 --- /dev/null +++ b/approval-request-metric-collector/examples/updateRun/example-crp.yaml @@ -0,0 +1,14 @@ +apiVersion: placement.kubernetes-fleet.io/v1beta1 +kind: ClusterResourcePlacement +metadata: + name: example-crp +spec: + resourceSelectors: + - group: "" + kind: Namespace + name: test-ns + version: v1 + policy: + placementType: PickAll + strategy: + type: External diff --git a/approval-request-metric-collector/examples/updateRun/example-csur.yaml b/approval-request-metric-collector/examples/updateRun/example-csur.yaml new file mode 100644 index 0000000..107f5fc --- /dev/null +++ b/approval-request-metric-collector/examples/updateRun/example-csur.yaml @@ -0,0 +1,10 @@ +apiVersion: placement.kubernetes-fleet.io/v1beta1 +kind: ClusterStagedUpdateRun +metadata: + name: example-cluster-staged-run +spec: + placementName: example-crp + resourceSnapshotIndex: "0" + stagedRolloutStrategyName: example-cluster-staged-strategy + state: Started + \ No newline at end of file diff --git a/approval-request-metric-collector/examples/updateRun/example-csus.yaml b/approval-request-metric-collector/examples/updateRun/example-csus.yaml new file mode 100644 index 0000000..14db148 --- /dev/null +++ b/approval-request-metric-collector/examples/updateRun/example-csus.yaml @@ -0,0 
+1,18 @@ +apiVersion: placement.kubernetes-fleet.io/v1beta1 +kind: ClusterStagedUpdateStrategy +metadata: + name: example-cluster-staged-strategy +spec: + stages: + - name: staging + labelSelector: + matchLabels: + environment: staging + afterStageTasks: + - type: Approval + - name: prod + labelSelector: + matchLabels: + environment: prod + afterStageTasks: + - type: Approval \ No newline at end of file diff --git a/approval-request-metric-collector/examples/updateRun/example-ns-only-crp.yaml b/approval-request-metric-collector/examples/updateRun/example-ns-only-crp.yaml new file mode 100644 index 0000000..ddff3f0 --- /dev/null +++ b/approval-request-metric-collector/examples/updateRun/example-ns-only-crp.yaml @@ -0,0 +1,15 @@ +apiVersion: placement.kubernetes-fleet.io/v1beta1 +kind: ClusterResourcePlacement +metadata: + name: ns-only-crp +spec: + resourceSelectors: + - group: "" + kind: Namespace + name: test-ns + version: v1 + selectionScope: NamespaceOnly + policy: + placementType: PickAll + strategy: + type: RollingUpdate \ No newline at end of file diff --git a/approval-request-metric-collector/examples/updateRun/example-rp.yaml b/approval-request-metric-collector/examples/updateRun/example-rp.yaml new file mode 100644 index 0000000..0836868 --- /dev/null +++ b/approval-request-metric-collector/examples/updateRun/example-rp.yaml @@ -0,0 +1,15 @@ +apiVersion: placement.kubernetes-fleet.io/v1beta1 +kind: ResourcePlacement +metadata: + name: example-rp + namespace: test-ns +spec: + resourceSelectors: + - group: "apps" + kind: Deployment + name: sample-metric-app + version: v1 + policy: + placementType: PickAll + strategy: + type: External \ No newline at end of file diff --git a/approval-request-metric-collector/examples/updateRun/example-sur.yaml b/approval-request-metric-collector/examples/updateRun/example-sur.yaml new file mode 100644 index 0000000..e045585 --- /dev/null +++ b/approval-request-metric-collector/examples/updateRun/example-sur.yaml @@ -0,0 +1,11 @@ +apiVersion: placement.kubernetes-fleet.io/v1beta1 +kind: StagedUpdateRun +metadata: + name: example-staged-run + namespace: test-ns +spec: + placementName: example-rp + resourceSnapshotIndex: "0" + stagedRolloutStrategyName: example-staged-strategy + state: Started + \ No newline at end of file diff --git a/approval-request-metric-collector/examples/updateRun/example-sus.yaml b/approval-request-metric-collector/examples/updateRun/example-sus.yaml new file mode 100644 index 0000000..7b2798b --- /dev/null +++ b/approval-request-metric-collector/examples/updateRun/example-sus.yaml @@ -0,0 +1,19 @@ +apiVersion: placement.kubernetes-fleet.io/v1beta1 +kind: StagedUpdateStrategy +metadata: + name: example-staged-strategy + namespace: test-ns +spec: + stages: + - name: staging + labelSelector: + matchLabels: + environment: staging + afterStageTasks: + - type: Approval + - name: prod + labelSelector: + matchLabels: + environment: prod + afterStageTasks: + - type: Approval \ No newline at end of file diff --git a/approval-request-metric-collector/examples/workloadtracker/clusterstagedworkloadtracker.yaml b/approval-request-metric-collector/examples/workloadtracker/clusterstagedworkloadtracker.yaml new file mode 100644 index 0000000..343d05b --- /dev/null +++ b/approval-request-metric-collector/examples/workloadtracker/clusterstagedworkloadtracker.yaml @@ -0,0 +1,8 @@ +apiVersion: autoapprove.kubernetes-fleet.io/v1alpha1 +kind: ClusterStagedWorkloadTracker +metadata: + # The name must match the name of the ClusterStagedUpdateRun it is 
used for + name: example-cluster-staged-run +workloads: + - name: sample-metric-app + namespace: test-ns diff --git a/approval-request-metric-collector/examples/workloadtracker/stagedworkloadtracker.yaml b/approval-request-metric-collector/examples/workloadtracker/stagedworkloadtracker.yaml new file mode 100644 index 0000000..b54fce0 --- /dev/null +++ b/approval-request-metric-collector/examples/workloadtracker/stagedworkloadtracker.yaml @@ -0,0 +1,9 @@ +apiVersion: autoapprove.kubernetes-fleet.io/v1alpha1 +kind: StagedWorkloadTracker +metadata: + # The name and namespace must match the name and namespace of the StagedUpdateRun it is used for + name: example-staged-run + namespace: test-ns +workloads: + - name: sample-metric-app + namespace: test-ns diff --git a/approval-request-metric-collector/go.mod b/approval-request-metric-collector/go.mod new file mode 100644 index 0000000..1513223 --- /dev/null +++ b/approval-request-metric-collector/go.mod @@ -0,0 +1,72 @@ +module github.com/kubefleet-dev/kubefleet-cookbook/approval-request-metric-collector + +go 1.24.9 + +require ( + github.com/kubefleet-dev/kubefleet v0.1.2 + github.com/prometheus/client_golang v1.22.0 + k8s.io/api v0.34.1 + k8s.io/apiextensions-apiserver v0.34.1 + k8s.io/apimachinery v0.34.1 + k8s.io/client-go v0.34.1 + k8s.io/klog/v2 v2.130.1 + sigs.k8s.io/controller-runtime v0.22.4 +) + +require ( + github.com/beorn7/perks v1.0.1 // indirect + github.com/cespare/xxhash/v2 v2.3.0 // indirect + github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect + github.com/emicklei/go-restful/v3 v3.12.2 // indirect + github.com/evanphx/json-patch/v5 v5.9.11 // indirect + github.com/fsnotify/fsnotify v1.9.0 // indirect + github.com/fxamacker/cbor/v2 v2.9.0 // indirect + github.com/go-logr/logr v1.4.3 // indirect + github.com/go-logr/zapr v1.3.0 // indirect + github.com/go-openapi/jsonpointer v0.21.1 // indirect + github.com/go-openapi/jsonreference v0.21.0 // indirect + github.com/go-openapi/swag v0.23.1 // indirect + github.com/gogo/protobuf v1.3.2 // indirect + github.com/google/btree v1.1.3 // indirect + github.com/google/gnostic-models v0.7.0 // indirect + github.com/google/go-cmp v0.7.0 // indirect + github.com/google/uuid v1.6.0 // indirect + github.com/josharian/intern v1.0.0 // indirect + github.com/json-iterator/go v1.1.12 // indirect + github.com/mailru/easyjson v0.9.0 // indirect + github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect + github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee // indirect + github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect + github.com/onsi/gomega v1.37.0 // indirect + github.com/pkg/errors v0.9.1 // indirect + github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect + github.com/prometheus/client_model v0.6.2 // indirect + github.com/prometheus/common v0.62.0 // indirect + github.com/prometheus/procfs v0.15.1 // indirect + github.com/spf13/pflag v1.0.6 // indirect + github.com/x448/float16 v0.8.4 // indirect + go.goms.io/fleet-networking v0.3.3 // indirect + go.uber.org/multierr v1.11.0 // indirect + go.uber.org/zap v1.27.0 // indirect + go.yaml.in/yaml/v2 v2.4.2 // indirect + go.yaml.in/yaml/v3 v3.0.4 // indirect + golang.org/x/net v0.47.0 // indirect + golang.org/x/oauth2 v0.29.0 // indirect + golang.org/x/sync v0.18.0 // indirect + golang.org/x/sys v0.38.0 // indirect + golang.org/x/term v0.37.0 // indirect + golang.org/x/text v0.31.0 // indirect + golang.org/x/time v0.11.0 // indirect + 
gomodules.xyz/jsonpatch/v2 v2.4.0 // indirect + google.golang.org/protobuf v1.36.6 // indirect + gopkg.in/evanphx/json-patch.v4 v4.12.0 // indirect + gopkg.in/inf.v0 v0.9.1 // indirect + gopkg.in/yaml.v3 v3.0.1 // indirect + k8s.io/kube-openapi v0.0.0-20250710124328-f3f2b991d03b // indirect + k8s.io/metrics v0.32.3 // indirect + k8s.io/utils v0.0.0-20250604170112-4c0f3b243397 // indirect + sigs.k8s.io/json v0.0.0-20241014173422-cfa47c3a1cc8 // indirect + sigs.k8s.io/randfill v1.0.0 // indirect + sigs.k8s.io/structured-merge-diff/v6 v6.3.0 // indirect + sigs.k8s.io/yaml v1.6.0 // indirect +) diff --git a/approval-request-metric-collector/go.sum b/approval-request-metric-collector/go.sum new file mode 100644 index 0000000..90d0995 --- /dev/null +++ b/approval-request-metric-collector/go.sum @@ -0,0 +1,196 @@ +github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= +github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= +github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= +github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM= +github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/emicklei/go-restful/v3 v3.12.2 h1:DhwDP0vY3k8ZzE0RunuJy8GhNpPL6zqLkDf9B/a0/xU= +github.com/emicklei/go-restful/v3 v3.12.2/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc= +github.com/evanphx/json-patch v0.5.2 h1:xVCHIVMUu1wtM/VkR9jVZ45N3FhZfYMMYGorLCR8P3k= +github.com/evanphx/json-patch v0.5.2/go.mod h1:ZWS5hhDbVDyob71nXKNL0+PWn6ToqBHMikGIFbs31qQ= +github.com/evanphx/json-patch/v5 v5.9.11 h1:/8HVnzMq13/3x9TPvjG08wUGqBTmZBsCWzjTM0wiaDU= +github.com/evanphx/json-patch/v5 v5.9.11/go.mod h1:3j+LviiESTElxA4p3EMKAB9HXj3/XEtnUf6OZxqIQTM= +github.com/fsnotify/fsnotify v1.9.0 h1:2Ml+OJNzbYCTzsxtv8vKSFD9PbJjmhYF14k/jKC7S9k= +github.com/fsnotify/fsnotify v1.9.0/go.mod h1:8jBTzvmWwFyi3Pb8djgCCO5IBqzKJ/Jwo8TRcHyHii0= +github.com/fxamacker/cbor/v2 v2.9.0 h1:NpKPmjDBgUfBms6tr6JZkTHtfFGcMKsw3eGcmD/sapM= +github.com/fxamacker/cbor/v2 v2.9.0/go.mod h1:vM4b+DJCtHn+zz7h3FFp/hDAI9WNWCsZj23V5ytsSxQ= +github.com/go-logr/logr v1.4.3 h1:CjnDlHq8ikf6E492q6eKboGOC0T8CDaOvkHCIg8idEI= +github.com/go-logr/logr v1.4.3/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= +github.com/go-logr/zapr v1.3.0 h1:XGdV8XW8zdwFiwOA2Dryh1gj2KRQyOOoNmBy4EplIcQ= +github.com/go-logr/zapr v1.3.0/go.mod h1:YKepepNBd1u/oyhd/yQmtjVXmm9uML4IXUgMOwR8/Gg= +github.com/go-openapi/jsonpointer v0.21.1 h1:whnzv/pNXtK2FbX/W9yJfRmE2gsmkfahjMKB0fZvcic= +github.com/go-openapi/jsonpointer v0.21.1/go.mod h1:50I1STOfbY1ycR8jGz8DaMeLCdXiI6aDteEdRNNzpdk= +github.com/go-openapi/jsonreference v0.21.0 h1:Rs+Y7hSXT83Jacb7kFyjn4ijOuVGSvOdF2+tg1TRrwQ= +github.com/go-openapi/jsonreference v0.21.0/go.mod h1:LmZmgsrTkVg9LG4EaHeY8cBDslNPMo06cago5JNLkm4= +github.com/go-openapi/swag v0.23.1 h1:lpsStH0n2ittzTnbaSloVZLuB5+fvSY/+hnagBjSNZU= +github.com/go-openapi/swag v0.23.1/go.mod h1:STZs8TbRvEQQKUA+JZNAm3EWlgaOBGpyFDqQnDHMef0= +github.com/go-task/slim-sprig/v3 v3.0.0 h1:sUs3vkvUymDpBKi3qH1YSqBQk9+9D/8M2mN1vB6EwHI= +github.com/go-task/slim-sprig/v3 v3.0.0/go.mod 
h1:W848ghGpv3Qj3dhTPRyJypKRiqCdHZiAzKg9hl15HA8= +github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q= +github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q= +github.com/google/btree v1.1.3 h1:CVpQJjYgC4VbzxeGVHfvZrv1ctoYCAI8vbl07Fcxlyg= +github.com/google/btree v1.1.3/go.mod h1:qOPhT0dTNdNzV6Z/lhRX0YXUafgPLFUh+gZMl761Gm4= +github.com/google/gnostic-models v0.7.0 h1:qwTtogB15McXDaNqTZdzPJRHvaVJlAl+HVQnLmJEJxo= +github.com/google/gnostic-models v0.7.0/go.mod h1:whL5G0m6dmc5cPxKc5bdKdEN3UjI7OUGxBlw57miDrQ= +github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= +github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= +github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= +github.com/google/gofuzz v1.2.0 h1:xRy4A+RhZaiKjJ1bPfwQ8sedCA+YS2YcCHW6ec7JMi0= +github.com/google/gofuzz v1.2.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= +github.com/google/pprof v0.0.0-20250403155104-27863c87afa6 h1:BHT72Gu3keYf3ZEu2J0b1vyeLSOYI8bm5wbJM/8yDe8= +github.com/google/pprof v0.0.0-20250403155104-27863c87afa6/go.mod h1:boTsfXsheKC2y+lKOCMpSfarhxDeIzfZG1jqGcPl3cA= +github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= +github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY= +github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y= +github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM= +github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo= +github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8= +github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= +github.com/klauspost/compress v1.18.0 h1:c/Cqfb0r+Yi+JtIEq73FWXVkRonBlf0CRNYc8Zttxdo= +github.com/klauspost/compress v1.18.0/go.mod h1:2Pp+KzxcywXVXMr50+X0Q/Lsb43OQHYWRCY2AiWywWQ= +github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= +github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= +github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= +github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= +github.com/kubefleet-dev/kubefleet v0.1.2 h1:BUOwehI9iBavU6TEbebrSxtFXHwyOcY1eacHyfHEjxo= +github.com/kubefleet-dev/kubefleet v0.1.2/go.mod h1:EYDCdtdM02qQkH3Gm5/K1cHDy26f2LbM7WzVGn2saLs= +github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0SNc= +github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw= +github.com/mailru/easyjson v0.9.0 h1:PrnmzHw7262yW8sTBwxi1PdJA3Iw/EKBa8psRf7d9a4= +github.com/mailru/easyjson v0.9.0/go.mod h1:1+xMtQp2MRNVL/V1bOzuP3aP8VNwRW55fQUto+XFtTU= +github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= +github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg= +github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= +github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= +github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee h1:W5t00kpgFdJifH4BDsTlE89Zl93FEloxaWZfGcifgq8= 
+github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= +github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= +github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= +github.com/onsi/ginkgo/v2 v2.23.4 h1:ktYTpKJAVZnDT4VjxSbiBenUjmlL/5QkBEocaWXiQus= +github.com/onsi/ginkgo/v2 v2.23.4/go.mod h1:Bt66ApGPBFzHyR+JO10Zbt0Gsp4uWxu5mIOTusL46e8= +github.com/onsi/gomega v1.37.0 h1:CdEG8g0S133B4OswTDC/5XPSzE1OeP29QOioj2PID2Y= +github.com/onsi/gomega v1.37.0/go.mod h1:8D9+Txp43QWKhM24yyOBEdpkzN8FvJyAwecBgsU4KU0= +github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= +github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U= +github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/prometheus/client_golang v1.22.0 h1:rb93p9lokFEsctTys46VnV1kLCDpVZ0a/Y92Vm0Zc6Q= +github.com/prometheus/client_golang v1.22.0/go.mod h1:R7ljNsLXhuQXYZYtw6GAE9AZg8Y7vEW5scdCXrWRXC0= +github.com/prometheus/client_model v0.6.2 h1:oBsgwpGs7iVziMvrGhE53c/GrLUsZdHnqNwqPLxwZyk= +github.com/prometheus/client_model v0.6.2/go.mod h1:y3m2F6Gdpfy6Ut/GBsUqTWZqCUvMVzSfMLjcu6wAwpE= +github.com/prometheus/common v0.62.0 h1:xasJaQlnWAeyHdUBeGjXmutelfJHWMRr+Fg4QszZ2Io= +github.com/prometheus/common v0.62.0/go.mod h1:vyBcEuLSvWos9B1+CyL7JZ2up+uFzXhkqml0W5zIY1I= +github.com/prometheus/procfs v0.15.1 h1:YagwOFzUgYfKKHX6Dr+sHT7km/hxC76UB0learggepc= +github.com/prometheus/procfs v0.15.1/go.mod h1:fB45yRUv8NstnjriLhBQLuOUt+WW4BsoGhij/e3PBqk= +github.com/rogpeppe/go-internal v1.13.1 h1:KvO1DLK/DRN07sQ1LQKScxyZJuNnedQ5/wKSR38lUII= +github.com/rogpeppe/go-internal v1.13.1/go.mod h1:uMEvuHeurkdAXX61udpOXGD/AzZDWNMNyH2VO9fmH0o= +github.com/spf13/pflag v1.0.6 h1:jFzHGLGAlb3ruxLB8MhbI6A8+AQX/2eW4qeyNZXNp2o= +github.com/spf13/pflag v1.0.6/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/objx v0.5.2 h1:xuMeJ0Sdp5ZMRXx/aWO6RZxdr3beISkG5/G/aIRr3pY= +github.com/stretchr/objx v0.5.2/go.mod h1:FRsXN1f5AsAjCGJKqEizvkpNtU+EGNCLh3NxZ/8L+MA= +github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= +github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA= +github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= +github.com/x448/float16 v0.8.4 h1:qLwI1I70+NjRFUR3zs1JPUCgaCXSh3SW62uAKT1mSBM= +github.com/x448/float16 v0.8.4/go.mod h1:14CWIYCyZA/cWjXOioeEpHeN/83MdbZDRQHoFcYsOfg= +github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= +github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= +go.goms.io/fleet-networking v0.3.3 h1:5rwBntaUoLF+E1CzaWAEL4GdvLJPQorKhjgkbLlllPE= +go.goms.io/fleet-networking v0.3.3/go.mod h1:Qgbi8M1fGaz/p5rtb6HJPmTDATWRnMt9HD1gz57WKUc= +go.uber.org/automaxprocs v1.6.0 h1:O3y2/QNTOdbF+e/dpXNNW7Rx2hZ4sTIPyybbxyNqTUs= +go.uber.org/automaxprocs v1.6.0/go.mod h1:ifeIMSnPZuznNm6jmdzmU3/bfk01Fe2fotchwEFJ8r8= +go.uber.org/goleak v1.3.0 
h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= +go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE= +go.uber.org/multierr v1.11.0 h1:blXXJkSxSSfBVBlC76pxqeO+LN3aDfLQo+309xJstO0= +go.uber.org/multierr v1.11.0/go.mod h1:20+QtiLqy0Nd6FdQB9TLXag12DsQkrbs3htMFfDN80Y= +go.uber.org/zap v1.27.0 h1:aJMhYGrd5QSmlpLMr2MftRKl7t8J8PTZPA732ud/XR8= +go.uber.org/zap v1.27.0/go.mod h1:GB2qFLM7cTU87MWRP2mPIjqfIDnGu+VIO4V/SdhGo2E= +go.yaml.in/yaml/v2 v2.4.2 h1:DzmwEr2rDGHl7lsFgAHxmNz/1NlQ7xLIrlN2h5d1eGI= +go.yaml.in/yaml/v2 v2.4.2/go.mod h1:081UH+NErpNdqlCXm3TtEran0rJZGxAYx9hb/ELlsPU= +go.yaml.in/yaml/v3 v3.0.4 h1:tfq32ie2Jv2UxXFdLJdh3jXuOzWiL1fo0bu/FbuKpbc= +go.yaml.in/yaml/v3 v3.0.4/go.mod h1:DhzuOOF2ATzADvBadXxruRBLzYTpT36CKvDb3+aBEFg= +golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= +golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= +golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= +golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= +golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= +golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= +golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= +golang.org/x/net v0.47.0 h1:Mx+4dIFzqraBXUugkia1OOvlD6LemFo1ALMHjrXDOhY= +golang.org/x/net v0.47.0/go.mod h1:/jNxtkgq5yWUGYkaZGqo27cfGZ1c5Nen03aYrrKpVRU= +golang.org/x/oauth2 v0.29.0 h1:WdYw2tdTK1S8olAzWHdgeqfy+Mtm9XNhv/xJsY65d98= +golang.org/x/oauth2 v0.29.0/go.mod h1:onh5ek6nERTohokkhCD/y2cV4Do3fxFHFuAejCkRWT8= +golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.18.0 h1:kr88TuHDroi+UVf+0hZnirlk8o8T+4MrK6mr60WkH/I= +golang.org/x/sync v0.18.0/go.mod h1:9KTHXmSnoGruLpwFjVSX0lNNA75CykiMECbovNTZqGI= +golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.38.0 h1:3yZWxaJjBmCWXqhN1qh02AkOnCQ1poK6oF+a7xWL6Gc= +golang.org/x/sys v0.38.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks= +golang.org/x/term v0.37.0 h1:8EGAD0qCmHYZg6J17DvsMy9/wJ7/D/4pV/wfnld5lTU= +golang.org/x/term v0.37.0/go.mod h1:5pB4lxRNYYVZuTLmy8oR2BH8dflOR+IbTYFD8fi3254= +golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= +golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/text v0.31.0 h1:aC8ghyu4JhP8VojJ2lEHBnochRno1sgL6nEi9WGFGMM= +golang.org/x/text v0.31.0/go.mod h1:tKRAlv61yKIjGGHX/4tP1LTbc13YSec1pxVEWXzfoeM= +golang.org/x/time v0.11.0 h1:/bpjEDfN9tkoN/ryeYHnv5hcMlc8ncjMcM4XBk5NWV0= +golang.org/x/time v0.11.0/go.mod 
h1:CDIdPxbZBQxdj6cxyCIdrNogrJKMJ7pr37NYpMcMDSg= +golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= +golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= +golang.org/x/tools v0.38.0 h1:Hx2Xv8hISq8Lm16jvBZ2VQf+RLmbd7wVUsALibYI/IQ= +golang.org/x/tools v0.38.0/go.mod h1:yEsQ/d/YK8cjh0L6rZlY8tgtlKiBNTL14pGDJPJpYQs= +golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +gomodules.xyz/jsonpatch/v2 v2.4.0 h1:Ci3iUJyx9UeRx7CeFN8ARgGbkESwJK+KB9lLcWxY/Zw= +gomodules.xyz/jsonpatch/v2 v2.4.0/go.mod h1:AH3dM2RI6uoBZxn3LVrfvJ3E0/9dG4cSrbuBJT4moAY= +google.golang.org/protobuf v1.36.6 h1:z1NpPI8ku2WgiWnf+t9wTPsn6eP1L7ksHUlkfLvd9xY= +google.golang.org/protobuf v1.36.6/go.mod h1:jduwjTPXsFjZGTmRluh+L6NjiWu7pchiJ2/5YcXBHnY= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= +gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= +gopkg.in/evanphx/json-patch.v4 v4.12.0 h1:n6jtcsulIzXPJaxegRbvFNNrZDjbij7ny3gmSPG+6V4= +gopkg.in/evanphx/json-patch.v4 v4.12.0/go.mod h1:p8EYWUEYMpynmqDbY58zCKCFZw8pRWMG4EsWvDvM72M= +gopkg.in/inf.v0 v0.9.1 h1:73M5CoZyi3ZLMOyDlQh031Cx6N9NDJ2Vvfl76EDAgDc= +gopkg.in/inf.v0 v0.9.1/go.mod h1:cWUDdTG/fYaXco+Dcufb5Vnc6Gp2YChqWtbxRZE0mXw= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +k8s.io/api v0.34.1 h1:jC+153630BMdlFukegoEL8E/yT7aLyQkIVuwhmwDgJM= +k8s.io/api v0.34.1/go.mod h1:SB80FxFtXn5/gwzCoN6QCtPD7Vbu5w2n1S0J5gFfTYk= +k8s.io/apiextensions-apiserver v0.34.1 h1:NNPBva8FNAPt1iSVwIE0FsdrVriRXMsaWFMqJbII2CI= +k8s.io/apiextensions-apiserver v0.34.1/go.mod h1:hP9Rld3zF5Ay2Of3BeEpLAToP+l4s5UlxiHfqRaRcMc= +k8s.io/apimachinery v0.34.1 h1:dTlxFls/eikpJxmAC7MVE8oOeP1zryV7iRyIjB0gky4= +k8s.io/apimachinery v0.34.1/go.mod h1:/GwIlEcWuTX9zKIg2mbw0LRFIsXwrfoVxn+ef0X13lw= +k8s.io/client-go v0.34.1 h1:ZUPJKgXsnKwVwmKKdPfw4tB58+7/Ik3CrjOEhsiZ7mY= +k8s.io/client-go v0.34.1/go.mod h1:kA8v0FP+tk6sZA0yKLRG67LWjqufAoSHA2xVGKw9Of8= +k8s.io/klog/v2 v2.130.1 h1:n9Xl7H1Xvksem4KFG4PYbdQCQxqc/tTUyrgXaOhHSzk= +k8s.io/klog/v2 v2.130.1/go.mod h1:3Jpz1GvMt720eyJH1ckRHK1EDfpxISzJ7I9OYgaDtPE= +k8s.io/kube-openapi v0.0.0-20250710124328-f3f2b991d03b h1:MloQ9/bdJyIu9lb1PzujOPolHyvO06MXG5TUIj2mNAA= +k8s.io/kube-openapi v0.0.0-20250710124328-f3f2b991d03b/go.mod h1:UZ2yyWbFTpuhSbFhv24aGNOdoRdJZgsIObGBUaYVsts= +k8s.io/metrics v0.32.3 h1:2vsBvw0v8rIIlczZ/lZ8Kcqk9tR6Fks9h+dtFNbc2a4= +k8s.io/metrics v0.32.3/go.mod h1:9R1Wk5cb+qJpCQon9h52mgkVCcFeYxcY+YkumfwHVCU= +k8s.io/utils v0.0.0-20250604170112-4c0f3b243397 h1:hwvWFiBzdWw1FhfY1FooPn3kzWuJ8tmbZBHi4zVsl1Y= +k8s.io/utils 
v0.0.0-20250604170112-4c0f3b243397/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= +sigs.k8s.io/controller-runtime v0.22.4 h1:GEjV7KV3TY8e+tJ2LCTxUTanW4z/FmNB7l327UfMq9A= +sigs.k8s.io/controller-runtime v0.22.4/go.mod h1:+QX1XUpTXN4mLoblf4tqr5CQcyHPAki2HLXqQMY6vh8= +sigs.k8s.io/json v0.0.0-20241014173422-cfa47c3a1cc8 h1:gBQPwqORJ8d8/YNZWEjoZs7npUVDpVXUUOFfW6CgAqE= +sigs.k8s.io/json v0.0.0-20241014173422-cfa47c3a1cc8/go.mod h1:mdzfpAEoE6DHQEN0uh9ZbOCuHbLK5wOm7dK4ctXE9Tg= +sigs.k8s.io/randfill v1.0.0 h1:JfjMILfT8A6RbawdsK2JXGBR5AQVfd+9TbzrlneTyrU= +sigs.k8s.io/randfill v1.0.0/go.mod h1:XeLlZ/jmk4i1HRopwe7/aU3H5n1zNUcX6TM94b3QxOY= +sigs.k8s.io/structured-merge-diff/v6 v6.3.0 h1:jTijUJbW353oVOd9oTlifJqOGEkUw2jB/fXCbTiQEco= +sigs.k8s.io/structured-merge-diff/v6 v6.3.0/go.mod h1:M3W8sfWvn2HhQDIbGWj3S099YozAsymCo/wrT5ohRUE= +sigs.k8s.io/yaml v1.6.0 h1:G8fkbMSAFqgEFgh4b1wmtzDnioxFCUgTZhlbj5P9QYs= +sigs.k8s.io/yaml v1.6.0/go.mod h1:796bPqUfzR/0jLAl6XjHl3Ck7MiyVv8dbTdyT3/pMf4= diff --git a/approval-request-metric-collector/hack/boilerplate.go.txt b/approval-request-metric-collector/hack/boilerplate.go.txt new file mode 100644 index 0000000..1f31a2d --- /dev/null +++ b/approval-request-metric-collector/hack/boilerplate.go.txt @@ -0,0 +1,15 @@ +/* +Copyright 2025 The KubeFleet Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ diff --git a/approval-request-metric-collector/pkg/controllers/approvalrequest/controller.go b/approval-request-metric-collector/pkg/controllers/approvalrequest/controller.go new file mode 100644 index 0000000..5ddcb48 --- /dev/null +++ b/approval-request-metric-collector/pkg/controllers/approvalrequest/controller.go @@ -0,0 +1,577 @@ +/* +Copyright 2025 The KubeFleet Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Package approvalrequest features a controller to reconcile ApprovalRequest objects +// and create MetricCollectorReport resources on the hub cluster for metric collection. 
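+// As an illustrative example (the names here are hypothetical, the formats are the controller's): for a +// ClusterStagedUpdateRun named "demo-run" with stage "staging" rolling out to member cluster "member-1", +// this controller creates a MetricCollectorReport named "mc-demo-run-staging" in the hub namespace +// "fleet-member-member-1".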
+package approvalrequest + +import ( + "context" + "fmt" + "time" + + "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/api/meta" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + "k8s.io/client-go/tools/record" + "k8s.io/klog/v2" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/builder" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" + "sigs.k8s.io/controller-runtime/pkg/predicate" + + localv1alpha1 "github.com/kubefleet-dev/kubefleet-cookbook/approval-request-metric-collector/apis/autoapprove/v1alpha1" + placementv1beta1 "github.com/kubefleet-dev/kubefleet/apis/placement/v1beta1" + "github.com/kubefleet-dev/kubefleet/pkg/utils" +) + +const ( + // metricCollectorFinalizer is the finalizer added to ApprovalRequest objects for cleanup + metricCollectorFinalizer = "kubernetes-fleet.io/metric-collector-report-cleanup" + + // prometheusURL is the default Prometheus URL to use for all clusters + prometheusURL = "http://prometheus.prometheus.svc.cluster.local:9090" +) + +// Reconciler reconciles an ApprovalRequest object and creates MetricCollectorReport resources +// on the hub cluster in fleet-member-{clusterName} namespaces. +type Reconciler struct { + client.Client + recorder record.EventRecorder +} + +// Reconcile reconciles an ApprovalRequest or ClusterApprovalRequest object. +func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { + startTime := time.Now() + klog.V(2).InfoS("ApprovalRequest reconciliation starts", "request", req.NamespacedName) + defer func() { + latency := time.Since(startTime).Milliseconds() + klog.V(2).InfoS("ApprovalRequest reconciliation ends", "request", req.NamespacedName, "latency", latency) + }() + + var approvalReqObj placementv1beta1.ApprovalRequestObj + var isClusterScoped bool + + // Check if request has a namespace to determine resource type + if req.Namespace != "" { + // Fetch namespaced ApprovalRequest + approvalReq := &placementv1beta1.ApprovalRequest{} + if err := r.Client.Get(ctx, req.NamespacedName, approvalReq); err != nil { + if errors.IsNotFound(err) { + klog.V(2).InfoS("ApprovalRequest not found, ignoring", "request", req.NamespacedName) + return ctrl.Result{}, nil + } + klog.ErrorS(err, "Failed to get ApprovalRequest", "request", req.NamespacedName) + return ctrl.Result{}, err + } + approvalReqObj = approvalReq + isClusterScoped = false + } else { + // Fetch cluster-scoped ClusterApprovalRequest + clusterApprovalReq := &placementv1beta1.ClusterApprovalRequest{} + if err := r.Client.Get(ctx, types.NamespacedName{Name: req.Name}, clusterApprovalReq); err != nil { + if errors.IsNotFound(err) { + klog.V(2).InfoS("ClusterApprovalRequest not found, ignoring", "request", req.Name) + return ctrl.Result{}, nil + } + klog.ErrorS(err, "Failed to get ClusterApprovalRequest", "request", req.Name) + return ctrl.Result{}, err + } + approvalReqObj = clusterApprovalReq + isClusterScoped = true + } + + return r.reconcileApprovalRequestObj(ctx, approvalReqObj, isClusterScoped) +} + +// reconcileApprovalRequestObj reconciles an ApprovalRequestObj (either ApprovalRequest or ClusterApprovalRequest). 
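+// Cluster-scoped requests (empty namespace) resolve their stage from a ClusterStagedUpdateRun; namespaced +// requests resolve it from a StagedUpdateRun with the same namespace as the ApprovalRequest.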
+func (r *Reconciler) reconcileApprovalRequestObj(ctx context.Context, approvalReqObj placementv1beta1.ApprovalRequestObj, isClusterScoped bool) (ctrl.Result, error) { + obj := approvalReqObj.(client.Object) + approvalReqRef := klog.KObj(obj) + + // Handle deletion + if !obj.GetDeletionTimestamp().IsZero() { + return r.handleDelete(ctx, approvalReqObj) + } + + // Stop reconciliation if the approval request has already been approved + approvedCond := meta.FindStatusCondition(approvalReqObj.GetApprovalRequestStatus().Conditions, string(placementv1beta1.ApprovalRequestConditionApproved)) + if approvedCond != nil && approvedCond.Status == metav1.ConditionTrue { + klog.V(2).InfoS("ApprovalRequest has been approved, stopping reconciliation", "approvalRequest", approvalReqRef) + return ctrl.Result{}, nil + } + + // Add finalizer if not present + if !controllerutil.ContainsFinalizer(obj, metricCollectorFinalizer) { + controllerutil.AddFinalizer(obj, metricCollectorFinalizer) + if err := r.Client.Update(ctx, obj); err != nil { + klog.ErrorS(err, "Failed to add finalizer", "approvalRequest", approvalReqRef) + return ctrl.Result{}, err + } + klog.V(2).InfoS("Added finalizer to ApprovalRequest", "approvalRequest", approvalReqRef) + } + + // Get the UpdateRun (ClusterStagedUpdateRun or StagedUpdateRun) + spec := approvalReqObj.GetApprovalRequestSpec() + updateRunName := spec.TargetUpdateRun + stageName := spec.TargetStage + + var stageStatus *placementv1beta1.StageUpdatingStatus + if isClusterScoped { + updateRun := &placementv1beta1.ClusterStagedUpdateRun{} + if err := r.Client.Get(ctx, types.NamespacedName{Name: updateRunName}, updateRun); err != nil { + klog.ErrorS(err, "Failed to get ClusterStagedUpdateRun", "approvalRequest", approvalReqRef, "updateRun", updateRunName) + return ctrl.Result{}, err + } + + // Find the stage + for i := range updateRun.Status.StagesStatus { + if updateRun.Status.StagesStatus[i].StageName == stageName { + stageStatus = &updateRun.Status.StagesStatus[i] + break + } + } + } else { + updateRun := &placementv1beta1.StagedUpdateRun{} + if err := r.Client.Get(ctx, types.NamespacedName{Name: updateRunName, Namespace: obj.GetNamespace()}, updateRun); err != nil { + klog.ErrorS(err, "Failed to get StagedUpdateRun", "approvalRequest", approvalReqRef, "updateRun", updateRunName) + return ctrl.Result{}, err + } + + // Find the stage + for i := range updateRun.Status.StagesStatus { + if updateRun.Status.StagesStatus[i].StageName == stageName { + stageStatus = &updateRun.Status.StagesStatus[i] + break + } + } + } + + if stageStatus == nil { + err := fmt.Errorf("stage %s not found in UpdateRun %s", stageName, updateRunName) + klog.ErrorS(err, "Failed to find stage", "approvalRequest", approvalReqRef) + return ctrl.Result{}, err + } + + // Get all cluster names from the stage + clusterNames := make([]string, 0, len(stageStatus.Clusters)) + for _, cluster := range stageStatus.Clusters { + clusterNames = append(clusterNames, cluster.ClusterName) + } + + if len(clusterNames) == 0 { + klog.V(2).InfoS("No clusters in stage, skipping", "approvalRequest", approvalReqRef, "stage", stageName) + return ctrl.Result{}, nil + } + + klog.V(2).InfoS("Found clusters in stage", "approvalRequest", approvalReqRef, "stage", stageName, "clusters", clusterNames) + + // Create or update MetricCollectorReport resources in fleet-member namespaces + if err := r.ensureMetricCollectorReports(ctx, obj, clusterNames, updateRunName, stageName); err != nil { + klog.ErrorS(err, "Failed to ensure
MetricCollectorReport resources", "approvalRequest", approvalReqRef) + return ctrl.Result{}, err + } + + klog.V(2).InfoS("Successfully ensured MetricCollectorReport resources", "approvalRequest", approvalReqRef, "clusters", clusterNames) + + // Check workload health and approve if all workloads are healthy + if err := r.checkWorkloadHealthAndApprove(ctx, approvalReqObj, clusterNames, updateRunName, stageName); err != nil { + klog.ErrorS(err, "Failed to check workload health", "approvalRequest", approvalReqRef) + return ctrl.Result{}, err + } + + // Requeue after 15 seconds to check again (will stop if approved in next reconciliation) + return ctrl.Result{RequeueAfter: 15 * time.Second}, nil +} + +// ensureMetricCollectorReports creates MetricCollectorReport in each fleet-member-{clusterName} namespace +func (r *Reconciler) ensureMetricCollectorReports( + ctx context.Context, + approvalReq client.Object, + clusterNames []string, + updateRunName, stageName string, +) error { + // Generate report name (same for all clusters, different namespaces) + reportName := fmt.Sprintf("mc-%s-%s", updateRunName, stageName) + + // Create MetricCollectorReport in each fleet-member namespace + for _, clusterName := range clusterNames { + reportNamespace := fmt.Sprintf(utils.NamespaceNameFormat, clusterName) + + report := &localv1alpha1.MetricCollectorReport{ + ObjectMeta: metav1.ObjectMeta{ + Name: reportName, + Namespace: reportNamespace, + Labels: map[string]string{ + "approval-request": approvalReq.GetName(), + "update-run": updateRunName, + "stage": stageName, + "cluster": clusterName, + }, + }, + Spec: localv1alpha1.MetricCollectorReportSpec{ + PrometheusURL: prometheusURL, + }, + } + + // Create or update MetricCollectorReport + existingReport := &localv1alpha1.MetricCollectorReport{} + err := r.Client.Get(ctx, types.NamespacedName{ + Name: reportName, + Namespace: reportNamespace, + }, existingReport) + + if err != nil { + if errors.IsNotFound(err) { + if err := r.Client.Create(ctx, report); err != nil { + return fmt.Errorf("failed to create MetricCollectorReport in %s: %w", reportNamespace, err) + } + klog.V(2).InfoS("Created MetricCollectorReport", + "report", reportName, + "namespace", reportNamespace, + "cluster", clusterName) + } else { + return fmt.Errorf("failed to get MetricCollectorReport in %s: %w", reportNamespace, err) + } + } else { + // Update spec if needed + if existingReport.Spec.PrometheusURL != prometheusURL { + existingReport.Spec.PrometheusURL = prometheusURL + if err := r.Client.Update(ctx, existingReport); err != nil { + return fmt.Errorf("failed to update MetricCollectorReport in %s: %w", reportNamespace, err) + } + klog.V(2).InfoS("Updated MetricCollectorReport", + "report", reportName, + "namespace", reportNamespace, + "cluster", clusterName) + } + } + } + + return nil +} + +// checkWorkloadHealthAndApprove checks if all workloads specified in ClusterStagedWorkloadTracker or StagedWorkloadTracker are healthy +// across all clusters in the stage, and approves the ApprovalRequest if they are. 
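+// Approval is all-or-nothing: every workload listed in the tracker must be reported healthy in the +// MetricCollectorReport of every cluster in the stage; a missing report, a missing workload metric, or an +// unhealthy workload keeps the request pending until a later requeue.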
+func (r *Reconciler) checkWorkloadHealthAndApprove( + ctx context.Context, + approvalReqObj placementv1beta1.ApprovalRequestObj, + clusterNames []string, + updateRunName, stageName string, +) error { + obj := approvalReqObj.(client.Object) + approvalReqRef := klog.KObj(obj) + + klog.V(2).InfoS("Starting workload health check", "approvalRequest", approvalReqRef, "clusters", clusterNames) + + // Get the appropriate WorkloadTracker based on scope + // The WorkloadTracker name matches the UpdateRun name + var workloads []localv1alpha1.WorkloadReference + var workloadTrackerName string + + if obj.GetNamespace() == "" { + // Cluster-scoped: Get ClusterStagedWorkloadTracker with same name as ClusterStagedUpdateRun + clusterWorkloadTracker := &localv1alpha1.ClusterStagedWorkloadTracker{} + if err := r.Client.Get(ctx, types.NamespacedName{Name: updateRunName}, clusterWorkloadTracker); err != nil { + if errors.IsNotFound(err) { + klog.V(2).InfoS("ClusterStagedWorkloadTracker not found, skipping health check", + "approvalRequest", approvalReqRef, + "updateRun", updateRunName) + return nil + } + klog.ErrorS(err, "Failed to get ClusterStagedWorkloadTracker", "approvalRequest", approvalReqRef, "updateRun", updateRunName) + return fmt.Errorf("failed to get ClusterStagedWorkloadTracker: %w", err) + } + workloads = clusterWorkloadTracker.Workloads + workloadTrackerName = clusterWorkloadTracker.Name + klog.V(2).InfoS("Found ClusterStagedWorkloadTracker", + "approvalRequest", approvalReqRef, + "workloadTracker", workloadTrackerName, + "workloadCount", len(workloads)) + } else { + // Namespace-scoped: Get StagedWorkloadTracker with same name and namespace as StagedUpdateRun + stagedWorkloadTracker := &localv1alpha1.StagedWorkloadTracker{} + if err := r.Client.Get(ctx, types.NamespacedName{Name: updateRunName, Namespace: obj.GetNamespace()}, stagedWorkloadTracker); err != nil { + if errors.IsNotFound(err) { + klog.V(2).InfoS("StagedWorkloadTracker not found, skipping health check", + "approvalRequest", approvalReqRef, + "updateRun", updateRunName, + "namespace", obj.GetNamespace()) + return nil + } + klog.ErrorS(err, "Failed to get StagedWorkloadTracker", "approvalRequest", approvalReqRef, "updateRun", updateRunName) + return fmt.Errorf("failed to get StagedWorkloadTracker: %w", err) + } + workloads = stagedWorkloadTracker.Workloads + workloadTrackerName = stagedWorkloadTracker.Name + klog.V(2).InfoS("Found StagedWorkloadTracker", + "approvalRequest", approvalReqRef, + "workloadTracker", klog.KObj(stagedWorkloadTracker), + "workloadCount", len(workloads)) + } + + if len(workloads) == 0 { + klog.V(2).InfoS("WorkloadTracker has no workloads defined, skipping health check", + "approvalRequest", approvalReqRef, + "workloadTracker", workloadTrackerName) + return nil + } + + // The report name must match the one created by ensureMetricCollectorReports (mc-<updateRunName>-<stageName>) + metricCollectorName := fmt.Sprintf("mc-%s-%s", updateRunName, stageName) + + // Check each cluster for the required workloads + allHealthy := true + unhealthyDetails := []string{} + + for _, clusterName := range clusterNames { + reportNamespace := fmt.Sprintf(utils.NamespaceNameFormat, clusterName) + + klog.V(2).InfoS("Checking MetricCollectorReport", + "approvalRequest", approvalReqRef, + "cluster", clusterName, + "reportName", metricCollectorName, + "reportNamespace", reportNamespace) + + // Get MetricCollectorReport for this cluster + report := &localv1alpha1.MetricCollectorReport{} + err := r.Client.Get(ctx, types.NamespacedName{ + Name: metricCollectorName, + Namespace:
reportNamespace, + }, report) + + if err != nil { + if errors.IsNotFound(err) { + klog.V(2).InfoS("MetricCollectorReport not found yet", + "approvalRequest", approvalReqRef, + "cluster", clusterName, + "report", metricCollectorName, + "namespace", reportNamespace) + allHealthy = false + unhealthyDetails = append(unhealthyDetails, fmt.Sprintf("cluster %s: report not found", clusterName)) + continue + } + klog.ErrorS(err, "Failed to get MetricCollectorReport", + "approvalRequest", approvalReqRef, + "cluster", clusterName, + "report", metricCollectorName, + "namespace", reportNamespace) + return fmt.Errorf("failed to get MetricCollectorReport for cluster %s: %w", clusterName, err) + } + + klog.V(2).InfoS("Found MetricCollectorReport", + "approvalRequest", approvalReqRef, + "cluster", clusterName, + "collectedMetrics", len(report.Status.CollectedMetrics), + "workloadsMonitored", report.Status.WorkloadsMonitored) + + // Check if all workloads from WorkloadTracker are present and healthy + for _, trackedWorkload := range workloads { + found := false + healthy := false + + for _, collectedMetric := range report.Status.CollectedMetrics { + if collectedMetric.Namespace == trackedWorkload.Namespace && + collectedMetric.WorkloadName == trackedWorkload.Name { + found = true + healthy = collectedMetric.Health + klog.V(3).InfoS("Workload metric found", + "approvalRequest", approvalReqRef, + "cluster", clusterName, + "workload", trackedWorkload.Name, + "namespace", trackedWorkload.Namespace, + "healthy", healthy) + break + } + } + + if !found { + klog.V(2).InfoS("Workload not found in MetricCollectorReport", + "approvalRequest", approvalReqRef, + "cluster", clusterName, + "workload", trackedWorkload.Name, + "namespace", trackedWorkload.Namespace) + allHealthy = false + unhealthyDetails = append(unhealthyDetails, + fmt.Sprintf("cluster %s: workload %s/%s not found", clusterName, trackedWorkload.Namespace, trackedWorkload.Name)) + } else if !healthy { + klog.V(2).InfoS("Workload is not healthy", + "approvalRequest", approvalReqRef, + "cluster", clusterName, + "workload", trackedWorkload.Name, + "namespace", trackedWorkload.Namespace) + allHealthy = false + unhealthyDetails = append(unhealthyDetails, + fmt.Sprintf("cluster %s: workload %s/%s unhealthy", clusterName, trackedWorkload.Namespace, trackedWorkload.Name)) + } + } + } + + // If all workloads are healthy across all clusters, approve the ApprovalRequest + if allHealthy { + klog.InfoS("All workloads are healthy, approving ApprovalRequest", + "approvalRequest", approvalReqRef, + "clusters", clusterNames, + "workloads", len(workloads)) + + status := approvalReqObj.GetApprovalRequestStatus() + approvedCond := meta.FindStatusCondition(status.Conditions, string(placementv1beta1.ApprovalRequestConditionApproved)) + + // Only update if not already approved + if approvedCond == nil || approvedCond.Status != metav1.ConditionTrue { + meta.SetStatusCondition(&status.Conditions, metav1.Condition{ + Type: string(placementv1beta1.ApprovalRequestConditionApproved), + Status: metav1.ConditionTrue, + ObservedGeneration: obj.GetGeneration(), + Reason: "AllWorkloadsHealthy", + Message: fmt.Sprintf("All %d workloads are healthy across %d clusters", len(workloads), len(clusterNames)), + }) + + approvalReqObj.SetApprovalRequestStatus(*status) + if err := r.Client.Status().Update(ctx, obj); err != nil { + klog.ErrorS(err, "Failed to approve ApprovalRequest", "approvalRequest", approvalReqRef) + return fmt.Errorf("failed to approve ApprovalRequest: %w", err) + } + + 
klog.InfoS("Successfully approved ApprovalRequest", "approvalRequest", approvalReqRef) + r.recorder.Event(obj, "Normal", "Approved", fmt.Sprintf("All %d workloads are healthy across %d clusters in stage %s", len(workloads), len(clusterNames), stageName)) + } else { + klog.V(2).InfoS("ApprovalRequest already approved", "approvalRequest", approvalReqRef) + } + + // Approval successful or already approved + return nil + } + + // Not all workloads are healthy yet, log details and return nil (reconcile will requeue) + klog.V(2).InfoS("Not all workloads are healthy yet", + "approvalRequest", approvalReqRef, + "unhealthyDetails", unhealthyDetails) + + return nil +} + +// handleDelete handles the deletion of an ApprovalRequest or ClusterApprovalRequest +func (r *Reconciler) handleDelete(ctx context.Context, approvalReqObj placementv1beta1.ApprovalRequestObj) (ctrl.Result, error) { + obj := approvalReqObj.(client.Object) + if !controllerutil.ContainsFinalizer(obj, metricCollectorFinalizer) { + return ctrl.Result{}, nil + } + + approvalReqRef := klog.KObj(obj) + klog.V(2).InfoS("Cleaning up MetricCollectorReports for ApprovalRequest", "approvalRequest", approvalReqRef) + + // Get cluster names from UpdateRun to know which reports to delete + spec := approvalReqObj.GetApprovalRequestSpec() + updateRunName := spec.TargetUpdateRun + stageName := spec.TargetStage + reportName := fmt.Sprintf("mc-%s-%s", updateRunName, stageName) + + // Fetch UpdateRun to get cluster names + var clusterNames []string + if obj.GetNamespace() == "" { + // Cluster-scoped: Get ClusterStagedUpdateRun + updateRun := &placementv1beta1.ClusterStagedUpdateRun{} + if err := r.Client.Get(ctx, types.NamespacedName{Name: updateRunName}, updateRun); err != nil { + if !errors.IsNotFound(err) { + klog.ErrorS(err, "Failed to get ClusterStagedUpdateRun for cleanup", "approvalRequest", approvalReqRef) + } + // Continue with finalizer removal even if UpdateRun not found + } else { + // Find the stage + for i := range updateRun.Status.StagesStatus { + if updateRun.Status.StagesStatus[i].StageName == stageName { + for _, cluster := range updateRun.Status.StagesStatus[i].Clusters { + clusterNames = append(clusterNames, cluster.ClusterName) + } + break + } + } + } + } else { + // Namespace-scoped: Get StagedUpdateRun + updateRun := &placementv1beta1.StagedUpdateRun{} + if err := r.Client.Get(ctx, types.NamespacedName{Name: updateRunName, Namespace: obj.GetNamespace()}, updateRun); err != nil { + if !errors.IsNotFound(err) { + klog.ErrorS(err, "Failed to get StagedUpdateRun for cleanup", "approvalRequest", approvalReqRef) + } + // Continue with finalizer removal even if UpdateRun not found + } else { + // Find the stage + for i := range updateRun.Status.StagesStatus { + if updateRun.Status.StagesStatus[i].StageName == stageName { + for _, cluster := range updateRun.Status.StagesStatus[i].Clusters { + clusterNames = append(clusterNames, cluster.ClusterName) + } + break + } + } + } + } + + // Delete MetricCollectorReport from each fleet-member namespace + for _, clusterName := range clusterNames { + reportNamespace := fmt.Sprintf(utils.NamespaceNameFormat, clusterName) + report := &localv1alpha1.MetricCollectorReport{} + + if err := r.Client.Get(ctx, types.NamespacedName{ + Name: reportName, + Namespace: reportNamespace, + }, report); err == nil { + if err := r.Client.Delete(ctx, report); err != nil && !errors.IsNotFound(err) { + klog.ErrorS(err, "Failed to delete MetricCollectorReport", + "report", reportName, + "namespace", reportNamespace, + 
"cluster", clusterName) + return ctrl.Result{}, fmt.Errorf("failed to delete MetricCollectorReport in %s: %w", reportNamespace, err) + } + klog.V(2).InfoS("Deleted MetricCollectorReport", + "report", reportName, + "namespace", reportNamespace, + "cluster", clusterName) + } + } + + // Remove finalizer + controllerutil.RemoveFinalizer(obj, metricCollectorFinalizer) + if err := r.Client.Update(ctx, obj); err != nil { + klog.ErrorS(err, "Failed to remove finalizer", "approvalRequest", approvalReqRef) + return ctrl.Result{}, err + } + + klog.V(2).InfoS("Successfully cleaned up MetricCollectorReports", "approvalRequest", approvalReqRef, "clusters", clusterNames) + return ctrl.Result{}, nil +} + +// SetupWithManagerForClusterApprovalRequest sets up the controller with the Manager for ClusterApprovalRequest resources. +func (r *Reconciler) SetupWithManagerForClusterApprovalRequest(mgr ctrl.Manager) error { + r.recorder = mgr.GetEventRecorderFor("clusterapprovalrequest-controller") + return ctrl.NewControllerManagedBy(mgr). + Named("clusterapprovalrequest-controller"). + For(&placementv1beta1.ClusterApprovalRequest{}, builder.WithPredicates(predicate.GenerationChangedPredicate{})). + Complete(r) +} + +// SetupWithManagerForApprovalRequest sets up the controller with the Manager for ApprovalRequest resources. +func (r *Reconciler) SetupWithManagerForApprovalRequest(mgr ctrl.Manager) error { + r.recorder = mgr.GetEventRecorderFor("approvalrequest-controller") + return ctrl.NewControllerManagedBy(mgr). + Named("approvalrequest-controller"). + For(&placementv1beta1.ApprovalRequest{}, builder.WithPredicates(predicate.GenerationChangedPredicate{})). + Complete(r) +} diff --git a/approval-request-metric-collector/pkg/controllers/metriccollector/collector.go b/approval-request-metric-collector/pkg/controllers/metriccollector/collector.go new file mode 100644 index 0000000..ef3cde0 --- /dev/null +++ b/approval-request-metric-collector/pkg/controllers/metriccollector/collector.go @@ -0,0 +1,148 @@ +/* +Copyright 2025 The KubeFleet Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package metriccollector + +import ( + "context" + "encoding/base64" + "encoding/json" + "fmt" + "io" + "net/http" + "net/url" + "strings" + "time" + + corev1 "k8s.io/api/core/v1" +) + +// PrometheusClient is the interface for querying Prometheus +type PrometheusClient interface { + Query(ctx context.Context, query string) (interface{}, error) +} + +// prometheusClient implements PrometheusClient for querying Prometheus API +type prometheusClient struct { + baseURL string + authType string + authSecret *corev1.Secret + httpClient *http.Client +} + +// NewPrometheusClient creates a new Prometheus client +func NewPrometheusClient(baseURL, authType string, authSecret *corev1.Secret) PrometheusClient { + return &prometheusClient{ + baseURL: baseURL, + authType: authType, + authSecret: authSecret, + httpClient: &http.Client{ + Timeout: 30 * time.Second, + }, + } +} + +// Query executes a PromQL query against Prometheus API +func (c *prometheusClient) Query(ctx context.Context, query string) (interface{}, error) { + // Build query URL + queryURL := fmt.Sprintf("%s/api/v1/query", strings.TrimSuffix(c.baseURL, "/")) + params := url.Values{} + params.Add("query", query) + fullURL := fmt.Sprintf("%s?%s", queryURL, params.Encode()) + + // Create request + req, err := http.NewRequestWithContext(ctx, "GET", fullURL, nil) + if err != nil { + return nil, fmt.Errorf("failed to create request: %w", err) + } + + // Add authentication + if err := c.addAuth(req); err != nil { + return nil, fmt.Errorf("failed to add authentication: %w", err) + } + + // Execute request + resp, err := c.httpClient.Do(req) + if err != nil { + return nil, fmt.Errorf("failed to query Prometheus: %w", err) + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK { + body, _ := io.ReadAll(resp.Body) + return nil, fmt.Errorf("Prometheus query failed with status %d: %s", resp.StatusCode, string(body)) + } + + // Parse response + var result PrometheusResponse + if err := json.NewDecoder(resp.Body).Decode(&result); err != nil { + return nil, fmt.Errorf("failed to decode response: %w", err) + } + + if result.Status != "success" { + return nil, fmt.Errorf("Prometheus query failed: %s", result.Error) + } + + return result.Data, nil +} + +// addAuth adds authentication to the request +func (c *prometheusClient) addAuth(req *http.Request) error { + if c.authType == "" || c.authSecret == nil { + return nil + } + + switch c.authType { + case "bearer": + token, ok := c.authSecret.Data["token"] + if !ok { + return fmt.Errorf("token not found in secret") + } + req.Header.Set("Authorization", fmt.Sprintf("Bearer %s", string(token))) + case "basic": + username, ok := c.authSecret.Data["username"] + if !ok { + return fmt.Errorf("username not found in secret") + } + password, ok := c.authSecret.Data["password"] + if !ok { + return fmt.Errorf("password not found in secret") + } + auth := base64.StdEncoding.EncodeToString([]byte(fmt.Sprintf("%s:%s", username, password))) + req.Header.Set("Authorization", fmt.Sprintf("Basic %s", auth)) + } + + return nil +} + +// PrometheusResponse represents the Prometheus API response +type PrometheusResponse struct { + Status string `json:"status"` + Data PrometheusData `json:"data"` + Error string `json:"error,omitempty"` +} + +// PrometheusData represents the data section of Prometheus response +type PrometheusData struct { + ResultType string `json:"resultType"` + Result []PrometheusResult `json:"result"` +} + +// PrometheusResult represents a single result from Prometheus +type PrometheusResult 
struct { + Metric map[string]string `json:"metric"` + Value []interface{} `json:"value"` // [timestamp, value] +} diff --git a/approval-request-metric-collector/pkg/controllers/metriccollector/controller.go b/approval-request-metric-collector/pkg/controllers/metriccollector/controller.go new file mode 100644 index 0000000..01d7fb8 --- /dev/null +++ b/approval-request-metric-collector/pkg/controllers/metriccollector/controller.go @@ -0,0 +1,172 @@ +/* +Copyright 2025 The KubeFleet Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package metriccollector + +import ( + "context" + "fmt" + "time" + + "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/api/meta" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/klog/v2" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/builder" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/predicate" + + localv1alpha1 "github.com/kubefleet-dev/kubefleet-cookbook/approval-request-metric-collector/apis/autoapprove/v1alpha1" +) + +const ( + // defaultCollectionInterval is the interval for collecting metrics (30 seconds) + defaultCollectionInterval = 30 * time.Second +) + +// Reconciler reconciles a MetricCollectorReport object on the hub cluster +type Reconciler struct { + // HubClient is the client to access the hub cluster (for MetricCollectorReport and WorkloadTracker) + HubClient client.Client +} + +// Reconcile watches MetricCollectorReport on hub and updates it with metrics from member Prometheus +func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { + startTime := time.Now() + klog.V(2).InfoS("MetricCollectorReport reconciliation starts", "report", req.NamespacedName) + defer func() { + latency := time.Since(startTime).Milliseconds() + klog.V(2).InfoS("MetricCollectorReport reconciliation ends", "report", req.NamespacedName, "latency", latency) + }() + + // 1. Get MetricCollectorReport from hub cluster + report := &localv1alpha1.MetricCollectorReport{} + if err := r.HubClient.Get(ctx, req.NamespacedName, report); err != nil { + if errors.IsNotFound(err) { + klog.V(2).InfoS("MetricCollectorReport not found, ignoring", "report", req.NamespacedName) + return ctrl.Result{}, nil + } + klog.ErrorS(err, "Failed to get MetricCollectorReport", "report", req.NamespacedName) + return ctrl.Result{}, err + } + + klog.InfoS("Reconciling MetricCollectorReport", "name", report.Name, "namespace", report.Namespace) + + // 2. Get the Prometheus URL from the report spec + prometheusURL := report.Spec.PrometheusURL + + // 3. Query Prometheus on member cluster for all workload_health metrics + promClient := NewPrometheusClient(prometheusURL, "", nil) + collectedMetrics, collectErr := r.collectAllWorkloadMetrics(ctx, promClient) + + // 4.
Update MetricCollectorReport status on hub + now := metav1.Now() + report.Status.LastCollectionTime = &now + report.Status.CollectedMetrics = collectedMetrics + report.Status.WorkloadsMonitored = int32(len(collectedMetrics)) + + if collectErr != nil { + klog.ErrorS(collectErr, "Failed to collect metrics", "prometheusUrl", prometheusURL) + meta.SetStatusCondition(&report.Status.Conditions, metav1.Condition{ + Type: "MetricsCollected", + Status: metav1.ConditionFalse, + ObservedGeneration: report.Generation, + Reason: "CollectionFailed", + Message: fmt.Sprintf("Failed to collect metrics: %v", collectErr), + }) + } else { + klog.V(2).InfoS("Successfully collected metrics", "report", report.Name, "workloads", len(collectedMetrics)) + meta.SetStatusCondition(&report.Status.Conditions, metav1.Condition{ + Type: "MetricsCollected", + Status: metav1.ConditionTrue, + ObservedGeneration: report.Generation, + Reason: "MetricsCollected", + Message: fmt.Sprintf("Successfully collected metrics from %d workloads", len(collectedMetrics)), + }) + } + + if err := r.HubClient.Status().Update(ctx, report); err != nil { + klog.ErrorS(err, "Failed to update MetricCollectorReport status", "report", req.NamespacedName) + return ctrl.Result{}, err + } + + klog.InfoS("Successfully updated MetricCollectorReport", "metricsCount", len(collectedMetrics), "prometheusUrl", prometheusURL) + return ctrl.Result{RequeueAfter: defaultCollectionInterval}, nil +} + +// collectAllWorkloadMetrics queries Prometheus for all workload_health metrics +func (r *Reconciler) collectAllWorkloadMetrics(ctx context.Context, promClient PrometheusClient) ([]localv1alpha1.WorkloadMetrics, error) { + var collectedMetrics []localv1alpha1.WorkloadMetrics + + // Query all workload_health metrics (no filtering) + query := "workload_health" + + result, err := promClient.Query(ctx, query) + if err != nil { + klog.ErrorS(err, "Failed to query Prometheus for workload_health metrics") + return nil, err + } + + // Parse Prometheus response + data, ok := result.(PrometheusData) + if !ok { + return nil, fmt.Errorf("invalid Prometheus response type") + } + + if len(data.Result) == 0 { + klog.V(4).InfoS("No workload_health metrics found in Prometheus") + return collectedMetrics, nil + } + + // Extract metrics from Prometheus result + for _, res := range data.Result { + namespace := res.Metric["namespace"] + workloadName := res.Metric["app"] + + if namespace == "" || workloadName == "" { + continue + } + + // Extract health value from Prometheus result + // Prometheus returns values as [timestamp, value_string] array + // We need at least 2 elements: index 0 is timestamp, index 1 is the metric value + var health float64 + if len(res.Value) >= 2 { + if valueStr, ok := res.Value[1].(string); ok { + fmt.Sscanf(valueStr, "%f", &health) + } + } + + workloadMetrics := localv1alpha1.WorkloadMetrics{ + WorkloadName: workloadName, + Namespace: namespace, + Health: health > 0.5, // Convert float to bool: healthy if > 0.5 + } + collectedMetrics = append(collectedMetrics, workloadMetrics) + } + + klog.V(2).InfoS("Collected workload metrics from Prometheus", "count", len(collectedMetrics)) + return collectedMetrics, nil +} + +// SetupWithManager sets up the controller with the Manager. +func (r *Reconciler) SetupWithManager(mgr ctrl.Manager) error { + return ctrl.NewControllerManagedBy(mgr). + Named("metriccollector-controller"). + For(&localv1alpha1.MetricCollectorReport{}, builder.WithPredicates(predicate.GenerationChangedPredicate{})). 
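+        // Only generation (i.e. spec) changes, such as a new Prometheus URL,
+        // trigger an immediate reconcile; the steady 30-second collection
+        // cadence comes from RequeueAfter, so this controller's own status
+        // updates do not re-queue it in a loop.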
+        Complete(r)
+}
diff --git a/approval-request-metric-collector/scripts/install-on-hub.sh b/approval-request-metric-collector/scripts/install-on-hub.sh
new file mode 100755
index 0000000..d2750e3
--- /dev/null
+++ b/approval-request-metric-collector/scripts/install-on-hub.sh
@@ -0,0 +1,114 @@
+#!/bin/bash
+set -e
+
+# Usage: ./install-on-hub.sh <registry> <hub-cluster>
+# Example: ./install-on-hub.sh arvindtestacr.azurecr.io kind-hub
+
+if [ "$#" -lt 2 ]; then
+    echo "Usage: $0 <registry> <hub-cluster>"
+    echo "Example: $0 arvindtestacr.azurecr.io kind-hub"
+    echo ""
+    echo "Parameters:"
+    echo "  registry    - ACR registry URL (e.g., arvindtestacr.azurecr.io)"
+    echo "  hub-cluster - Hub cluster name (e.g., kind-hub)"
+    exit 1
+fi
+
+# Configuration
+REGISTRY="$1"
+HUB_CLUSTER="$2"
+IMAGE_NAME="approval-request-controller"
+IMAGE_TAG="${IMAGE_TAG:-latest}"
+NAMESPACE="fleet-system"
+CHART_NAME="approval-request-controller"
+
+# Get hub cluster context using kubectl config view (following kubefleet pattern)
+HUB_CONTEXT=$(kubectl config view -o jsonpath="{.contexts[?(@.context.cluster==\"$HUB_CLUSTER\")].name}")
+
+if [ -z "$HUB_CONTEXT" ]; then
+    echo "Error: Could not find context for hub cluster '$HUB_CLUSTER'"
+    echo "Available clusters:"
+    kubectl config view -o jsonpath='{.clusters[*].name}' | tr ' ' '\n'
+    exit 1
+fi
+
+# Construct full image repository path
+IMAGE_REPOSITORY="${REGISTRY}/${IMAGE_NAME}"
+
+echo "=== Installing ApprovalRequest Controller on hub cluster ==="
+echo "Registry: ${REGISTRY}"
+echo "Image: ${IMAGE_REPOSITORY}:${IMAGE_TAG}"
+echo "Hub cluster: ${HUB_CLUSTER}"
+echo "Hub context: ${HUB_CONTEXT}"
+echo "Namespace: ${NAMESPACE}"
+echo ""
+
+# Step 1: Verify kubefleet CRDs are installed
+echo "Step 1: Verifying required kubefleet CRDs..."
+REQUIRED_CRDS=(
+    "approvalrequests.placement.kubernetes-fleet.io"
+    "clusterapprovalrequests.placement.kubernetes-fleet.io"
+    "clusterresourceplacements.placement.kubernetes-fleet.io"
+    "clusterresourceoverrides.placement.kubernetes-fleet.io"
+    "clusterstagedupdateruns.placement.kubernetes-fleet.io"
+    "stagedupdateruns.placement.kubernetes-fleet.io"
+)
+
+MISSING_CRDS=()
+for crd in "${REQUIRED_CRDS[@]}"; do
+    if ! kubectl --context=${HUB_CONTEXT} get crd ${crd} &>/dev/null; then
+        MISSING_CRDS+=("${crd}")
+    fi
+done
+
+if [ ${#MISSING_CRDS[@]} -ne 0 ]; then
+    echo "Error: Missing required CRDs from kubefleet hub-agent:"
+    for crd in "${MISSING_CRDS[@]}"; do
+        echo "  - ${crd}"
+    done
+    echo ""
+    echo "Please ensure kubefleet hub-agent is installed first."
+    exit 1
+fi
+
+echo "✓ All required kubefleet CRDs are installed"
+echo ""
+
+# Step 2: Install helm chart on hub cluster (includes MetricCollector, MetricCollectorReport, WorkloadTracker CRDs)
+echo "Step 2: Installing helm chart on hub cluster..."
+helm upgrade --install ${CHART_NAME} ../charts/${CHART_NAME} \
+    --kube-context=${HUB_CONTEXT} \
+    --namespace ${NAMESPACE} \
+    --create-namespace \
+    --set image.repository=${IMAGE_REPOSITORY} \
+    --set image.tag=${IMAGE_TAG} \
+    --set image.pullPolicy=Always \
+    --set controller.logLevel=2
+
+echo "✓ Helm chart installed on hub cluster"
+echo ""
+
+# Step 3: Verify installation
+echo "Step 3: Verifying installation..."
+echo "Checking CRDs installed by this chart..."
+kubectl --context=${HUB_CONTEXT} get crd | grep -E "metriccollectors|metriccollectorreports|workloadtrackers" || echo "  (CRDs may take a moment to appear)"
+
+echo ""
+echo "Checking pods in ${NAMESPACE}..."
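+# Optional: wait for the controller Deployment to roll out before listing pods.
+# This assumes the chart's default fullname matches ${CHART_NAME}; adjust the
+# Deployment name if your chart overrides it.
+# kubectl --context=${HUB_CONTEXT} rollout status deployment/${CHART_NAME} -n ${NAMESPACE} --timeout=120s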
+kubectl --context=${HUB_CONTEXT} get pods -n ${NAMESPACE} -l app.kubernetes.io/name=${CHART_NAME}
+
+echo ""
+echo "=== Installation Complete ==="
+echo ""
+echo "To check controller logs:"
+echo "  kubectl --context=${HUB_CONTEXT} logs -n ${NAMESPACE} -l app.kubernetes.io/name=${CHART_NAME} -f"
+echo ""
+echo "To verify CRDs:"
+echo "  kubectl --context=${HUB_CONTEXT} get crd | grep autoapprove.kubernetes-fleet.io"
+echo ""
+echo "Next steps:"
+echo "  1. Create a WorkloadTracker to define which workloads to monitor"
+echo "  2. ApprovalRequests will be automatically processed when created by staged updates"
+echo ""
diff --git a/approval-request-metric-collector/scripts/install-on-member.sh b/approval-request-metric-collector/scripts/install-on-member.sh
new file mode 100755
index 0000000..fe94abd
--- /dev/null
+++ b/approval-request-metric-collector/scripts/install-on-member.sh
@@ -0,0 +1,232 @@
+#!/bin/bash
+set -e
+
+# Usage: ./install-on-member.sh <registry> <hub-cluster> <member-cluster-1> [member-cluster-2] [member-cluster-3] ...
+# Example: ./install-on-member.sh arvindtestacr.azurecr.io kind-hub kind-cluster-1 kind-cluster-2 kind-cluster-3
+
+if [ "$#" -lt 3 ]; then
+    echo "Usage: $0 <registry> <hub-cluster> <member-cluster-1> [member-cluster-2] ..."
+    echo "Example: $0 arvindtestacr.azurecr.io kind-hub kind-cluster-1 kind-cluster-2 kind-cluster-3"
+    echo ""
+    echo "Parameters:"
+    echo "  registry        - ACR registry URL (e.g., arvindtestacr.azurecr.io)"
+    echo "  hub-cluster     - Hub cluster name (e.g., kind-hub)"
+    echo "  member-clusters - One or more member cluster names"
+    exit 1
+fi
+
+# Configuration
+REGISTRY="$1"
+HUB_CLUSTER="$2"
+MEMBER_CLUSTERS=("${@:3}")
+MEMBER_NAMESPACE="default"
+PROMETHEUS_URL="http://prometheus.test-ns:9090"
+IMAGE_TAG="${IMAGE_TAG:-latest}"
+METRIC_COLLECTOR_IMAGE="metric-collector"
+METRIC_APP_IMAGE="metric-app"
+
+# Get hub cluster context and API server URL using kubectl config view (following kubefleet pattern)
+HUB_CONTEXT=$(kubectl config view -o jsonpath="{.contexts[?(@.context.cluster==\"$HUB_CLUSTER\")].name}")
+HUB_API_SERVER=$(kubectl config view -o jsonpath="{.clusters[?(@.name==\"$HUB_CLUSTER\")].cluster.server}")
+
+if [ -z "$HUB_CONTEXT" ]; then
+    echo "Error: Could not find context for hub cluster '$HUB_CLUSTER'"
+    echo "Available clusters:"
+    kubectl config view -o jsonpath='{.clusters[*].name}' | tr ' ' '\n'
+    exit 1
+fi
+
+if [ -z "$HUB_API_SERVER" ]; then
+    echo "Error: Could not find API server URL for hub cluster '$HUB_CLUSTER'"
+    exit 1
+fi
+
+# Construct full image repository paths
+METRIC_COLLECTOR_REPOSITORY="${REGISTRY}/${METRIC_COLLECTOR_IMAGE}"
+METRIC_APP_REPOSITORY="${REGISTRY}/${METRIC_APP_IMAGE}"
+
+echo "=== Installing MetricCollector on ${#MEMBER_CLUSTERS[@]} member cluster(s) ==="
+echo "Registry: ${REGISTRY}"
+echo "Metric Collector Image: ${METRIC_COLLECTOR_REPOSITORY}:${IMAGE_TAG}"
+echo "Metric App Image: ${METRIC_APP_REPOSITORY}:${IMAGE_TAG}"
+echo "Hub cluster: ${HUB_CLUSTER}"
+echo "Hub context: ${HUB_CONTEXT}"
+echo "Hub API server: ${HUB_API_SERVER}"
+echo "Member clusters: ${MEMBER_CLUSTERS[@]}"
+echo ""
+
+# Install on each member cluster
+CLUSTER_INDEX=0
+for MEMBER_CLUSTER in "${MEMBER_CLUSTERS[@]}"; do
+    CLUSTER_INDEX=$((CLUSTER_INDEX + 1))
+
+    MEMBER_CONTEXT=$(kubectl config view -o jsonpath="{.contexts[?(@.context.cluster==\"$MEMBER_CLUSTER\")].name}")
+    MEMBER_CLUSTER_NAME="${MEMBER_CLUSTER}"
+    HUB_NAMESPACE="fleet-member-${MEMBER_CLUSTER_NAME}"
+
+    if [ -z "$MEMBER_CONTEXT" ]; then
+        echo "Error: Could not find context for member cluster '$MEMBER_CLUSTER'"
+        echo "Available clusters:"
+        kubectl config view -o jsonpath='{.clusters[*].name}' | tr ' ' '\n'
+        exit 1
+    fi
+
+    echo "========================================"
+    echo "Installing on Member Cluster ${CLUSTER_INDEX}/${#MEMBER_CLUSTERS[@]}"
+    echo "  Cluster: ${MEMBER_CLUSTER}"
+    echo "  Context: ${MEMBER_CONTEXT}"
+    echo "  Cluster Name: ${MEMBER_CLUSTER_NAME}"
+    echo "========================================"
+    echo ""
+
+    # Step 1: Setup RBAC on hub cluster
+    echo "Step 1: Setting up RBAC on hub cluster..."
+
+    # Verify namespace exists (should be created by KubeFleet when member cluster joins)
+    if ! kubectl --context=${HUB_CONTEXT} get namespace ${HUB_NAMESPACE} &>/dev/null; then
+        echo "Error: Namespace ${HUB_NAMESPACE} does not exist on hub cluster"
+        echo "This namespace should be automatically created by KubeFleet when the member cluster joins the hub"
+        echo "Please ensure the member cluster is properly registered with the hub"
+        exit 1
+    fi
+
+    cat <