From 3f42e1f7d05846d92de56fea2a3cb49027d9cf9e Mon Sep 17 00:00:00 2001 From: Asanka Abeyweera Date: Wed, 4 Mar 2026 15:34:18 +0530 Subject: [PATCH 1/8] Add unified k3d local development with Helm mode (DEV_MODE=helm) Enable all AMP services to run inside the same k3d cluster as OpenChoreo, eliminating the need for Docker Compose network bridging and host.docker.internal hacks. Docker Compose remains the default mode. New files: - deployments/dev-cluster-config.yaml: k3d config with AMP port mappings - deployments/scripts/build-and-import.sh: build images and import into k3d - deployments/scripts/helm-deploy-amp.sh: helm install/upgrade/uninstall Modified files: - Makefile: DEV_MODE dispatch + helm-* targets (sync, build, logs, status) - deployments/scripts/env.sh: AMP_NAMESPACE, AMP_RELEASE_NAME, AMP_IMAGE_TAG - deployments/scripts/setup-k3d.sh: use dev config when DEV_MODE=helm - deployments/values/values-local.yaml: local image repos + in-cluster URLs - CLAUDE.md: document Helm mode commands --- Makefile | 285 +++++++++++++++++++----- deployments/dev-cluster-config.yaml | 114 ++++++++++ deployments/scripts/build-and-import.sh | 86 +++++++ deployments/scripts/env.sh | 5 + deployments/scripts/helm-deploy-amp.sh | 70 ++++++ deployments/scripts/setup-k3d.sh | 36 ++- deployments/values/values-local.yaml | 58 +++++ 7 files changed, 596 insertions(+), 58 deletions(-) create mode 100644 deployments/dev-cluster-config.yaml create mode 100755 deployments/scripts/build-and-import.sh create mode 100755 deployments/scripts/helm-deploy-amp.sh create mode 100644 deployments/values/values-local.yaml diff --git a/Makefile b/Makefile index e8dd7e755..e1b81f5de 100644 --- a/Makefile +++ b/Makefile @@ -1,11 +1,24 @@ .PHONY: help setup setup-colima setup-k3d setup-openchoreo setup-platform setup-console-local setup-console-local-force dev-up dev-down dev-restart dev-rebuild dev-logs dev-migrate openchoreo-up openchoreo-down openchoreo-status teardown db-connect db-logs service-logs 
service-shell console-logs port-forward setup-kubeconfig-docker +.PHONY: helm-build helm-build-api helm-build-console helm-import helm-install helm-upgrade helm-sync helm-sync-api helm-sync-console helm-restart helm-status helm-logs helm-api-logs helm-console-logs helm-db-connect status api-logs + +# Development mode: "compose" (default) or "helm" +DEV_MODE ?= compose + +# AMP variables (keep in sync with deployments/scripts/env.sh) +AMP_NAMESPACE := wso2-amp +AMP_RELEASE_NAME := amp +AMP_IMAGE_TAG := 0.0.0-dev +CLUSTER_CONTEXT := k3d-openchoreo-local-v0.14.0 # Default target help: @echo "Agent Manager Platform - Development Commands" @echo "" - @echo "πŸš€ Setup (run once):" - @echo " make setup - Complete setup (Colima + k3d + Thunder + OpenChoreo + Platform)" + @echo "Current mode: DEV_MODE=$(DEV_MODE)" + @echo "Switch modes: DEV_MODE=helm make " + @echo "" + @echo "Setup (run once):" + @echo " make setup - Complete setup (Colima + k3d + OpenChoreo + Platform)" @echo " make setup-colima - Start Colima VM" @echo " make setup-k3d - Create k3d cluster" @echo " make setup-thunder - Setup Thunder" @@ -14,53 +27,89 @@ help: @echo " make setup-console-local - Install console deps (only if changed)" @echo " make setup-console-local-force - Force reinstall console deps" @echo "" - @echo "πŸ’» Daily Development:" - @echo " make dev-up - Start platform services (console, service, db)" + @echo "Daily Development (mode-aware):" + @echo " make dev-up - Start platform services" @echo " make dev-down - Stop platform services" @echo " make dev-restart - Restart platform services" @echo " make dev-rebuild - Rebuild images and restart services" @echo " make dev-logs - Tail all platform logs" - @echo " make dev-migrate - Generate evaluators and run database migrations" + @echo " make dev-migrate - Run database migrations" @echo "" - @echo "☸️ OpenChoreo Runtime:" + @echo "Helm Mode (DEV_MODE=helm):" + @echo " make helm-build - Build all Docker images from source" + @echo " make 
helm-build-api - Build API image only" + @echo " make helm-build-console - Build Console image only" + @echo " make helm-import - Import all images into k3d" + @echo " make helm-install - First-time Helm install (build + import + deploy)" + @echo " make helm-upgrade - Helm upgrade (redeploy with current values)" + @echo " make helm-sync - Full sync: build all + import + restart pods" + @echo " make helm-sync-api - Fast: build API + import + restart API pod" + @echo " make helm-sync-console - Fast: build Console + import + restart Console pod" + @echo " make helm-restart - Restart all AMP deployments" + @echo " make helm-status - Show pods and services" + @echo " make helm-logs - Tail all AMP logs" + @echo " make helm-api-logs - Tail API logs" + @echo " make helm-console-logs - Tail Console logs" + @echo " make helm-db-connect - psql into PostgreSQL pod" + @echo "" + @echo "OpenChoreo Runtime:" @echo " make openchoreo-up - Start OpenChoreo cluster" @echo " make openchoreo-down - Stop OpenChoreo cluster (saves resources)" @echo " make openchoreo-status - Check OpenChoreo cluster status" @echo " make port-forward - Forward OpenChoreo services to localhost" @echo "" - @echo "πŸ—„οΈ Database:" + @echo "Database (Compose mode):" @echo " make db-connect - Connect to PostgreSQL" @echo " make db-logs - View database logs" @echo "" - @echo "πŸ”§ Service Debugging:" + @echo "Service Debugging (Compose mode):" @echo " make service-logs - View service logs" @echo " make service-shell - Shell into service container" @echo " make console-logs - View console logs" @echo "" - @echo "🧹 Cleanup:" - @echo " make teardown - Remove everything (Kind cluster + platform)" + @echo "Cleanup:" + @echo " make teardown - Remove everything (cluster + platform)" @echo "" -# Complete setup +# ============================================================================ +# Setup +# ============================================================================ + +# Complete setup - dispatches based on 
DEV_MODE +ifeq ($(DEV_MODE),helm) +setup: setup-colima setup-k3d setup-openchoreo setup-thunder helm-install + @echo "" + @echo "Complete setup finished! (Helm mode)" + @echo "" + @echo "Access your services:" + @echo " Console: http://localhost:3000" + @echo " API: http://localhost:9000" + @echo "" + @echo "Useful commands:" + @echo " make helm-status - Show pod status" + @echo " make helm-sync-api - Rebuild and redeploy API" + @echo " make helm-api-logs - Tail API logs" +else setup: setup-colima setup-k3d setup-openchoreo setup-thunder setup-kubeconfig-docker setup-platform setup-console-local @echo "" - @echo "βœ… Complete setup finished!" + @echo "Complete setup finished!" @echo "" - @echo "🌐 Access your services:" + @echo "Access your services:" @echo " Console: http://localhost:3000" @echo " API: http://localhost:8080" @echo " Traces Observer Service: http://localhost:9098" @echo " Database: localhost:5432" @echo "" - @echo "πŸ“Š To access OpenChoreo services, run:" + @echo "To access OpenChoreo services, run:" @echo " make port-forward" +endif # Setup individual components setup-colima: @cd deployments/scripts && ./setup-colima.sh setup-k3d: - @cd deployments/scripts && ./setup-k3d.sh + @cd deployments/scripts && DEV_MODE=$(DEV_MODE) ./setup-k3d.sh setup-thunder: @cd deployments/scripts && ./setup-amp-thunder.sh @@ -69,9 +118,9 @@ setup-openchoreo: @cd deployments/scripts && ./setup-openchoreo.sh $(CURDIR) gen-keys: - @echo "πŸ”‘ Generating JWT signing keys..." + @echo "Generating JWT signing keys..." 
@cd agent-manager-service && make gen-keys - @echo "βœ… JWT signing keys generated in agent-manager-service/keys/" + @echo "JWT signing keys generated in agent-manager-service/keys/" setup-platform: gen-keys @cd deployments/scripts && ./setup-platform.sh @@ -82,23 +131,23 @@ setup-platform: gen-keys @mkdir -p .make .make/console-deps-installed: console/rush.json console/common/config/rush/pnpm-lock.yaml | .make - @echo "πŸ“¦ Installing console dependencies locally..." + @echo "Installing console dependencies locally..." @if ! command -v rush &> /dev/null; then \ - echo "⚠️ Rush not found. Installing Rush globally..."; \ + echo "Rush not found. Installing Rush globally..."; \ npm install -g @microsoft/rush@5.157.0; \ fi - @echo "πŸ“₯ Running rush update..." + @echo "Running rush update..." @cd console && rush update --full @touch .make/console-deps-installed .make/console-built: .make/console-deps-installed - @echo "πŸ”¨ Building monorepo packages..." + @echo "Building monorepo packages..." @cd console && rush build @touch .make/console-built - @echo "βœ… Console packages built" + @echo "Console packages built" setup-console-local: .make/console-built - @echo "βœ… Console dependencies are up to date" + @echo "Console dependencies are up to date" # Force rebuild of console dependencies (ignores timestamps) setup-console-local-force: @@ -108,79 +157,202 @@ setup-console-local-force: # Generate Docker-specific kubeconfig using k3d kubeconfig # Always regenerates to ensure it matches the current cluster setup-kubeconfig-docker: - @echo "πŸ”§ Generating Docker kubeconfig..." + @echo "Generating Docker kubeconfig..." 
@cd deployments/scripts && ./generate-docker-kubeconfig.sh - @echo "βœ… Docker kubeconfig is ready" + @echo "Docker kubeconfig is ready" + +# ============================================================================ +# Daily Development (mode-aware) +# ============================================================================ + +ifeq ($(DEV_MODE),helm) + +dev-up: + @cd deployments/scripts && ./helm-deploy-amp.sh + +dev-down: + @cd deployments/scripts && ./helm-deploy-amp.sh --uninstall + +dev-restart: helm-restart + +dev-rebuild: helm-sync + +dev-logs: helm-logs + +else -# Daily development commands dev-up: setup-console-local setup-kubeconfig-docker gen-keys - @echo "πŸš€ Starting Agent Manager platform..." + @echo "Starting Agent Manager platform..." @cd deployments && docker compose up -d - @echo "βœ… Platform is running!" + @echo "Platform is running!" @echo " Console: http://localhost:3000" @echo " API: http://localhost:8080" dev-down: - @echo "πŸ›‘ Stopping Agent Manager platform..." + @echo "Stopping Agent Manager platform..." @cd deployments && docker compose down - @echo "βœ… Platform stopped" + @echo "Platform stopped" dev-restart: - @echo "πŸ”„ Restarting Agent Manager platform..." + @echo "Restarting Agent Manager platform..." @cd deployments && docker compose restart - @echo "βœ… Platform restarted" + @echo "Platform restarted" dev-rebuild: setup-console-local - @echo "🧹 Stopping services..." + @echo "Stopping services..." @cd deployments && docker compose down - @echo "🧹 Removing console volumes (preserving database)..." + @echo "Removing console volumes (preserving database)..." @docker volume rm deployments_console_node_modules deployments_console_common_temp 2>/dev/null || true - @echo "🧹 Cleaning Rush temp directory..." + @echo "Cleaning Rush temp directory..." @rm -rf console/common/temp - @echo "πŸ”¨ Rebuilding Docker images..." + @echo "Rebuilding Docker images..." 
@cd deployments && docker compose build --no-cache - @echo "πŸ”„ Starting services..." + @echo "Starting services..." @cd deployments && docker compose up -d - @echo "βœ… Rebuild complete!" + @echo "Rebuild complete!" @echo " Console: http://localhost:3000" @echo " API: http://localhost:8080" dev-logs: @cd deployments && docker compose logs -f +endif + dev-migrate: - @echo "πŸ—„οΈ Running database migrations..." + @echo "Running database migrations..." @docker exec agent-manager-service sh -c "cd /go/src && make dev-migrate" - @echo "βœ… Migrations completed" + @echo "Migrations completed" + +# ============================================================================ +# Helm Mode Targets +# ============================================================================ + +# Build all Docker images from source +helm-build: + @cd deployments/scripts && ./build-and-import.sh api console traces-observer evaluation-job +# Build individual components +helm-build-api: + @echo "Building API image..." + @docker build -t amp-api:$(AMP_IMAGE_TAG) agent-manager-service/ --quiet + @echo "API image built." + +helm-build-console: + @echo "Building Console image..." + @docker build -t amp-console:$(AMP_IMAGE_TAG) console/ --quiet + @echo "Console image built." + +# Import all images into k3d (assumes images are already built) +helm-import: + @echo "Importing images into k3d..." + @k3d image import amp-api:$(AMP_IMAGE_TAG) amp-console:$(AMP_IMAGE_TAG) amp-traces-observer:$(AMP_IMAGE_TAG) amp-evaluation-job:$(AMP_IMAGE_TAG) -c openchoreo-local-v0.14.0 + @echo "Images imported." + +# First-time Helm install: build + import + deploy +helm-install: helm-build + @cd deployments/scripts && ./helm-deploy-amp.sh + +# Helm upgrade (redeploy with current values, no image rebuild) +helm-upgrade: + @cd deployments/scripts && ./helm-deploy-amp.sh + +# Full sync: build all images, import, restart pods +helm-sync: helm-build + @echo "Restarting AMP deployments..." 
+ @kubectl rollout restart deployment -n $(AMP_NAMESPACE) --context $(CLUSTER_CONTEXT) 2>/dev/null || true + @echo "Waiting for rollout..." + @kubectl rollout status deployment -n $(AMP_NAMESPACE) --context $(CLUSTER_CONTEXT) --timeout=300s 2>/dev/null || true + @echo "Sync complete." + +# Fast sync: build API only, import, restart API pod +helm-sync-api: helm-build-api + @echo "Importing API image into k3d..." + @k3d image import amp-api:$(AMP_IMAGE_TAG) -c openchoreo-local-v0.14.0 + @echo "Restarting API deployment..." + @kubectl rollout restart deployment/$(AMP_RELEASE_NAME)-agent-manager-service -n $(AMP_NAMESPACE) --context $(CLUSTER_CONTEXT) + @kubectl rollout status deployment/$(AMP_RELEASE_NAME)-agent-manager-service -n $(AMP_NAMESPACE) --context $(CLUSTER_CONTEXT) --timeout=120s + @echo "API synced." + +# Fast sync: build Console only, import, restart Console pod +helm-sync-console: helm-build-console + @echo "Importing Console image into k3d..." + @k3d image import amp-console:$(AMP_IMAGE_TAG) -c openchoreo-local-v0.14.0 + @echo "Restarting Console deployment..." + @kubectl rollout restart deployment/$(AMP_RELEASE_NAME)-console -n $(AMP_NAMESPACE) --context $(CLUSTER_CONTEXT) + @kubectl rollout status deployment/$(AMP_RELEASE_NAME)-console -n $(AMP_NAMESPACE) --context $(CLUSTER_CONTEXT) --timeout=120s + @echo "Console synced." + +# Restart all AMP deployments (no rebuild) +helm-restart: + @echo "Restarting all AMP deployments..." + @kubectl rollout restart deployment -n $(AMP_NAMESPACE) --context $(CLUSTER_CONTEXT) + @kubectl rollout status deployment -n $(AMP_NAMESPACE) --context $(CLUSTER_CONTEXT) --timeout=300s 2>/dev/null || true + @echo "Restart complete." 
+ +# Show pod and service status +helm-status: + @echo "=== Pods ===" + @kubectl get pods -n $(AMP_NAMESPACE) --context $(CLUSTER_CONTEXT) 2>/dev/null || echo "Namespace $(AMP_NAMESPACE) not found" + @echo "" + @echo "=== Services ===" + @kubectl get svc -n $(AMP_NAMESPACE) --context $(CLUSTER_CONTEXT) 2>/dev/null || echo "Namespace $(AMP_NAMESPACE) not found" + +# Alias for helm-status +status: helm-status + +# Tail all AMP logs +helm-logs: + @kubectl logs -f -n $(AMP_NAMESPACE) --context $(CLUSTER_CONTEXT) --all-containers --max-log-requests=10 -l "app.kubernetes.io/instance=$(AMP_RELEASE_NAME)" --prefix 2>/dev/null || \ + kubectl logs -f -n $(AMP_NAMESPACE) --context $(CLUSTER_CONTEXT) --all-containers --max-log-requests=10 --prefix + +# Tail API logs +helm-api-logs: + @kubectl logs -f -n $(AMP_NAMESPACE) --context $(CLUSTER_CONTEXT) -l "app.kubernetes.io/component=agent-manager-service" --all-containers --prefix + +# Alias for helm-api-logs +api-logs: helm-api-logs + +# Tail Console logs +helm-console-logs: + @kubectl logs -f -n $(AMP_NAMESPACE) --context $(CLUSTER_CONTEXT) -l "app.kubernetes.io/component=console" --all-containers --prefix + +# psql into PostgreSQL pod +helm-db-connect: + @kubectl exec -it -n $(AMP_NAMESPACE) --context $(CLUSTER_CONTEXT) \ + $$(kubectl get pod -n $(AMP_NAMESPACE) --context $(CLUSTER_CONTEXT) -l "app.kubernetes.io/name=postgresql" -o jsonpath='{.items[0].metadata.name}') \ + -- psql -U agentmanager -d agentmanager + +# ============================================================================ # OpenChoreo lifecycle management +# ============================================================================ + openchoreo-up: - @echo "πŸš€ Starting OpenChoreo cluster..." - @docker start openchoreo-local-control-plane openchoreo-local-worker 2>/dev/null || (echo "⚠️ Cluster not found. Run 'make setup-k3d setup-openchoreo' first." && exit 1) - @echo "⏳ Waiting for nodes to be ready..." + @echo "Starting OpenChoreo cluster..." 
+ @docker start openchoreo-local-control-plane openchoreo-local-worker 2>/dev/null || (echo "Cluster not found. Run 'make setup-k3d setup-openchoreo' first." && exit 1) + @echo "Waiting for nodes to be ready..." @for i in 1 2 3 4 5 6 7 8 9 10 11 12; do \ kubectl get nodes --context kind-openchoreo-local >/dev/null 2>&1 && \ kubectl wait --for=condition=Ready nodes --all --timeout=10s --context kind-openchoreo-local >/dev/null 2>&1 && break || sleep 10; \ done - @echo "⏳ Waiting for core system pods..." + @echo "Waiting for core system pods..." @kubectl wait --for=condition=Ready pods --all -n kube-system --timeout=90s --context kind-openchoreo-local 2>/dev/null || true - @echo "⏳ Waiting for OpenChoreo control plane..." + @echo "Waiting for OpenChoreo control plane..." @kubectl wait --for=condition=Ready pods --all -n openchoreo-control-plane --timeout=90s --context kind-openchoreo-local 2>/dev/null || true - @echo "⏳ Waiting for OpenChoreo data plane..." + @echo "Waiting for OpenChoreo data plane..." @kubectl wait --for=condition=Ready pods --all -n openchoreo-data-plane --timeout=90s --context kind-openchoreo-local 2>/dev/null || true - @echo "⏳ Waiting for OpenChoreo observability plane..." + @echo "Waiting for OpenChoreo observability plane..." @kubectl wait --for=condition=Ready pods --all -n openchoreo-observability-plane --timeout=90s --context kind-openchoreo-local 2>/dev/null || true - @echo "βœ… OpenChoreo cluster is running" + @echo "OpenChoreo cluster is running" @echo "" - @echo "πŸ“Š Cluster status:" + @echo "Cluster status:" @kubectl get pods --all-namespaces --context kind-openchoreo-local | grep -v "Running\|Completed" | head -1 || echo " All pods are running!" openchoreo-down: - @echo "πŸ›‘ Stopping OpenChoreo cluster..." - @docker stop openchoreo-local-control-plane openchoreo-local-worker 2>/dev/null && echo "βœ… OpenChoreo cluster stopped (containers preserved)" || echo "⚠️ Cluster not running" + @echo "Stopping OpenChoreo cluster..." 
+ @docker stop openchoreo-local-control-plane openchoreo-local-worker 2>/dev/null && echo "OpenChoreo cluster stopped (containers preserved)" || echo "Cluster not running" openchoreo-status: - @echo "πŸ“Š OpenChoreo Cluster Status:" + @echo "OpenChoreo Cluster Status:" @echo "" @echo "Docker Containers:" @docker ps -a --filter name=openchoreo-local --format "table {{.Names}}\t{{.Status}}\t{{.Ports}}" 2>/dev/null || echo "No containers found" @@ -195,14 +367,16 @@ openchoreo-status: port-forward: @cd deployments/scripts && ./port-forward.sh -# Database commands +# ============================================================================ +# Database & Service Debugging (Compose mode) +# ============================================================================ + db-connect: @docker exec -it agent-manager-db psql -U agentmanager -d agentmanager db-logs: @docker logs -f agent-manager-db -# Service debugging service-logs: @docker logs -f agent-manager-service @@ -212,6 +386,9 @@ service-shell: console-logs: @docker logs -f agent-manager-console +# ============================================================================ # Cleanup +# ============================================================================ + teardown: @cd deployments/scripts && ./teardown.sh diff --git a/deployments/dev-cluster-config.yaml b/deployments/dev-cluster-config.yaml new file mode 100644 index 000000000..a3e92a541 --- /dev/null +++ b/deployments/dev-cluster-config.yaml @@ -0,0 +1,114 @@ +# k3d cluster config for Helm dev mode (DEV_MODE=helm) +# Based on single-cluster-config.yaml with additional AMP service port mappings. +# This allows all services (OpenChoreo + AMP) to run in a single k3d cluster. 
+apiVersion: k3d.io/v1alpha5 +kind: Simple +metadata: + name: openchoreo-local-v0.14.0 +image: rancher/k3s:v1.32.9-k3s1 +servers: 1 +agents: 0 +kubeAPI: + hostPort: "6550" +ports: + # === OpenChoreo Ports (same as single-cluster-config.yaml) === + # Control Plane uses port range 8xxx + # HTTP traffic to OpenChoreo UI and API (Kgateway LoadBalancer on port 80) + - port: 8080:80 + nodeFilters: + - loadbalancer + # HTTPS traffic to OpenChoreo UI and API (Kgateway LoadBalancer on port 443) + - port: 8443:443 + nodeFilters: + - loadbalancer + # Data Plane uses port range 19xxx + # HTTP traffic to workloads via Gateway + - port: 19080:19080 + nodeFilters: + - loadbalancer + # HTTPS traffic to workloads via Gateway + - port: 19443:19443 + nodeFilters: + - loadbalancer + # Build Plane uses port range 10xxx + # Argo Workflows UI for development testing + - port: 10081:2746 + nodeFilters: + - loadbalancer + # Container Registry for storing built images + - port: 10082:5000 + nodeFilters: + - loadbalancer + # Observability Plane uses port range 11xxx + # Observer API + - port: 11080:8080 + nodeFilters: + - loadbalancer + # OpenSearch Dashboard + - port: 11081:5601 + nodeFilters: + - loadbalancer + # OpenSearch API for Fluent Bit data pushing + - port: 11082:9200 + nodeFilters: + - loadbalancer + + # === AMP Service Ports === + # Console (React frontend) + - port: 3000:3000 + nodeFilters: + - loadbalancer + # Agent Manager API + - port: 9000:9000 + nodeFilters: + - loadbalancer + # Internal API / Gateway Management + - port: 9243:9243 + nodeFilters: + - loadbalancer + # Traces Observer Service + - port: 9098:9098 + nodeFilters: + - loadbalancer + + # === OTel / Observability Ports === + # Data Prepper HTTP source + - port: 21893:21893 + nodeFilters: + - loadbalancer + # OTel gRPC + - port: 22893:22893 + nodeFilters: + - loadbalancer + # OTel HTTP + - port: 22894:22894 + nodeFilters: + - loadbalancer +options: + k3s: + extraArgs: + # Add host.k3d.internal to API server TLS 
certificate SANs. + # This allows consistent DataPlane configuration across single and multi-cluster setups + # where Control Plane pods can access the API server via host.k3d.internal:6550 + - arg: "--tls-san=host.k3d.internal" + nodeFilters: + - server:* + # Configure kubelet eviction thresholds to prevent resource exhaustion + - arg: "--kubelet-arg=eviction-hard=imagefs.available<1%,nodefs.available<1%" + nodeFilters: + - server:* + - arg: "--kubelet-arg=eviction-minimum-reclaim=imagefs.available=1%,nodefs.available=1%" + nodeFilters: + - server:* + # Disable Traefik to avoid conflicts with OpenChoreo Gateway controller + - arg: "--disable=traefik" + nodeFilters: + - server:* +# Configure insecure registries for HTTP access +# Allows kubelet to pull images from Build Plane registry via HTTP +registries: + config: | + mirrors: + "host.k3d.internal:10082": + endpoint: + - http://host.k3d.internal:10082 diff --git a/deployments/scripts/build-and-import.sh b/deployments/scripts/build-and-import.sh new file mode 100755 index 000000000..56134467b --- /dev/null +++ b/deployments/scripts/build-and-import.sh @@ -0,0 +1,86 @@ +#!/usr/bin/env bash +set -e + +# Build Docker images from production Dockerfiles and import them into k3d. +# Usage: +# ./build-and-import.sh # Build and import all components +# ./build-and-import.sh api # Build and import API only +# ./build-and-import.sh console api # Build and import Console and API +# +# Supported components: api, console, traces-observer, evaluation-job + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +ROOT_DIR="$(cd "$SCRIPT_DIR/../.."
&& pwd)" + +source "$SCRIPT_DIR/env.sh" + +# Components and their build contexts (relative to ROOT_DIR) +declare -A COMPONENT_CONTEXT=( + [api]="agent-manager-service" + [console]="console" + [traces-observer]="traces-observer-service" + [evaluation-job]="evaluation-job" +) + +declare -A COMPONENT_IMAGE=( + [api]="amp-api" + [console]="amp-console" + [traces-observer]="amp-traces-observer" + [evaluation-job]="amp-evaluation-job" +) + +ALL_COMPONENTS="api console traces-observer evaluation-job" + +# Determine which components to build +if [ $# -eq 0 ]; then + COMPONENTS="$ALL_COMPONENTS" +else + COMPONENTS="$*" +fi + +# Validate component names +for comp in $COMPONENTS; do + if [ -z "${COMPONENT_CONTEXT[$comp]}" ]; then + echo "Unknown component: $comp" + echo "Valid components: $ALL_COMPONENTS" + exit 1 + fi +done + +echo "=== Building and importing images into k3d ===" +echo "" + +# Verify k3d cluster exists +if ! k3d cluster list 2>/dev/null | grep -q "${CLUSTER_NAME}"; then + echo "k3d cluster '${CLUSTER_NAME}' not found. Run 'make setup-k3d' first." + exit 1 +fi + +FAILED="" + +for comp in $COMPONENTS; do + IMAGE="${COMPONENT_IMAGE[$comp]}:${AMP_IMAGE_TAG}" + CONTEXT="${ROOT_DIR}/${COMPONENT_CONTEXT[$comp]}" + + echo "Building ${comp} -> ${IMAGE}..." + if docker build -t "$IMAGE" "$CONTEXT" --quiet; then + echo "Importing ${IMAGE} into k3d cluster..." + if k3d image import "$IMAGE" -c "${CLUSTER_NAME}"; then + echo "${comp} ready." + else + echo "Failed to import ${comp}." + FAILED="$FAILED $comp" + fi + else + echo "Failed to build ${comp}." + FAILED="$FAILED $comp" + fi + echo "" +done + +if [ -n "$FAILED" ]; then + echo "Failed components:${FAILED}" + exit 1 +fi + +echo "All images built and imported successfully." 
diff --git a/deployments/scripts/env.sh b/deployments/scripts/env.sh index 53f8cc3be..f564c5ba4 100644 --- a/deployments/scripts/env.sh +++ b/deployments/scripts/env.sh @@ -3,3 +3,8 @@ OPENCHOREO_VERSION="0.14.0" OPENCHOREO_PATCH_VERSION="0.0.0-b53c6dc3" CLUSTER_NAME="openchoreo-local-v${OPENCHOREO_VERSION}" CLUSTER_CONTEXT="k3d-${CLUSTER_NAME}" + +# AMP (Agent Management Platform) variables +AMP_NAMESPACE="wso2-amp" +AMP_RELEASE_NAME="amp" +AMP_IMAGE_TAG="0.0.0-dev" diff --git a/deployments/scripts/helm-deploy-amp.sh b/deployments/scripts/helm-deploy-amp.sh new file mode 100755 index 000000000..a1fe427ae --- /dev/null +++ b/deployments/scripts/helm-deploy-amp.sh @@ -0,0 +1,70 @@ +#!/bin/bash +set -e + +# Deploy AMP to the k3d cluster using Helm. +# Usage: +# ./helm-deploy-amp.sh # Install or upgrade +# ./helm-deploy-amp.sh --uninstall # Uninstall (preserves cluster) + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +ROOT_DIR="$(cd "$SCRIPT_DIR/../.." && pwd)" +CHART_DIR="$ROOT_DIR/deployments/helm-charts/wso2-agent-manager" +VALUES_FILE="$ROOT_DIR/deployments/values/values-local.yaml" + +source "$SCRIPT_DIR/env.sh" + +if [ "$1" = "--uninstall" ]; then + echo "=== Uninstalling AMP from k3d ===" + if helm status "$AMP_RELEASE_NAME" -n "$AMP_NAMESPACE" --kube-context "${CLUSTER_CONTEXT}" &>/dev/null; then + helm uninstall "$AMP_RELEASE_NAME" -n "$AMP_NAMESPACE" --kube-context "${CLUSTER_CONTEXT}" + echo "AMP uninstalled. Cluster and namespace preserved." + else + echo "AMP release '${AMP_RELEASE_NAME}' not found in namespace '${AMP_NAMESPACE}'." + fi + exit 0 +fi + +echo "=== Deploying AMP to k3d cluster ===" +echo "" + +# Verify cluster is accessible +if ! kubectl cluster-info --context "${CLUSTER_CONTEXT}" &>/dev/null; then + echo "k3d cluster '${CLUSTER_NAME}' is not accessible." + echo "Run 'make setup-k3d' or 'k3d cluster start ${CLUSTER_NAME}' first." 
+ exit 1 +fi + +# Create namespace if it doesn't exist +kubectl create namespace "$AMP_NAMESPACE" --context "${CLUSTER_CONTEXT}" --dry-run=client -o yaml | \ + kubectl apply --context "${CLUSTER_CONTEXT}" -f - + +# Update Helm dependencies +echo "Updating Helm chart dependencies..." +helm dependency update "$CHART_DIR" +echo "" + +# Install or upgrade +echo "Running helm upgrade --install..." +helm upgrade --install "$AMP_RELEASE_NAME" "$CHART_DIR" \ + --namespace "$AMP_NAMESPACE" \ + --kube-context "${CLUSTER_CONTEXT}" \ + --values "$VALUES_FILE" \ + --wait \ + --timeout 5m + +echo "" +echo "Waiting for deployments to be ready..." +kubectl wait --for=condition=Available deployment --all \ + -n "$AMP_NAMESPACE" \ + --context "${CLUSTER_CONTEXT}" \ + --timeout=300s 2>/dev/null || true + +echo "" +echo "AMP deployed successfully!" +echo "" +echo "Services:" +echo " Console: http://localhost:3000" +echo " API: http://localhost:9000" +echo "" +echo "Status:" +kubectl get pods -n "$AMP_NAMESPACE" --context "${CLUSTER_CONTEXT}" diff --git a/deployments/scripts/setup-k3d.sh b/deployments/scripts/setup-k3d.sh index 9a7befc7b..f8785ebb2 100755 --- a/deployments/scripts/setup-k3d.sh +++ b/deployments/scripts/setup-k3d.sh @@ -9,7 +9,14 @@ cd "$SCRIPT_DIR" source "$SCRIPT_DIR/env.sh" -echo "=== Setting up k3d Cluster for OpenChoreo ===" +# Select k3d config based on DEV_MODE +if [ "${DEV_MODE}" = "helm" ]; then + K3D_CONFIG="../dev-cluster-config.yaml" + echo "=== Setting up k3d Cluster for OpenChoreo + AMP (Helm mode) ===" +else + K3D_CONFIG="../single-cluster-config.yaml" + echo "=== Setting up k3d Cluster for OpenChoreo ===" +fi # Check prerequisites if ! command -v k3d &> /dev/null; then @@ -52,6 +59,27 @@ if k3d cluster list 2>/dev/null | grep -q "${CLUSTER_NAME}"; then done fi + # When using Helm mode, verify AMP ports are mapped + if [ "${DEV_MODE}" = "helm" ]; then + echo "" + echo "πŸ” Checking AMP port mappings..." 
+ MISSING_PORTS="" + for PORT in 3000 9000; do + if ! docker port "k3d-${CLUSTER_NAME}-serverlb" "${PORT}/tcp" &>/dev/null; then + MISSING_PORTS="${MISSING_PORTS} ${PORT}" + fi + done + if [ -n "$MISSING_PORTS" ]; then + echo "⚠️ AMP ports not mapped:${MISSING_PORTS}" + echo " The cluster was created without AMP port mappings." + echo " To fix, delete and recreate the cluster:" + echo " k3d cluster delete ${CLUSTER_NAME}" + echo " DEV_MODE=helm make setup-k3d" + else + echo "βœ… AMP ports are mapped correctly" + fi + fi + echo "" echo "Cluster info:" kubectl cluster-info --context ${CLUSTER_CONTEXT} @@ -62,9 +90,9 @@ else echo "πŸ“ Creating shared directory for OpenChoreo..." mkdir -p /tmp/k3d-shared - # Create k3d cluster with OpenChoreo configuration - echo "πŸš€ Creating k3d cluster with OpenChoreo configuration..." - k3d cluster create --config ../single-cluster-config.yaml + # Create k3d cluster with appropriate configuration + echo "πŸš€ Creating k3d cluster with config: ${K3D_CONFIG}..." + k3d cluster create --config "${K3D_CONFIG}" echo "" echo "βœ… k3d cluster created successfully!" diff --git a/deployments/values/values-local.yaml b/deployments/values/values-local.yaml new file mode 100644 index 000000000..bda306840 --- /dev/null +++ b/deployments/values/values-local.yaml @@ -0,0 +1,58 @@ +# Local development overrides for helm-based deployment (DEV_MODE=helm) +# Used by: DEV_MODE=helm make dev-up, make helm-install, make helm-upgrade +# +# All services run inside the k3d cluster. Images are built locally and +# imported via `k3d image import` (pullPolicy: Never). 
+ +agentManagerService: + image: + repository: amp-api + tag: "0.0.0-dev" + pullPolicy: Never + service: + type: LoadBalancer + port: 9000 + config: + logLevel: "DEBUG" + corsAllowedOrigin: "*" + # In-cluster OpenChoreo URL (no host.docker.internal needed) + openChoreo: + baseURL: "http://openchoreo-api.openchoreo-control-plane:8080/api/v1" + # In-cluster Thunder IDP + keyManager: + jwksUrl: "http://amp-thunder-extension-service.amp-thunder.svc.cluster.local:8090/oauth2/jwks" + oidc: + tokenUrl: "http://amp-thunder-extension-service.amp-thunder.svc.cluster.local:8090/oauth2/token" + # In-cluster OpenBao + openbao: + url: "http://amp-secrets-openbao.amp-secrets.svc.cluster.local:8200" + # In-cluster OTel endpoint + otel: + exporterEndpoint: "http://obs-gateway-gateway-router.openchoreo-data-plane.svc.cluster.local:22893/otel" + # In-cluster observer + observerURL: "http://observer.openchoreo-observability-plane.svc.cluster.local:8080" + traceObserverURL: "http://amp-traces-observer.openchoreo-observability-plane.svc.cluster.local:9098" + +console: + image: + repository: amp-console + tag: "0.0.0-dev" + pullPolicy: Never + service: + type: LoadBalancer + config: + disableAuth: "true" + apiBaseUrl: "http://localhost:9000" + instrumentationUrl: "http://localhost:21893" + +dbMigration: + image: + repository: amp-api + tag: "0.0.0-dev" + pullPolicy: Never + +postgresql: + primary: + persistence: + enabled: true + size: 5Gi From d9e1cca40c83bc4bbf3e3275d96e6fe6ccffb5ee Mon Sep 17 00:00:00 2001 From: Asanka Abeyweera Date: Wed, 4 Mar 2026 15:50:34 +0530 Subject: [PATCH 2/8] Add dev-pause and dev-resume targets to stop/start Colima and k3d --- Makefile | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index e1b81f5de..2692eb929 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ .PHONY: help setup setup-colima setup-k3d setup-openchoreo setup-platform setup-console-local setup-console-local-force dev-up 
dev-down dev-restart dev-rebuild dev-logs dev-migrate openchoreo-up openchoreo-down openchoreo-status teardown db-connect db-logs service-logs service-shell console-logs port-forward setup-kubeconfig-docker -.PHONY: helm-build helm-build-api helm-build-console helm-import helm-install helm-upgrade helm-sync helm-sync-api helm-sync-console helm-restart helm-status helm-logs helm-api-logs helm-console-logs helm-db-connect status api-logs +.PHONY: helm-build helm-build-api helm-build-console helm-import helm-install helm-upgrade helm-sync helm-sync-api helm-sync-console helm-restart helm-status helm-logs helm-api-logs helm-console-logs helm-db-connect status api-logs dev-pause dev-resume # Development mode: "compose" (default) or "helm" DEV_MODE ?= compose @@ -67,6 +67,10 @@ help: @echo " make service-shell - Shell into service container" @echo " make console-logs - View console logs" @echo "" + @echo "Pause / Resume (saves laptop resources):" + @echo " make dev-pause - Stop k3d cluster and Colima VM" + @echo " make dev-resume - Start Colima VM and k3d cluster" + @echo "" @echo "Cleanup:" @echo " make teardown - Remove everything (cluster + platform)" @echo "" @@ -386,6 +390,30 @@ service-shell: console-logs: @docker logs -f agent-manager-console +# ============================================================================ +# Pause / Resume (saves laptop resources) +# ============================================================================ + +CLUSTER_NAME := openchoreo-local-v0.14.0 + +dev-pause: + @echo "Stopping k3d cluster..." + @k3d cluster stop $(CLUSTER_NAME) 2>/dev/null || echo "Cluster not running" + @echo "Stopping Colima..." + @colima stop 2>/dev/null || echo "Colima not running" + @echo "All stopped. CPU and memory freed." + +dev-resume: + @echo "Starting Colima..." + @colima start + @echo "Starting k3d cluster..." + @k3d cluster start $(CLUSTER_NAME) + @echo "Waiting for cluster to be ready..." 
+ @for i in 1 2 3 4 5 6 7 8 9 10; do \ + kubectl cluster-info --context $(CLUSTER_CONTEXT) &>/dev/null && break || sleep 3; \ + done + @echo "Cluster is ready." + # ============================================================================ # Cleanup # ============================================================================ From 0aed56dbded6d111575b37e89259db8fe8e10f2b Mon Sep 17 00:00:00 2001 From: Asanka Abeyweera Date: Wed, 4 Mar 2026 16:05:48 +0530 Subject: [PATCH 3/8] Fix OpenChoreo setup re-run failures Use server-side apply with --force-conflicts on all kubectl apply calls to prevent field ownership conflicts with Helm on re-runs. Add a wait for the data plane agent pod before reading its logs. --- deployments/scripts/setup-openchoreo.sh | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/deployments/scripts/setup-openchoreo.sh b/deployments/scripts/setup-openchoreo.sh index f66673ca8..f6ed0c6d8 100755 --- a/deployments/scripts/setup-openchoreo.sh +++ b/deployments/scripts/setup-openchoreo.sh @@ -59,7 +59,7 @@ echo "" # Create Certificate for Control Plane TLS echo "πŸ“œ Creating Certificate for Control Plane TLS..." -kubectl apply -f - </dev/null | base64 -d || echo "") if [ -n "$CA_CERT" ]; then - kubectl apply -f - </dev/null || \ + kubectl wait --for=condition=Ready pod -l app=cluster-agent -n openchoreo-data-plane --timeout=120s 2>/dev/null || \ + echo "⚠️ Data Plane agent pods may still be starting" +kubectl logs -n openchoreo-data-plane -l app=cluster-agent --tail=10 2>/dev/null || true echo "Verify API Platform Gateway pods:" kubectl get pods -n openchoreo-data-plane --selector="app.kubernetes.io/instance=api-platform-default-gateway" echo "βœ… OpenChoreo Data Plane ready" @@ -172,7 +176,7 @@ helm upgrade --install openchoreo-build-plane oci://ghcr.io/openchoreo/helm-char echo "5️⃣ Registering Build Plane..." 
BP_CA_CERT=$(kubectl get secret cluster-agent-tls -n openchoreo-build-plane -o jsonpath='{.data.ca\.crt}' 2>/dev/null | base64 -d || echo "") if [ -n "$BP_CA_CERT" ]; then - kubectl apply -f - </dev/null | base64 -d || echo "") if [ -n "$OP_CA_CERT" ]; then - kubectl apply -f - < Date: Wed, 4 Mar 2026 16:24:55 +0530 Subject: [PATCH 4/8] Copy cluster-gateway-ca ConfigMap to build plane namespace The build plane cluster-agent pod mounts cluster-gateway-ca but nothing creates it in the openchoreo-build-plane namespace. Copy it from the control plane namespace before installing the build plane chart. --- deployments/scripts/setup-openchoreo.sh | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/deployments/scripts/setup-openchoreo.sh b/deployments/scripts/setup-openchoreo.sh index f6ed0c6d8..d7578416a 100755 --- a/deployments/scripts/setup-openchoreo.sh +++ b/deployments/scripts/setup-openchoreo.sh @@ -165,6 +165,12 @@ helm upgrade --install registry docker-registry \ echo "⏳ Waiting for Docker Registry to be ready..." kubectl wait --for=condition=available deployment/registry-docker-registry -n openchoreo-build-plane --timeout=120s +echo "πŸ“œ Copying cluster-gateway-ca to build plane namespace..." +kubectl get configmap cluster-gateway-ca -n openchoreo-control-plane -o json | \ + jq '.metadata.namespace = "openchoreo-build-plane" | del(.metadata.resourceVersion, .metadata.uid, .metadata.creationTimestamp, .metadata.managedFields, .metadata.annotations)' | \ + kubectl apply --server-side --force-conflicts -f - +echo "βœ… cluster-gateway-ca copied to openchoreo-build-plane" + echo "4️⃣ Installing/Upgrading OpenChoreo Build Plane..." 
helm upgrade --install openchoreo-build-plane oci://ghcr.io/openchoreo/helm-charts/openchoreo-build-plane \ --version ${OPENCHOREO_VERSION} \ From 6805af64c3e421faf32e493f44145cac32602c8f Mon Sep 17 00:00:00 2001 From: Asanka Abeyweera Date: Wed, 4 Mar 2026 16:30:07 +0530 Subject: [PATCH 5/8] Wait for real CA certificate before installing build plane On re-runs, helm upgrade resets cluster-gateway-ca to a placeholder. The extractor job replaces it with the real cert, but the build plane copy job can race ahead and copy the placeholder. Fix by polling until the CA has a real certificate before proceeding. --- deployments/scripts/setup-openchoreo.sh | 26 +++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/deployments/scripts/setup-openchoreo.sh b/deployments/scripts/setup-openchoreo.sh index d7578416a..81236957f 100755 --- a/deployments/scripts/setup-openchoreo.sh +++ b/deployments/scripts/setup-openchoreo.sh @@ -54,6 +54,26 @@ kubectl wait -n openchoreo-control-plane --for=condition=available --timeout=300 if kubectl get jobs -n openchoreo-control-plane --no-headers 2>/dev/null | grep -q .; then kubectl wait -n openchoreo-control-plane --for=condition=complete --timeout=300s job --all fi + +# Verify the CA extractor has replaced the placeholder with a real certificate. +# The Helm chart deploys a placeholder ConfigMap and a Job that extracts the real +# CA from a TLS secret. On re-runs, helm upgrade resets the ConfigMap to the +# placeholder, so we must wait for the extractor to overwrite it again. +echo "⏳ Waiting for cluster-gateway-ca to contain a real certificate..." +for i in $(seq 1 30); do + CA_CONTENT=$(kubectl get configmap cluster-gateway-ca -n openchoreo-control-plane -o jsonpath='{.data.ca\.crt}' 2>/dev/null) + if echo "$CA_CONTENT" | grep -q "BEGIN CERTIFICATE"; then + echo "βœ… cluster-gateway-ca has a valid certificate" + break + fi + if [ "$i" -eq 30 ]; then + echo "⚠️ Timeout waiting for real CA certificate. 
The extractor job may need to be re-run:" + echo " kubectl delete job cluster-gateway-ca-extractor -n openchoreo-control-plane" + echo " Then re-run: make setup-openchoreo" + fi + sleep 5 +done + echo "βœ… OpenChoreo Control Plane ready" echo "" @@ -165,12 +185,6 @@ helm upgrade --install registry docker-registry \ echo "⏳ Waiting for Docker Registry to be ready..." kubectl wait --for=condition=available deployment/registry-docker-registry -n openchoreo-build-plane --timeout=120s -echo "πŸ“œ Copying cluster-gateway-ca to build plane namespace..." -kubectl get configmap cluster-gateway-ca -n openchoreo-control-plane -o json | \ - jq '.metadata.namespace = "openchoreo-build-plane" | del(.metadata.resourceVersion, .metadata.uid, .metadata.creationTimestamp, .metadata.managedFields, .metadata.annotations)' | \ - kubectl apply --server-side --force-conflicts -f - -echo "βœ… cluster-gateway-ca copied to openchoreo-build-plane" - echo "4️⃣ Installing/Upgrading OpenChoreo Build Plane..." helm upgrade --install openchoreo-build-plane oci://ghcr.io/openchoreo/helm-charts/openchoreo-build-plane \ --version ${OPENCHOREO_VERSION} \ From de54a7fb843dbcc769a73c520fb3409e2eee26ea Mon Sep 17 00:00:00 2001 From: Asanka Abeyweera Date: Wed, 4 Mar 2026 16:35:31 +0530 Subject: [PATCH 6/8] Wait for plane agents before reading logs in setup-openchoreo Add waits for copy-ca jobs and agent pods in build plane and observability plane verification sections. Prevents failures from reading logs of pods that are still in ContainerCreating state. 
--- deployments/scripts/setup-openchoreo.sh | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/deployments/scripts/setup-openchoreo.sh b/deployments/scripts/setup-openchoreo.sh index 81236957f..5a625f7cb 100755 --- a/deployments/scripts/setup-openchoreo.sh +++ b/deployments/scripts/setup-openchoreo.sh @@ -217,11 +217,20 @@ else fi echo "" +# Wait for build plane copy-ca job and agent +echo "⏳ Waiting for build plane CA copy job..." +if kubectl get jobs -n openchoreo-build-plane --no-headers 2>/dev/null | grep -q copy-ca; then + kubectl wait -n openchoreo-build-plane --for=condition=complete --timeout=120s job -l app=cluster-agent 2>/dev/null || true +fi +echo "⏳ Waiting for build plane agent..." +kubectl wait --for=condition=Ready pod -l app=cluster-agent -n openchoreo-build-plane --timeout=120s 2>/dev/null || \ + echo "⚠️ Build plane agent pods may still be starting" + # Verify BuildPlane echo "" echo "πŸ” Verifying BuildPlane ..." kubectl get buildplane -n default -kubectl logs -n openchoreo-build-plane -l app=cluster-agent --tail=10 +kubectl logs -n openchoreo-build-plane -l app=cluster-agent --tail=10 2>/dev/null || true echo "βœ… OpenChoreo Build Plane ready" echo "" @@ -340,11 +349,20 @@ else fi echo "" +# Wait for observability plane copy-ca job and agent +echo "⏳ Waiting for observability plane CA copy job..." +if kubectl get jobs -n openchoreo-observability-plane --no-headers 2>/dev/null | grep -q copy-ca; then + kubectl wait -n openchoreo-observability-plane --for=condition=complete --timeout=120s job -l app=cluster-agent 2>/dev/null || true +fi +echo "⏳ Waiting for observability plane agent..." +kubectl wait --for=condition=Ready pod -l app=cluster-agent -n openchoreo-observability-plane --timeout=120s 2>/dev/null || \ + echo "⚠️ Observability plane agent pods may still be starting" + # Verify ObservabilityPlane echo "" echo "πŸ” Verifying ObservabilityPlane ..." 
kubectl get observabilityplane -n default -kubectl logs -n openchoreo-observability-plane -l app=cluster-agent --tail=10 +kubectl logs -n openchoreo-observability-plane -l app=cluster-agent --tail=10 2>/dev/null || true echo "βœ… OpenChoreo Observability Plane ready" echo "" From d2a36dc613a558b0a014ad6216884f48e36b3fb7 Mon Sep 17 00:00:00 2001 From: Asanka Abeyweera Date: Wed, 4 Mar 2026 16:38:23 +0530 Subject: [PATCH 7/8] Fix recurring cluster-gateway-ca ownership conflict on re-runs The CA extractor job uses kubectl apply (client-side) to write the real certificate, which claims field ownership from Helm. On subsequent helm upgrade calls, this causes a conflict. Fix by removing the kubectl-client-side-apply field manager before running helm upgrade. --- deployments/scripts/setup-openchoreo.sh | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/deployments/scripts/setup-openchoreo.sh b/deployments/scripts/setup-openchoreo.sh index 5a625f7cb..dfd446485 100755 --- a/deployments/scripts/setup-openchoreo.sh +++ b/deployments/scripts/setup-openchoreo.sh @@ -42,6 +42,23 @@ echo "" # Step 1: Install OpenChoreo Control Plane echo "1️⃣ Installing/Upgrading OpenChoreo Control Plane..." echo " This may take up to 10 minutes..." + +# On re-runs, the CA extractor job uses kubectl apply (client-side) to write the +# real cert into cluster-gateway-ca, which claims field ownership. The next helm +# upgrade then conflicts. Fix by removing the client-side-apply field manager +# before upgrading, so Helm can take ownership cleanly. 
+if kubectl get configmap cluster-gateway-ca -n openchoreo-control-plane &>/dev/null; then + kubectl annotate configmap cluster-gateway-ca -n openchoreo-control-plane \ + kubectl.kubernetes.io/last-applied-configuration- --overwrite 2>/dev/null || true + # Remove the kubectl-client-side-apply managed field entry + FIELD_INDEX=$(kubectl get configmap cluster-gateway-ca -n openchoreo-control-plane \ + --show-managed-fields -o json | jq '.metadata.managedFields | to_entries[] | select(.value.manager == "kubectl-client-side-apply") | .key' 2>/dev/null) + if [ -n "$FIELD_INDEX" ]; then + kubectl patch configmap cluster-gateway-ca -n openchoreo-control-plane \ + --type=json -p="[{\"op\":\"remove\",\"path\":\"/metadata/managedFields/${FIELD_INDEX}\"}]" 2>/dev/null || true + fi +fi + helm upgrade --install openchoreo-control-plane oci://ghcr.io/openchoreo/helm-charts/openchoreo-control-plane \ --version ${OPENCHOREO_PATCH_VERSION} \ --namespace openchoreo-control-plane \ From 1d85f1018d0d5e50d02d254d08d80933e82d843c Mon Sep 17 00:00:00 2001 From: Asanka Abeyweera Date: Wed, 4 Mar 2026 16:42:57 +0530 Subject: [PATCH 8/8] Delete stale CA jobs before helm install/upgrade on re-runs helm upgrade resets the CA ConfigMap to a placeholder but doesn't recreate completed jobs. The extractor and copy-ca jobs need to run again to populate the real certificate. Delete them before each helm install/upgrade so they get recreated. --- deployments/scripts/setup-openchoreo.sh | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/deployments/scripts/setup-openchoreo.sh b/deployments/scripts/setup-openchoreo.sh index dfd446485..b1e6e38db 100755 --- a/deployments/scripts/setup-openchoreo.sh +++ b/deployments/scripts/setup-openchoreo.sh @@ -47,10 +47,15 @@ echo " This may take up to 10 minutes..." # real cert into cluster-gateway-ca, which claims field ownership. The next helm # upgrade then conflicts. 
Fix by removing the client-side-apply field manager # before upgrading, so Helm can take ownership cleanly. +# On re-runs, fix two issues: +# 1. The CA extractor job uses kubectl apply (client-side) to write the real cert, +# which claims field ownership. Remove the field manager so helm upgrade won't conflict. +# 2. helm upgrade resets the CA ConfigMap to a placeholder, but the extractor job +# won't re-run because Helm doesn't recreate completed jobs. Delete it so Helm +# recreates it and it extracts the real cert again. if kubectl get configmap cluster-gateway-ca -n openchoreo-control-plane &>/dev/null; then kubectl annotate configmap cluster-gateway-ca -n openchoreo-control-plane \ kubectl.kubernetes.io/last-applied-configuration- --overwrite 2>/dev/null || true - # Remove the kubectl-client-side-apply managed field entry FIELD_INDEX=$(kubectl get configmap cluster-gateway-ca -n openchoreo-control-plane \ --show-managed-fields -o json | jq '.metadata.managedFields | to_entries[] | select(.value.manager == "kubectl-client-side-apply") | .key' 2>/dev/null) if [ -n "$FIELD_INDEX" ]; then @@ -58,6 +63,7 @@ if kubectl get configmap cluster-gateway-ca -n openchoreo-control-plane &>/dev/n --type=json -p="[{\"op\":\"remove\",\"path\":\"/metadata/managedFields/${FIELD_INDEX}\"}]" 2>/dev/null || true fi fi +kubectl delete job cluster-gateway-ca-extractor -n openchoreo-control-plane 2>/dev/null || true helm upgrade --install openchoreo-control-plane oci://ghcr.io/openchoreo/helm-charts/openchoreo-control-plane \ --version ${OPENCHOREO_PATCH_VERSION} \ @@ -117,6 +123,8 @@ echo "" # Step 2: Install OpenChoreo Data Plane echo "2️⃣ Installing/Upgrading OpenChoreo Data Plane..." echo " This may take up to 10 minutes..." 
+# Delete completed copy-ca job so helm recreates it on upgrade +kubectl delete job -n openchoreo-data-plane -l app=cluster-agent 2>/dev/null || true helm upgrade --install openchoreo-data-plane oci://ghcr.io/openchoreo/helm-charts/openchoreo-data-plane \ --version ${OPENCHOREO_VERSION} \ --namespace openchoreo-data-plane \ @@ -203,6 +211,8 @@ echo "⏳ Waiting for Docker Registry to be ready..." kubectl wait --for=condition=available deployment/registry-docker-registry -n openchoreo-build-plane --timeout=120s echo "4️⃣ Installing/Upgrading OpenChoreo Build Plane..." +# Delete completed copy-ca job so helm recreates it on upgrade +kubectl delete job -n openchoreo-build-plane -l app=cluster-agent 2>/dev/null || true helm upgrade --install openchoreo-build-plane oci://ghcr.io/openchoreo/helm-charts/openchoreo-build-plane \ --version ${OPENCHOREO_VERSION} \ --namespace openchoreo-build-plane \ @@ -299,6 +309,8 @@ else kubectl apply --server-side --force-conflicts -f $1/deployments/values/oc-collector-configmap.yaml -n openchoreo-observability-plane + # Delete completed copy-ca job so helm recreates it on upgrade + kubectl delete job -n openchoreo-observability-plane -l app=cluster-agent 2>/dev/null || true helm install openchoreo-observability-plane oci://ghcr.io/openchoreo/helm-charts/openchoreo-observability-plane \ --version ${OPENCHOREO_VERSION} \ --namespace openchoreo-observability-plane \