diff --git a/Makefile b/Makefile index e8dd7e755..2692eb929 100644 --- a/Makefile +++ b/Makefile @@ -1,11 +1,24 @@ .PHONY: help setup setup-colima setup-k3d setup-openchoreo setup-platform setup-console-local setup-console-local-force dev-up dev-down dev-restart dev-rebuild dev-logs dev-migrate openchoreo-up openchoreo-down openchoreo-status teardown db-connect db-logs service-logs service-shell console-logs port-forward setup-kubeconfig-docker +.PHONY: helm-build helm-build-api helm-build-console helm-import helm-install helm-upgrade helm-sync helm-sync-api helm-sync-console helm-restart helm-status helm-logs helm-api-logs helm-console-logs helm-db-connect status api-logs dev-pause dev-resume setup-thunder gen-keys + +# Development mode: "compose" (default) or "helm" +DEV_MODE ?= compose + +# AMP variables (keep in sync with deployments/scripts/env.sh) +AMP_NAMESPACE := wso2-amp +AMP_RELEASE_NAME := amp +AMP_IMAGE_TAG := 0.0.0-dev +CLUSTER_CONTEXT := k3d-openchoreo-local-v0.14.0 # Default target help: @echo "Agent Manager Platform - Development Commands" @echo "" - @echo "🚀 Setup (run once):" - @echo " make setup - Complete setup (Colima + k3d + Thunder + OpenChoreo + Platform)" + @echo "Current mode: DEV_MODE=$(DEV_MODE)" + @echo "Switch modes: DEV_MODE=helm make <target>" + @echo "" + @echo "Setup (run once):" + @echo " make setup - Complete setup (Colima + k3d + OpenChoreo + Platform)" @echo " make setup-colima - Start Colima VM" @echo " make setup-k3d - Create k3d cluster" @echo " make setup-thunder - Setup Thunder" @@ -14,53 +27,93 @@ help: @echo " make setup-console-local - Install console deps (only if changed)" @echo " make setup-console-local-force - Force reinstall console deps" @echo "" - @echo "💻 Daily Development:" - @echo " make dev-up - Start platform services (console, service, db)" + @echo "Daily Development (mode-aware):" + @echo " make dev-up - Start platform services" @echo " make dev-down - Stop platform services" @echo " make dev-restart - Restart platform services" 
@echo " make dev-rebuild - Rebuild images and restart services" @echo " make dev-logs - Tail all platform logs" - @echo " make dev-migrate - Generate evaluators and run database migrations" + @echo " make dev-migrate - Run database migrations" @echo "" - @echo "☸️ OpenChoreo Runtime:" + @echo "Helm Mode (DEV_MODE=helm):" + @echo " make helm-build - Build all Docker images from source" + @echo " make helm-build-api - Build API image only" + @echo " make helm-build-console - Build Console image only" + @echo " make helm-import - Import all images into k3d" + @echo " make helm-install - First-time Helm install (build + import + deploy)" + @echo " make helm-upgrade - Helm upgrade (redeploy with current values)" + @echo " make helm-sync - Full sync: build all + import + restart pods" + @echo " make helm-sync-api - Fast: build API + import + restart API pod" + @echo " make helm-sync-console - Fast: build Console + import + restart Console pod" + @echo " make helm-restart - Restart all AMP deployments" + @echo " make helm-status - Show pods and services" + @echo " make helm-logs - Tail all AMP logs" + @echo " make helm-api-logs - Tail API logs" + @echo " make helm-console-logs - Tail Console logs" + @echo " make helm-db-connect - psql into PostgreSQL pod" + @echo "" + @echo "OpenChoreo Runtime:" @echo " make openchoreo-up - Start OpenChoreo cluster" @echo " make openchoreo-down - Stop OpenChoreo cluster (saves resources)" @echo " make openchoreo-status - Check OpenChoreo cluster status" @echo " make port-forward - Forward OpenChoreo services to localhost" @echo "" - @echo "πŸ—„οΈ Database:" + @echo "Database (Compose mode):" @echo " make db-connect - Connect to PostgreSQL" @echo " make db-logs - View database logs" @echo "" - @echo "πŸ”§ Service Debugging:" + @echo "Service Debugging (Compose mode):" @echo " make service-logs - View service logs" @echo " make service-shell - Shell into service container" @echo " make console-logs - View console logs" @echo "" - @echo "🧹 
Cleanup:" - @echo " make teardown - Remove everything (Kind cluster + platform)" + @echo "Pause / Resume (saves laptop resources):" + @echo " make dev-pause - Stop k3d cluster and Colima VM" + @echo " make dev-resume - Start Colima VM and k3d cluster" + @echo "" + @echo "Cleanup:" + @echo " make teardown - Remove everything (cluster + platform)" @echo "" -# Complete setup +# ============================================================================ +# Setup +# ============================================================================ + +# Complete setup - dispatches based on DEV_MODE +ifeq ($(DEV_MODE),helm) +setup: setup-colima setup-k3d setup-openchoreo setup-thunder helm-install + @echo "" + @echo "Complete setup finished! (Helm mode)" + @echo "" + @echo "Access your services:" + @echo " Console: http://localhost:3000" + @echo " API: http://localhost:9000" + @echo "" + @echo "Useful commands:" + @echo " make helm-status - Show pod status" + @echo " make helm-sync-api - Rebuild and redeploy API" + @echo " make helm-api-logs - Tail API logs" +else setup: setup-colima setup-k3d setup-openchoreo setup-thunder setup-kubeconfig-docker setup-platform setup-console-local @echo "" - @echo "βœ… Complete setup finished!" + @echo "Complete setup finished!" 
@echo "" - @echo "🌐 Access your services:" + @echo "Access your services:" @echo " Console: http://localhost:3000" @echo " API: http://localhost:8080" @echo " Traces Observer Service: http://localhost:9098" @echo " Database: localhost:5432" @echo "" - @echo "πŸ“Š To access OpenChoreo services, run:" + @echo "To access OpenChoreo services, run:" @echo " make port-forward" +endif # Setup individual components setup-colima: @cd deployments/scripts && ./setup-colima.sh setup-k3d: - @cd deployments/scripts && ./setup-k3d.sh + @cd deployments/scripts && DEV_MODE=$(DEV_MODE) ./setup-k3d.sh setup-thunder: @cd deployments/scripts && ./setup-amp-thunder.sh @@ -69,9 +122,9 @@ setup-openchoreo: @cd deployments/scripts && ./setup-openchoreo.sh $(CURDIR) gen-keys: - @echo "πŸ”‘ Generating JWT signing keys..." + @echo "Generating JWT signing keys..." @cd agent-manager-service && make gen-keys - @echo "βœ… JWT signing keys generated in agent-manager-service/keys/" + @echo "JWT signing keys generated in agent-manager-service/keys/" setup-platform: gen-keys @cd deployments/scripts && ./setup-platform.sh @@ -82,23 +135,23 @@ setup-platform: gen-keys @mkdir -p .make .make/console-deps-installed: console/rush.json console/common/config/rush/pnpm-lock.yaml | .make - @echo "πŸ“¦ Installing console dependencies locally..." + @echo "Installing console dependencies locally..." @if ! command -v rush &> /dev/null; then \ - echo "⚠️ Rush not found. Installing Rush globally..."; \ + echo "Rush not found. Installing Rush globally..."; \ npm install -g @microsoft/rush@5.157.0; \ fi - @echo "πŸ“₯ Running rush update..." + @echo "Running rush update..." @cd console && rush update --full @touch .make/console-deps-installed .make/console-built: .make/console-deps-installed - @echo "πŸ”¨ Building monorepo packages..." + @echo "Building monorepo packages..." 
@cd console && rush build @touch .make/console-built - @echo "βœ… Console packages built" + @echo "Console packages built" setup-console-local: .make/console-built - @echo "βœ… Console dependencies are up to date" + @echo "Console dependencies are up to date" # Force rebuild of console dependencies (ignores timestamps) setup-console-local-force: @@ -108,79 +161,202 @@ setup-console-local-force: # Generate Docker-specific kubeconfig using k3d kubeconfig # Always regenerates to ensure it matches the current cluster setup-kubeconfig-docker: - @echo "πŸ”§ Generating Docker kubeconfig..." + @echo "Generating Docker kubeconfig..." @cd deployments/scripts && ./generate-docker-kubeconfig.sh - @echo "βœ… Docker kubeconfig is ready" + @echo "Docker kubeconfig is ready" + +# ============================================================================ +# Daily Development (mode-aware) +# ============================================================================ + +ifeq ($(DEV_MODE),helm) + +dev-up: + @cd deployments/scripts && ./helm-deploy-amp.sh + +dev-down: + @cd deployments/scripts && ./helm-deploy-amp.sh --uninstall + +dev-restart: helm-restart + +dev-rebuild: helm-sync + +dev-logs: helm-logs + +else -# Daily development commands dev-up: setup-console-local setup-kubeconfig-docker gen-keys - @echo "πŸš€ Starting Agent Manager platform..." + @echo "Starting Agent Manager platform..." @cd deployments && docker compose up -d - @echo "βœ… Platform is running!" + @echo "Platform is running!" @echo " Console: http://localhost:3000" @echo " API: http://localhost:8080" dev-down: - @echo "πŸ›‘ Stopping Agent Manager platform..." + @echo "Stopping Agent Manager platform..." @cd deployments && docker compose down - @echo "βœ… Platform stopped" + @echo "Platform stopped" dev-restart: - @echo "πŸ”„ Restarting Agent Manager platform..." + @echo "Restarting Agent Manager platform..." 
@cd deployments && docker compose restart - @echo "βœ… Platform restarted" + @echo "Platform restarted" dev-rebuild: setup-console-local - @echo "🧹 Stopping services..." + @echo "Stopping services..." @cd deployments && docker compose down - @echo "🧹 Removing console volumes (preserving database)..." + @echo "Removing console volumes (preserving database)..." @docker volume rm deployments_console_node_modules deployments_console_common_temp 2>/dev/null || true - @echo "🧹 Cleaning Rush temp directory..." + @echo "Cleaning Rush temp directory..." @rm -rf console/common/temp - @echo "πŸ”¨ Rebuilding Docker images..." + @echo "Rebuilding Docker images..." @cd deployments && docker compose build --no-cache - @echo "πŸ”„ Starting services..." + @echo "Starting services..." @cd deployments && docker compose up -d - @echo "βœ… Rebuild complete!" + @echo "Rebuild complete!" @echo " Console: http://localhost:3000" @echo " API: http://localhost:8080" dev-logs: @cd deployments && docker compose logs -f +endif + dev-migrate: - @echo "πŸ—„οΈ Running database migrations..." + @echo "Running database migrations..." @docker exec agent-manager-service sh -c "cd /go/src && make dev-migrate" - @echo "βœ… Migrations completed" + @echo "Migrations completed" + +# ============================================================================ +# Helm Mode Targets +# ============================================================================ + +# Build all Docker images from source +helm-build: + @cd deployments/scripts && ./build-and-import.sh api console traces-observer evaluation-job + +# Build individual components +helm-build-api: + @echo "Building API image..." + @docker build -t amp-api:$(AMP_IMAGE_TAG) agent-manager-service/ --quiet + @echo "API image built." + +helm-build-console: + @echo "Building Console image..." + @docker build -t amp-console:$(AMP_IMAGE_TAG) console/ --quiet + @echo "Console image built." 
+# Import all images into the k3d cluster named by CLUSTER_NAME (assumes images are already built) +helm-import: + @echo "Importing images into k3d..." + @k3d image import amp-api:$(AMP_IMAGE_TAG) amp-console:$(AMP_IMAGE_TAG) amp-traces-observer:$(AMP_IMAGE_TAG) amp-evaluation-job:$(AMP_IMAGE_TAG) -c $(CLUSTER_NAME) + @echo "Images imported." + +# First-time Helm install: build + import + deploy +helm-install: helm-build + @cd deployments/scripts && ./helm-deploy-amp.sh + +# Helm upgrade (redeploy with current values, no image rebuild) +helm-upgrade: + @cd deployments/scripts && ./helm-deploy-amp.sh + +# Full sync: build all images, import, restart pods +helm-sync: helm-build + @echo "Restarting AMP deployments..." + @kubectl rollout restart deployment -n $(AMP_NAMESPACE) --context $(CLUSTER_CONTEXT) 2>/dev/null || true + @echo "Waiting for rollout..." + @kubectl rollout status deployment -n $(AMP_NAMESPACE) --context $(CLUSTER_CONTEXT) --timeout=300s 2>/dev/null || true + @echo "Sync complete." + +# Fast sync: build API only, import, restart API pod +helm-sync-api: helm-build-api + @echo "Importing API image into k3d..." + @k3d image import amp-api:$(AMP_IMAGE_TAG) -c $(CLUSTER_NAME) + @echo "Restarting API deployment..." + @kubectl rollout restart deployment/$(AMP_RELEASE_NAME)-agent-manager-service -n $(AMP_NAMESPACE) --context $(CLUSTER_CONTEXT) + @kubectl rollout status deployment/$(AMP_RELEASE_NAME)-agent-manager-service -n $(AMP_NAMESPACE) --context $(CLUSTER_CONTEXT) --timeout=120s + @echo "API synced." + +# Fast sync: build Console only, import, restart Console pod +helm-sync-console: helm-build-console + @echo "Importing Console image into k3d..." + @k3d image import amp-console:$(AMP_IMAGE_TAG) -c $(CLUSTER_NAME) + @echo "Restarting Console deployment..." 
+ @kubectl rollout restart deployment/$(AMP_RELEASE_NAME)-console -n $(AMP_NAMESPACE) --context $(CLUSTER_CONTEXT) + @kubectl rollout status deployment/$(AMP_RELEASE_NAME)-console -n $(AMP_NAMESPACE) --context $(CLUSTER_CONTEXT) --timeout=120s + @echo "Console synced." + +# Restart all AMP deployments (no rebuild) +helm-restart: + @echo "Restarting all AMP deployments..." + @kubectl rollout restart deployment -n $(AMP_NAMESPACE) --context $(CLUSTER_CONTEXT) + @kubectl rollout status deployment -n $(AMP_NAMESPACE) --context $(CLUSTER_CONTEXT) --timeout=300s 2>/dev/null || true + @echo "Restart complete." + +# Show pod and service status +helm-status: + @echo "=== Pods ===" + @kubectl get pods -n $(AMP_NAMESPACE) --context $(CLUSTER_CONTEXT) 2>/dev/null || echo "Namespace $(AMP_NAMESPACE) not found" + @echo "" + @echo "=== Services ===" + @kubectl get svc -n $(AMP_NAMESPACE) --context $(CLUSTER_CONTEXT) 2>/dev/null || echo "Namespace $(AMP_NAMESPACE) not found" + +# Alias for helm-status +status: helm-status + +# Tail logs of all pods labelled with the AMP release; kubectl's own error is shown if this fails +helm-logs: + @kubectl logs -f -n $(AMP_NAMESPACE) --context $(CLUSTER_CONTEXT) --all-containers --max-log-requests=10 -l "app.kubernetes.io/instance=$(AMP_RELEASE_NAME)" --prefix || \ + { echo "Could not tail AMP logs (release $(AMP_RELEASE_NAME), namespace $(AMP_NAMESPACE)); run 'make helm-status' to check the deployment." >&2; exit 1; } + +# Tail API logs +helm-api-logs: + @kubectl logs -f -n $(AMP_NAMESPACE) --context $(CLUSTER_CONTEXT) -l "app.kubernetes.io/component=agent-manager-service" --all-containers --prefix + +# Alias for helm-api-logs +api-logs: helm-api-logs + +# Tail Console logs +helm-console-logs: + @kubectl logs -f -n $(AMP_NAMESPACE) --context $(CLUSTER_CONTEXT) -l "app.kubernetes.io/component=console" --all-containers --prefix + +# psql into PostgreSQL pod +helm-db-connect: + @kubectl exec -it -n $(AMP_NAMESPACE) --context $(CLUSTER_CONTEXT) \ + $$(kubectl get pod -n $(AMP_NAMESPACE) --context $(CLUSTER_CONTEXT) -l 
"app.kubernetes.io/name=postgresql" -o jsonpath='{.items[0].metadata.name}') \ + -- psql -U agentmanager -d agentmanager + +# ============================================================================ # OpenChoreo lifecycle management +# ============================================================================ + openchoreo-up: - @echo "πŸš€ Starting OpenChoreo cluster..." - @docker start openchoreo-local-control-plane openchoreo-local-worker 2>/dev/null || (echo "⚠️ Cluster not found. Run 'make setup-k3d setup-openchoreo' first." && exit 1) - @echo "⏳ Waiting for nodes to be ready..." + @echo "Starting OpenChoreo cluster..." + @docker start openchoreo-local-control-plane openchoreo-local-worker 2>/dev/null || (echo "Cluster not found. Run 'make setup-k3d setup-openchoreo' first." && exit 1) + @echo "Waiting for nodes to be ready..." @for i in 1 2 3 4 5 6 7 8 9 10 11 12; do \ kubectl get nodes --context kind-openchoreo-local >/dev/null 2>&1 && \ kubectl wait --for=condition=Ready nodes --all --timeout=10s --context kind-openchoreo-local >/dev/null 2>&1 && break || sleep 10; \ done - @echo "⏳ Waiting for core system pods..." + @echo "Waiting for core system pods..." @kubectl wait --for=condition=Ready pods --all -n kube-system --timeout=90s --context kind-openchoreo-local 2>/dev/null || true - @echo "⏳ Waiting for OpenChoreo control plane..." + @echo "Waiting for OpenChoreo control plane..." @kubectl wait --for=condition=Ready pods --all -n openchoreo-control-plane --timeout=90s --context kind-openchoreo-local 2>/dev/null || true - @echo "⏳ Waiting for OpenChoreo data plane..." + @echo "Waiting for OpenChoreo data plane..." @kubectl wait --for=condition=Ready pods --all -n openchoreo-data-plane --timeout=90s --context kind-openchoreo-local 2>/dev/null || true - @echo "⏳ Waiting for OpenChoreo observability plane..." + @echo "Waiting for OpenChoreo observability plane..." 
@kubectl wait --for=condition=Ready pods --all -n openchoreo-observability-plane --timeout=90s --context kind-openchoreo-local 2>/dev/null || true - @echo "βœ… OpenChoreo cluster is running" + @echo "OpenChoreo cluster is running" @echo "" - @echo "πŸ“Š Cluster status:" + @echo "Cluster status:" @kubectl get pods --all-namespaces --context kind-openchoreo-local | grep -v "Running\|Completed" | head -1 || echo " All pods are running!" openchoreo-down: - @echo "πŸ›‘ Stopping OpenChoreo cluster..." - @docker stop openchoreo-local-control-plane openchoreo-local-worker 2>/dev/null && echo "βœ… OpenChoreo cluster stopped (containers preserved)" || echo "⚠️ Cluster not running" + @echo "Stopping OpenChoreo cluster..." + @docker stop openchoreo-local-control-plane openchoreo-local-worker 2>/dev/null && echo "OpenChoreo cluster stopped (containers preserved)" || echo "Cluster not running" openchoreo-status: - @echo "πŸ“Š OpenChoreo Cluster Status:" + @echo "OpenChoreo Cluster Status:" @echo "" @echo "Docker Containers:" @docker ps -a --filter name=openchoreo-local --format "table {{.Names}}\t{{.Status}}\t{{.Ports}}" 2>/dev/null || echo "No containers found" @@ -195,14 +371,16 @@ openchoreo-status: port-forward: @cd deployments/scripts && ./port-forward.sh -# Database commands +# ============================================================================ +# Database & Service Debugging (Compose mode) +# ============================================================================ + db-connect: @docker exec -it agent-manager-db psql -U agentmanager -d agentmanager db-logs: @docker logs -f agent-manager-db -# Service debugging service-logs: @docker logs -f agent-manager-service @@ -212,6 +390,33 @@ service-shell: console-logs: @docker logs -f agent-manager-console +# ============================================================================ +# Pause / Resume (saves laptop resources) +# ============================================================================ + 
+CLUSTER_NAME := openchoreo-local-v0.14.0 + +dev-pause: + @echo "Stopping k3d cluster..." + @k3d cluster stop $(CLUSTER_NAME) 2>/dev/null || echo "Cluster not running" + @echo "Stopping Colima..." + @colima stop 2>/dev/null || echo "Colima not running" + @echo "All stopped. CPU and memory freed." + +dev-resume: + @echo "Starting Colima..." + @colima start + @echo "Starting k3d cluster..." + @k3d cluster start $(CLUSTER_NAME) + @echo "Waiting for cluster to be ready..." + @for i in 1 2 3 4 5 6 7 8 9 10; do \ + kubectl cluster-info --context $(CLUSTER_CONTEXT) >/dev/null 2>&1 && break || sleep 3; \ + done + @kubectl cluster-info --context $(CLUSTER_CONTEXT) >/dev/null 2>&1 && echo "Cluster is ready." || { echo "Cluster did not become ready in time." >&2; exit 1; } + +# ============================================================================ # Cleanup +# ============================================================================ + teardown: @cd deployments/scripts && ./teardown.sh diff --git a/deployments/dev-cluster-config.yaml b/deployments/dev-cluster-config.yaml new file mode 100644 index 000000000..a3e92a541 --- /dev/null +++ b/deployments/dev-cluster-config.yaml @@ -0,0 +1,114 @@ +# k3d cluster config for Helm dev mode (DEV_MODE=helm) +# Based on single-cluster-config.yaml with additional AMP service port mappings. +# This allows all services (OpenChoreo + AMP) to run in a single k3d cluster. 
+apiVersion: k3d.io/v1alpha5 +kind: Simple +metadata: + name: openchoreo-local-v0.14.0 +image: rancher/k3s:v1.32.9-k3s1 +servers: 1 +agents: 0 +kubeAPI: + hostPort: "6550" +ports: + # === OpenChoreo Ports (same as single-cluster-config.yaml) === + # Control Plane uses port range 8xxx + # HTTP traffic to OpenChoreo UI and API (Kgateway LoadBalancer on port 80) + - port: 8080:80 + nodeFilters: + - loadbalancer + # HTTPS traffic to OpenChoreo UI and API (Kgateway LoadBalancer on port 443) + - port: 8443:443 + nodeFilters: + - loadbalancer + # Data Plane uses port range 19xxx + # HTTP traffic to workloads via Gateway + - port: 19080:19080 + nodeFilters: + - loadbalancer + # HTTPS traffic to workloads via Gateway + - port: 19443:19443 + nodeFilters: + - loadbalancer + # Build Plane uses port range 10xxx + # Argo Workflows UI for development testing + - port: 10081:2746 + nodeFilters: + - loadbalancer + # Container Registry for storing built images + - port: 10082:5000 + nodeFilters: + - loadbalancer + # Observability Plane uses port range 11xxx + # Observer API + - port: 11080:8080 + nodeFilters: + - loadbalancer + # OpenSearch Dashboard + - port: 11081:5601 + nodeFilters: + - loadbalancer + # OpenSearch API for Fluent Bit data pushing + - port: 11082:9200 + nodeFilters: + - loadbalancer + + # === AMP Service Ports === + # Console (React frontend) + - port: 3000:3000 + nodeFilters: + - loadbalancer + # Agent Manager API + - port: 9000:9000 + nodeFilters: + - loadbalancer + # Internal API / Gateway Management + - port: 9243:9243 + nodeFilters: + - loadbalancer + # Traces Observer Service + - port: 9098:9098 + nodeFilters: + - loadbalancer + + # === OTel / Observability Ports === + # Data Prepper HTTP source + - port: 21893:21893 + nodeFilters: + - loadbalancer + # OTel gRPC + - port: 22893:22893 + nodeFilters: + - loadbalancer + # OTel HTTP + - port: 22894:22894 + nodeFilters: + - loadbalancer +options: + k3s: + extraArgs: + # Add host.k3d.internal to API server TLS 
certificate SANs. + # This allows consistent DataPlane configuration across single and multi-cluster setups + # where Control Plane pods can access the API server via host.k3d.internal:6550 + - arg: "--tls-san=host.k3d.internal" + nodeFilters: + - server:* + # Configure kubelet eviction thresholds to prevent resource exhaustion + - arg: "--kubelet-arg=eviction-hard=imagefs.available<1%,nodefs.available<1%" + nodeFilters: + - server:* + - arg: "--kubelet-arg=eviction-minimum-reclaim=imagefs.available=1%,nodefs.available=1%" + nodeFilters: + - server:* + # Disable Traefik to avoid conflicts with OpenChoreo Gateway controller + - arg: "--disable=traefik" + nodeFilters: + - server:* +# Configure insecure registries for HTTP access +# Allows kubelet to pull images from Build Plane registry via HTTP +registries: + config: | + mirrors: + "host.k3d.internal:10082": + endpoint: + - http://host.k3d.internal:10082 diff --git a/deployments/scripts/build-and-import.sh b/deployments/scripts/build-and-import.sh new file mode 100755 index 000000000..56134467b --- /dev/null +++ b/deployments/scripts/build-and-import.sh @@ -0,0 +1,86 @@ +#!/bin/bash +set -e + +# Build Docker images from production Dockerfiles and import them into k3d. +# Usage: +# ./build-and-import.sh # Build and import all components +# ./build-and-import.sh api # Build and import API only +# ./build-and-import.sh console api # Build and import Console and API +# +# Supported components: api, console, traces-observer, evaluation-job + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +ROOT_DIR="$(cd "$SCRIPT_DIR/../.." 
&& pwd)" + +source "$SCRIPT_DIR/env.sh" + +# Build context per component (relative to ROOT_DIR). A case statement is used +# instead of "declare -A" so the script also runs on bash 3.2 (macOS /bin/bash). +component_context() { + case "$1" in + api) echo "agent-manager-service" ;; + console) echo "console" ;; + traces-observer) echo "traces-observer-service" ;; + evaluation-job) echo "evaluation-job" ;; + *) return 1 ;; + esac +} + +# Image name per component: the component name prefixed with "amp-". +component_image() { echo "amp-$1"; } + +ALL_COMPONENTS="api console traces-observer evaluation-job" + +# Determine which components to build +if [ $# -eq 0 ]; then + COMPONENTS="$ALL_COMPONENTS" +else + COMPONENTS="$*" +fi + +# Validate component names +for comp in $COMPONENTS; do + if ! component_context "$comp" >/dev/null; then + echo "Unknown component: $comp" + echo "Valid components: $ALL_COMPONENTS" + exit 1 + fi +done + +echo "=== Building and importing images into k3d ===" +echo "" + +# Verify k3d cluster exists +if ! k3d cluster list 2>/dev/null | grep -q "${CLUSTER_NAME}"; then + echo "k3d cluster '${CLUSTER_NAME}' not found. Run 'make setup-k3d' first." + exit 1 +fi + +FAILED="" + +for comp in $COMPONENTS; do + IMAGE="$(component_image "$comp"):${AMP_IMAGE_TAG}" + CONTEXT="${ROOT_DIR}/$(component_context "$comp")" + + echo "Building ${comp} -> ${IMAGE}..." + if docker build -t "$IMAGE" "$CONTEXT" --quiet; then + echo "Importing ${IMAGE} into k3d cluster..." + if k3d image import "$IMAGE" -c "${CLUSTER_NAME}"; then + echo "${comp} ready." + else + echo "Failed to import ${comp}." + FAILED="$FAILED $comp" + fi + else + echo "Failed to build ${comp}." + FAILED="$FAILED $comp" + fi + echo "" +done + +if [ -n "$FAILED" ]; then + echo "Failed components:${FAILED}" + exit 1 +fi + +echo "All images built and imported successfully." 
diff --git a/deployments/scripts/env.sh b/deployments/scripts/env.sh index 53f8cc3be..f564c5ba4 100644 --- a/deployments/scripts/env.sh +++ b/deployments/scripts/env.sh @@ -3,3 +3,8 @@ OPENCHOREO_VERSION="0.14.0" OPENCHOREO_PATCH_VERSION="0.0.0-b53c6dc3" CLUSTER_NAME="openchoreo-local-v${OPENCHOREO_VERSION}" CLUSTER_CONTEXT="k3d-${CLUSTER_NAME}" + +# AMP (Agent Management Platform) variables +AMP_NAMESPACE="wso2-amp" +AMP_RELEASE_NAME="amp" +AMP_IMAGE_TAG="0.0.0-dev" diff --git a/deployments/scripts/helm-deploy-amp.sh b/deployments/scripts/helm-deploy-amp.sh new file mode 100755 index 000000000..a1fe427ae --- /dev/null +++ b/deployments/scripts/helm-deploy-amp.sh @@ -0,0 +1,70 @@ +#!/bin/bash +set -e + +# Deploy AMP to the k3d cluster using Helm. +# Usage: +# ./helm-deploy-amp.sh # Install or upgrade +# ./helm-deploy-amp.sh --uninstall # Uninstall (preserves cluster) + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +ROOT_DIR="$(cd "$SCRIPT_DIR/../.." && pwd)" +CHART_DIR="$ROOT_DIR/deployments/helm-charts/wso2-agent-manager" +VALUES_FILE="$ROOT_DIR/deployments/values/values-local.yaml" + +source "$SCRIPT_DIR/env.sh" + +if [ "$1" = "--uninstall" ]; then + echo "=== Uninstalling AMP from k3d ===" + if helm status "$AMP_RELEASE_NAME" -n "$AMP_NAMESPACE" --kube-context "${CLUSTER_CONTEXT}" &>/dev/null; then + helm uninstall "$AMP_RELEASE_NAME" -n "$AMP_NAMESPACE" --kube-context "${CLUSTER_CONTEXT}" + echo "AMP uninstalled. Cluster and namespace preserved." + else + echo "AMP release '${AMP_RELEASE_NAME}' not found in namespace '${AMP_NAMESPACE}'." + fi + exit 0 +fi + +echo "=== Deploying AMP to k3d cluster ===" +echo "" + +# Verify cluster is accessible +if ! kubectl cluster-info --context "${CLUSTER_CONTEXT}" &>/dev/null; then + echo "k3d cluster '${CLUSTER_NAME}' is not accessible." + echo "Run 'make setup-k3d' or 'k3d cluster start ${CLUSTER_NAME}' first." 
+ exit 1 +fi + +# Create namespace if it doesn't exist +kubectl create namespace "$AMP_NAMESPACE" --context "${CLUSTER_CONTEXT}" --dry-run=client -o yaml | \ + kubectl apply --context "${CLUSTER_CONTEXT}" -f - + +# Update Helm dependencies +echo "Updating Helm chart dependencies..." +helm dependency update "$CHART_DIR" +echo "" + +# Install or upgrade +echo "Running helm upgrade --install..." +helm upgrade --install "$AMP_RELEASE_NAME" "$CHART_DIR" \ + --namespace "$AMP_NAMESPACE" \ + --kube-context "${CLUSTER_CONTEXT}" \ + --values "$VALUES_FILE" \ + --wait \ + --timeout 5m + +echo "" +echo "Waiting for deployments to be ready..." +kubectl wait --for=condition=Available deployment --all \ + -n "$AMP_NAMESPACE" \ + --context "${CLUSTER_CONTEXT}" \ + --timeout=300s 2>/dev/null || true + +echo "" +echo "AMP deployed successfully!" +echo "" +echo "Services:" +echo " Console: http://localhost:3000" +echo " API: http://localhost:9000" +echo "" +echo "Status:" +kubectl get pods -n "$AMP_NAMESPACE" --context "${CLUSTER_CONTEXT}" diff --git a/deployments/scripts/setup-k3d.sh b/deployments/scripts/setup-k3d.sh index 9a7befc7b..f8785ebb2 100755 --- a/deployments/scripts/setup-k3d.sh +++ b/deployments/scripts/setup-k3d.sh @@ -9,7 +9,14 @@ cd "$SCRIPT_DIR" source "$SCRIPT_DIR/env.sh" -echo "=== Setting up k3d Cluster for OpenChoreo ===" +# Select k3d config based on DEV_MODE +if [ "${DEV_MODE}" = "helm" ]; then + K3D_CONFIG="../dev-cluster-config.yaml" + echo "=== Setting up k3d Cluster for OpenChoreo + AMP (Helm mode) ===" +else + K3D_CONFIG="../single-cluster-config.yaml" + echo "=== Setting up k3d Cluster for OpenChoreo ===" +fi # Check prerequisites if ! command -v k3d &> /dev/null; then @@ -52,6 +59,27 @@ if k3d cluster list 2>/dev/null | grep -q "${CLUSTER_NAME}"; then done fi + # When using Helm mode, verify AMP ports are mapped + if [ "${DEV_MODE}" = "helm" ]; then + echo "" + echo "πŸ” Checking AMP port mappings..." 
+ MISSING_PORTS="" + for PORT in 3000 9000; do + if ! docker port "k3d-${CLUSTER_NAME}-serverlb" "${PORT}/tcp" &>/dev/null; then + MISSING_PORTS="${MISSING_PORTS} ${PORT}" + fi + done + if [ -n "$MISSING_PORTS" ]; then + echo "⚠️ AMP ports not mapped:${MISSING_PORTS}" + echo " The cluster was created without AMP port mappings." + echo " To fix, delete and recreate the cluster:" + echo " k3d cluster delete ${CLUSTER_NAME}" + echo " DEV_MODE=helm make setup-k3d" + else + echo "βœ… AMP ports are mapped correctly" + fi + fi + echo "" echo "Cluster info:" kubectl cluster-info --context ${CLUSTER_CONTEXT} @@ -62,9 +90,9 @@ else echo "πŸ“ Creating shared directory for OpenChoreo..." mkdir -p /tmp/k3d-shared - # Create k3d cluster with OpenChoreo configuration - echo "πŸš€ Creating k3d cluster with OpenChoreo configuration..." - k3d cluster create --config ../single-cluster-config.yaml + # Create k3d cluster with appropriate configuration + echo "πŸš€ Creating k3d cluster with config: ${K3D_CONFIG}..." + k3d cluster create --config "${K3D_CONFIG}" echo "" echo "βœ… k3d cluster created successfully!" diff --git a/deployments/scripts/setup-openchoreo.sh b/deployments/scripts/setup-openchoreo.sh index f66673ca8..b1e6e38db 100755 --- a/deployments/scripts/setup-openchoreo.sh +++ b/deployments/scripts/setup-openchoreo.sh @@ -42,6 +42,29 @@ echo "" # Step 1: Install OpenChoreo Control Plane echo "1️⃣ Installing/Upgrading OpenChoreo Control Plane..." echo " This may take up to 10 minutes..." + +# On re-runs, the CA extractor job uses kubectl apply (client-side) to write the +# real cert into cluster-gateway-ca, which claims field ownership. The next helm +# upgrade then conflicts. Fix by removing the client-side-apply field manager +# before upgrading, so Helm can take ownership cleanly. +# On re-runs, fix two issues: +# 1. The CA extractor job uses kubectl apply (client-side) to write the real cert, +# which claims field ownership. 
Remove the field manager so helm upgrade won't conflict. +# 2. helm upgrade resets the CA ConfigMap to a placeholder, but the extractor job +# won't re-run because Helm doesn't recreate completed jobs. Delete it so Helm +# recreates it and it extracts the real cert again. +if kubectl get configmap cluster-gateway-ca -n openchoreo-control-plane &>/dev/null; then + kubectl annotate configmap cluster-gateway-ca -n openchoreo-control-plane \ + kubectl.kubernetes.io/last-applied-configuration- --overwrite 2>/dev/null || true + FIELD_INDEX=$(kubectl get configmap cluster-gateway-ca -n openchoreo-control-plane \ + --show-managed-fields -o json | jq '.metadata.managedFields | to_entries[] | select(.value.manager == "kubectl-client-side-apply") | .key' 2>/dev/null) + if [ -n "$FIELD_INDEX" ]; then + kubectl patch configmap cluster-gateway-ca -n openchoreo-control-plane \ + --type=json -p="[{\"op\":\"remove\",\"path\":\"/metadata/managedFields/${FIELD_INDEX}\"}]" 2>/dev/null || true + fi +fi +kubectl delete job cluster-gateway-ca-extractor -n openchoreo-control-plane 2>/dev/null || true + helm upgrade --install openchoreo-control-plane oci://ghcr.io/openchoreo/helm-charts/openchoreo-control-plane \ --version ${OPENCHOREO_PATCH_VERSION} \ --namespace openchoreo-control-plane \ @@ -54,12 +77,32 @@ kubectl wait -n openchoreo-control-plane --for=condition=available --timeout=300 if kubectl get jobs -n openchoreo-control-plane --no-headers 2>/dev/null | grep -q .; then kubectl wait -n openchoreo-control-plane --for=condition=complete --timeout=300s job --all fi + +# Verify the CA extractor has replaced the placeholder with a real certificate. +# The Helm chart deploys a placeholder ConfigMap and a Job that extracts the real +# CA from a TLS secret. On re-runs, helm upgrade resets the ConfigMap to the +# placeholder, so we must wait for the extractor to overwrite it again. +echo "⏳ Waiting for cluster-gateway-ca to contain a real certificate..." 
+for i in $(seq 1 30); do + CA_CONTENT=$(kubectl get configmap cluster-gateway-ca -n openchoreo-control-plane -o jsonpath='{.data.ca\.crt}' 2>/dev/null) + if echo "$CA_CONTENT" | grep -q "BEGIN CERTIFICATE"; then + echo "βœ… cluster-gateway-ca has a valid certificate" + break + fi + if [ "$i" -eq 30 ]; then + echo "⚠️ Timeout waiting for real CA certificate. The extractor job may need to be re-run:" + echo " kubectl delete job cluster-gateway-ca-extractor -n openchoreo-control-plane" + echo " Then re-run: make setup-openchoreo" + fi + sleep 5 +done + echo "βœ… OpenChoreo Control Plane ready" echo "" # Create Certificate for Control Plane TLS echo "πŸ“œ Creating Certificate for Control Plane TLS..." -kubectl apply -f - </dev/null || true helm upgrade --install openchoreo-data-plane oci://ghcr.io/openchoreo/helm-charts/openchoreo-data-plane \ --version ${OPENCHOREO_VERSION} \ --namespace openchoreo-data-plane \ @@ -88,7 +133,7 @@ helm upgrade --install openchoreo-data-plane oci://ghcr.io/openchoreo/helm-chart # Create Certificate for Gateway TLS echo "πŸ“œ Creating Certificate for Gateway TLS..." -kubectl apply -f - </dev/null | base64 -d || echo "") if [ -n "$CA_CERT" ]; then - kubectl apply -f - </dev/null || \ + kubectl wait --for=condition=Ready pod -l app=cluster-agent -n openchoreo-data-plane --timeout=120s 2>/dev/null || \ + echo "⚠️ Data Plane agent pods may still be starting" +kubectl logs -n openchoreo-data-plane -l app=cluster-agent --tail=10 2>/dev/null || true echo "Verify API Platform Gateway pods:" kubectl get pods -n openchoreo-data-plane --selector="app.kubernetes.io/instance=api-platform-default-gateway" echo "βœ… OpenChoreo Data Plane ready" @@ -162,6 +211,8 @@ echo "⏳ Waiting for Docker Registry to be ready..." kubectl wait --for=condition=available deployment/registry-docker-registry -n openchoreo-build-plane --timeout=120s echo "4️⃣ Installing/Upgrading OpenChoreo Build Plane..." 
+# Delete completed copy-ca job so helm recreates it on upgrade +kubectl delete job -n openchoreo-build-plane -l app=cluster-agent 2>/dev/null || true helm upgrade --install openchoreo-build-plane oci://ghcr.io/openchoreo/helm-charts/openchoreo-build-plane \ --version ${OPENCHOREO_VERSION} \ --namespace openchoreo-build-plane \ @@ -172,7 +223,7 @@ helm upgrade --install openchoreo-build-plane oci://ghcr.io/openchoreo/helm-char echo "5️⃣ Registering Build Plane..." BP_CA_CERT=$(kubectl get secret cluster-agent-tls -n openchoreo-build-plane -o jsonpath='{.data.ca\.crt}' 2>/dev/null | base64 -d || echo "") if [ -n "$BP_CA_CERT" ]; then - kubectl apply -f - </dev/null | grep -q copy-ca; then + kubectl wait -n openchoreo-build-plane --for=condition=complete --timeout=120s job -l app=cluster-agent 2>/dev/null || true +fi +echo "⏳ Waiting for build plane agent..." +kubectl wait --for=condition=Ready pod -l app=cluster-agent -n openchoreo-build-plane --timeout=120s 2>/dev/null || \ + echo "⚠️ Build plane agent pods may still be starting" + # Verify BuildPlane echo "" echo "πŸ” Verifying BuildPlane ..." kubectl get buildplane -n default -kubectl logs -n openchoreo-build-plane -l app=cluster-agent --tail=10 +kubectl logs -n openchoreo-build-plane -l app=cluster-agent --tail=10 2>/dev/null || true echo "βœ… OpenChoreo Build Plane ready" echo "" @@ -247,8 +307,10 @@ else echo " This may take up to 15 minutes..." 
kubectl create namespace openchoreo-observability-plane --dry-run=client -o yaml | kubectl apply -f - - kubectl apply -f $1/deployments/values/oc-collector-configmap.yaml -n openchoreo-observability-plane + kubectl apply --server-side --force-conflicts -f $1/deployments/values/oc-collector-configmap.yaml -n openchoreo-observability-plane + # Delete completed copy-ca job so helm recreates it on upgrade + kubectl delete job -n openchoreo-observability-plane -l app=cluster-agent 2>/dev/null || true helm install openchoreo-observability-plane oci://ghcr.io/openchoreo/helm-charts/openchoreo-observability-plane \ --version ${OPENCHOREO_VERSION} \ --namespace openchoreo-observability-plane \ @@ -277,7 +339,7 @@ fi echo "5️⃣ Registering Observability Plane..." OP_CA_CERT=$(kubectl get secret cluster-agent-tls -n openchoreo-observability-plane -o jsonpath='{.data.ca\.crt}' 2>/dev/null | base64 -d || echo "") if [ -n "$OP_CA_CERT" ]; then - kubectl apply -f - </dev/null | grep -q copy-ca; then + kubectl wait -n openchoreo-observability-plane --for=condition=complete --timeout=120s job -l app=cluster-agent 2>/dev/null || true +fi +echo "⏳ Waiting for observability plane agent..." +kubectl wait --for=condition=Ready pod -l app=cluster-agent -n openchoreo-observability-plane --timeout=120s 2>/dev/null || \ + echo "⚠️ Observability plane agent pods may still be starting" + # Verify ObservabilityPlane echo "" echo "πŸ” Verifying ObservabilityPlane ..." kubectl get observabilityplane -n default -kubectl logs -n openchoreo-observability-plane -l app=cluster-agent --tail=10 +kubectl logs -n openchoreo-observability-plane -l app=cluster-agent --tail=10 2>/dev/null || true echo "βœ… OpenChoreo Observability Plane ready" echo "" @@ -358,13 +429,13 @@ echo " Creating local development config..." 
cp "${SCRIPT_DIR}/../values/api-platform-operator-full-config.yaml" "${SCRIPT_DIR}/../values/api-platform-operator-local-config.yaml" # Update JWKS URI for local development sed -i '' 's|http://amp-api.wso2-amp.svc.cluster.local:9000/auth/external/jwks.json|http://host.docker.internal:9000/auth/external/jwks.json|g' "${SCRIPT_DIR}/../values/api-platform-operator-local-config.yaml" -kubectl apply -f "${SCRIPT_DIR}/../values/api-platform-operator-local-config.yaml" +kubectl apply --server-side --force-conflicts -f "${SCRIPT_DIR}/../values/api-platform-operator-local-config.yaml" echo "βœ… Gateway configuration applied" echo "" # Apply Gateway and API Resources echo "1️⃣3️⃣ Applying Gateway and API Resources..." -kubectl apply -f "${SCRIPT_DIR}/../values/obs-gateway.yaml" +kubectl apply --server-side --force-conflicts -f "${SCRIPT_DIR}/../values/obs-gateway.yaml" echo "⏳ Waiting for Gateway to be ready..." if kubectl wait --for=condition=Programmed gateway/obs-gateway -n openchoreo-data-plane --timeout=180s; then @@ -378,7 +449,7 @@ echo "Gateway status:" kubectl get gateway obs-gateway -n openchoreo-data-plane -o yaml echo "" -kubectl apply -f "${SCRIPT_DIR}/../values/otel-collector-rest-api.yaml" +kubectl apply --server-side --force-conflicts -f "${SCRIPT_DIR}/../values/otel-collector-rest-api.yaml" echo "⏳ Waiting for RestApi to be programmed..." if kubectl wait --for=condition=Programmed restapi/traces-api-secure -n openchoreo-data-plane --timeout=120s; then diff --git a/deployments/values/values-local.yaml b/deployments/values/values-local.yaml new file mode 100644 index 000000000..bda306840 --- /dev/null +++ b/deployments/values/values-local.yaml @@ -0,0 +1,58 @@ +# Local development overrides for helm-based deployment (DEV_MODE=helm) +# Used by: DEV_MODE=helm make dev-up, make helm-install, make helm-upgrade +# +# All services run inside the k3d cluster. Images are built locally and +# imported via `k3d image import` (pullPolicy: Never). 
+ +agentManagerService: + image: + repository: amp-api + tag: "0.0.0-dev" + pullPolicy: Never + service: + type: LoadBalancer + port: 9000 + config: + logLevel: "DEBUG" + corsAllowedOrigin: "*" + # In-cluster OpenChoreo URL (no host.docker.internal needed) + openChoreo: + baseURL: "http://openchoreo-api.openchoreo-control-plane:8080/api/v1" + # In-cluster Thunder IDP + keyManager: + jwksUrl: "http://amp-thunder-extension-service.amp-thunder.svc.cluster.local:8090/oauth2/jwks" + oidc: + tokenUrl: "http://amp-thunder-extension-service.amp-thunder.svc.cluster.local:8090/oauth2/token" + # In-cluster OpenBao + openbao: + url: "http://amp-secrets-openbao.amp-secrets.svc.cluster.local:8200" + # In-cluster OTel endpoint + otel: + exporterEndpoint: "http://obs-gateway-gateway-router.openchoreo-data-plane.svc.cluster.local:22893/otel" + # In-cluster observer + observerURL: "http://observer.openchoreo-observability-plane.svc.cluster.local:8080" + traceObserverURL: "http://amp-traces-observer.openchoreo-observability-plane.svc.cluster.local:9098" + +console: + image: + repository: amp-console + tag: "0.0.0-dev" + pullPolicy: Never + service: + type: LoadBalancer + config: + disableAuth: "true" + apiBaseUrl: "http://localhost:9000" + instrumentationUrl: "http://localhost:21893" + +dbMigration: + image: + repository: amp-api + tag: "0.0.0-dev" + pullPolicy: Never + +postgresql: + primary: + persistence: + enabled: true + size: 5Gi