From 05ea2f4bf7c54e2a161ee7d299c31983a045d209 Mon Sep 17 00:00:00 2001 From: Stu Alexander Date: Thu, 15 Jan 2026 20:04:18 +0000 Subject: [PATCH 01/45] Added k8s deploy and wip with service instances # Conflicts: # ushadow/backend/src/routers/tailscale.py # ushadow/backend/src/services/deployment_manager.py --- ushadow/backend/src/models/deployment.py | 71 ++ ushadow/backend/src/models/instance.py | 3 + ushadow/backend/src/models/kubernetes.py | 67 + ushadow/backend/src/models/unode.py | 10 +- ushadow/backend/src/routers/instances.py | 43 + ushadow/backend/src/routers/kubernetes.py | 362 +++++- .../src/services/deployment_backends.py | 565 +++++++++ .../src/services/deployment_manager.py | 495 ++++++-- .../backend/src/services/instance_manager.py | 16 +- .../src/services/kubernetes_manager.py | 847 ++++++++++++- .../src/components/DeployToK8sModal.tsx | 499 ++++++++ .../frontend/src/components/EnvVarEditor.tsx | 194 +++ .../chronicle/ChronicleConversations.tsx | 2 +- .../src/components/memories/MemoryTable.tsx | 4 +- .../src/components/wiring/WiringBoard.tsx | 86 ++ ushadow/frontend/src/pages/ClusterPage.tsx | 102 +- ushadow/frontend/src/pages/InstancesPage.tsx | 1119 +++++++++++++++-- .../src/pages/KubernetesClustersPage.tsx | 452 ++++++- ushadow/frontend/src/services/api.ts | 41 + 19 files changed, 4691 insertions(+), 287 deletions(-) create mode 100644 ushadow/backend/src/services/deployment_backends.py create mode 100644 ushadow/frontend/src/components/DeployToK8sModal.tsx create mode 100644 ushadow/frontend/src/components/EnvVarEditor.tsx diff --git a/ushadow/backend/src/models/deployment.py b/ushadow/backend/src/models/deployment.py index 2d3c7ff0..1bf2a2a3 100644 --- a/ushadow/backend/src/models/deployment.py +++ b/ushadow/backend/src/models/deployment.py @@ -82,6 +82,67 @@ class Config: use_enum_values = True +class ResolvedServiceDefinition(BaseModel): + """ + A fully resolved service definition with all variables substituted. 
+ + This model represents a service after docker-compose config resolution, + where all ${VAR:-default} syntax has been replaced with actual values. + Used as input for all deployment targets (local docker, unode, kubernetes). + """ + service_id: str = Field(..., description="Unique identifier for the service") + name: str = Field(..., description="Service name from compose") + image: str = Field(..., description="Fully resolved Docker image (no variables)") + + # Ports as list of strings in Docker format: "host:container" or "container" + ports: List[str] = Field( + default_factory=list, + description="Resolved port mappings: ['3000:8080', '9090']" + ) + + # Fully resolved environment variables + environment: Dict[str, str] = Field( + default_factory=dict, + description="Resolved environment variables (no placeholders)" + ) + + # Container configuration (already resolved) + volumes: List[str] = Field(default_factory=list) + command: Optional[str] = None + restart_policy: str = Field(default="unless-stopped") + network: Optional[str] = None + + # Health check configuration + health_check_path: Optional[str] = None + health_check_port: Optional[int] = None + + # Original compose file reference + compose_file: str = Field(..., description="Source compose file path") + compose_service_name: str = Field(..., description="Service name in compose file") + + # Metadata + description: Optional[str] = None + namespace: Optional[str] = None # From x-ushadow metadata + requires: List[str] = Field(default_factory=list) # Capability dependencies + + class Config: + json_schema_extra = { + "example": { + "service_id": "openmemory-compose:mem0-ui", + "name": "mem0-ui", + "image": "ghcr.io/ushadow-io/u-mem0-ui:latest", + "ports": ["3002:3000"], + "environment": { + "VITE_API_URL": "http://localhost:8765", + "API_URL": "http://mem0:8765" + }, + "compose_file": "/compose/openmemory-compose.yaml", + "compose_service_name": "mem0-ui", + "namespace": "openmemory" + } + } + + class 
Deployment(BaseModel): """ A service deployed to a specific node. @@ -133,6 +194,16 @@ class Deployment(BaseModel): description="Primary exposed port for the service" ) + # Backend information + backend_type: str = Field( + default="docker", + description="Deployment backend type: 'docker' or 'kubernetes'" + ) + backend_metadata: Dict[str, Any] = Field( + default_factory=dict, + description="Backend-specific metadata (container_id for Docker, pod info for K8s)" + ) + class Config: use_enum_values = True diff --git a/ushadow/backend/src/models/instance.py b/ushadow/backend/src/models/instance.py index ef5fa81b..4bcd3ee9 100644 --- a/ushadow/backend/src/models/instance.py +++ b/ushadow/backend/src/models/instance.py @@ -75,6 +75,9 @@ class Template(BaseModel): # Availability status (for local providers - whether service is running) available: bool = Field(default=True, description="Whether local service is running/reachable") + # Installation status (for compose services - whether service is installed) + installed: bool = Field(default=True, description="Whether service is installed (default or user-added)") + class InstanceConfig(BaseModel): """Configuration values for an instance.""" diff --git a/ushadow/backend/src/models/kubernetes.py b/ushadow/backend/src/models/kubernetes.py index bcbc0d8b..c5ebcbb6 100644 --- a/ushadow/backend/src/models/kubernetes.py +++ b/ushadow/backend/src/models/kubernetes.py @@ -27,6 +27,12 @@ class KubernetesCluster(BaseModel): node_count: Optional[int] = Field(None, description="Number of nodes in cluster") namespace: str = Field("default", description="Default namespace for deployments") + # Infrastructure scan results (cached per namespace) + infra_scans: Dict[str, Dict[str, Any]] = Field( + default_factory=dict, + description="Cached infrastructure scan results per namespace. 
Key: namespace, Value: scan results" + ) + # Labels for organization labels: Dict[str, str] = Field(default_factory=dict) @@ -46,6 +52,61 @@ class Config: } +class KubernetesNode(BaseModel): + """Represents a node in a Kubernetes cluster.""" + + name: str = Field(..., description="Node name") + cluster_id: str = Field(..., description="Parent cluster ID") + + # Node status + status: str = Field(..., description="Node status: Ready, NotReady, Unknown") + ready: bool = Field(False, description="Whether node is ready") + + # Node info + kubelet_version: Optional[str] = Field(None, description="Kubelet version") + os_image: Optional[str] = Field(None, description="OS image") + kernel_version: Optional[str] = Field(None, description="Kernel version") + container_runtime: Optional[str] = Field(None, description="Container runtime") + + # Capacity and allocatable resources + cpu_capacity: Optional[str] = Field(None, description="Total CPU capacity") + memory_capacity: Optional[str] = Field(None, description="Total memory capacity") + cpu_allocatable: Optional[str] = Field(None, description="Allocatable CPU") + memory_allocatable: Optional[str] = Field(None, description="Allocatable memory") + + # Node roles + roles: List[str] = Field(default_factory=list, description="Node roles: control-plane, worker") + + # Addresses + internal_ip: Optional[str] = Field(None, description="Internal IP address") + external_ip: Optional[str] = Field(None, description="External IP address") + hostname: Optional[str] = Field(None, description="Hostname") + + # Taints and labels + taints: List[Dict[str, str]] = Field(default_factory=list, description="Node taints") + labels: Dict[str, str] = Field(default_factory=dict, description="Node labels") + + class Config: + json_schema_extra = { + "example": { + "name": "node-1", + "cluster_id": "prod-us-west", + "status": "Ready", + "ready": True, + "kubelet_version": "v1.28.3", + "os_image": "Ubuntu 22.04 LTS", + "container_runtime": 
"containerd://1.7.2", + "cpu_capacity": "4", + "memory_capacity": "16Gi", + "cpu_allocatable": "3.5", + "memory_allocatable": "14Gi", + "roles": ["worker"], + "internal_ip": "10.0.1.5", + "labels": {"node.kubernetes.io/instance-type": "n2-standard-4"} + } + } + + class KubernetesClusterCreate(BaseModel): """Request to add a new Kubernetes cluster.""" @@ -90,6 +151,12 @@ class KubernetesDeploymentSpec(BaseModel): } ) + # Health checks + health_check_path: Optional[str] = Field( + None, + description="HTTP path for liveness/readiness probes. Set to None to disable health checks. Default: /health" + ) + # Advanced options annotations: Dict[str, str] = Field(default_factory=dict) labels: Dict[str, str] = Field(default_factory=dict) diff --git a/ushadow/backend/src/models/unode.py b/ushadow/backend/src/models/unode.py index 0add67c6..38de68dd 100644 --- a/ushadow/backend/src/models/unode.py +++ b/ushadow/backend/src/models/unode.py @@ -30,9 +30,16 @@ class UNodePlatform(str, Enum): UNKNOWN = "unknown" +class UNodeType(str, Enum): + """Type of deployment target.""" + DOCKER = "docker" # Traditional Docker host (worker/leader) + KUBERNETES = "kubernetes" # Kubernetes cluster + + class UNodeCapabilities(BaseModel): """Capabilities of a u-node.""" can_run_docker: bool = True + can_run_kubernetes: bool = False can_run_gpu: bool = False can_become_leader: bool = False available_memory_mb: int = 0 @@ -42,8 +49,9 @@ class UNodeCapabilities(BaseModel): class UNodeBase(BaseModel): """Base u-node model.""" - hostname: str = Field(..., description="Tailscale hostname") + hostname: str = Field(..., description="Tailscale hostname or K8s cluster ID") display_name: Optional[str] = None + type: UNodeType = Field(UNodeType.DOCKER, description="Deployment target type") role: UNodeRole = UNodeRole.WORKER platform: UNodePlatform = UNodePlatform.UNKNOWN tailscale_ip: Optional[str] = None diff --git a/ushadow/backend/src/routers/instances.py b/ushadow/backend/src/routers/instances.py index 
7e277deb..038903e3 100644 --- a/ushadow/backend/src/routers/instances.py +++ b/ushadow/backend/src/routers/instances.py @@ -60,9 +60,50 @@ async def list_templates( try: from src.services.compose_registry import get_compose_registry registry = get_compose_registry() + settings = get_settings_store() + + # Get installed service names (same logic as ServiceOrchestrator) + default_services = await settings.get("default_services") or [] + installed_names = set(default_services) + removed_names = set() + + logger.info(f"Loading templates - default_services from settings: {default_services}") + logger.info(f"Loading templates - installed_names: {installed_names}") + + user_installed = await settings.get("installed_services") or {} + for service_name, state in user_installed.items(): + if hasattr(state, 'items'): + state_dict = dict(state) + else: + state_dict = state if isinstance(state, dict) else {} + + is_removed = state_dict.get("removed") == True + is_added = state_dict.get("added") == True + + if is_removed: + installed_names.discard(service_name) + removed_names.add(service_name) + elif is_added: + installed_names.add(service_name) + for service in registry.get_services(): if source and source != "compose": continue + + # Check if service is installed + is_installed = False + if service.service_name in removed_names: + is_installed = False + elif service.service_name in installed_names: + is_installed = True + else: + compose_base = service.compose_file.stem.replace('-compose', '') + if compose_base in installed_names: + is_installed = True + + # Debug logging + logger.info(f"Service: {service.service_name}, installed: {is_installed}, installed_names: {installed_names}") + templates.append(Template( id=service.service_id, source=TemplateSource.COMPOSE, @@ -75,6 +116,7 @@ async def list_templates( compose_file=str(service.namespace) if service.namespace else None, service_name=service.service_name, mode="local", + installed=is_installed, )) except Exception as e: 
logger.warning(f"Failed to load compose templates: {e}") @@ -133,6 +175,7 @@ async def list_templates( tags=provider.tags, configured=is_configured, available=is_available, + installed=True, # Providers are always "installed" (discoverable) )) except Exception as e: logger.warning(f"Failed to load provider templates: {e}") diff --git a/ushadow/backend/src/routers/kubernetes.py b/ushadow/backend/src/routers/kubernetes.py index 897369e1..e5bb056f 100644 --- a/ushadow/backend/src/routers/kubernetes.py +++ b/ushadow/backend/src/routers/kubernetes.py @@ -1,15 +1,20 @@ """Kubernetes cluster management API endpoints.""" import logging -from typing import List +import os +from typing import List, Dict, Any, Optional from fastapi import APIRouter, HTTPException, Depends +from pydantic import BaseModel, Field from src.models.kubernetes import ( KubernetesCluster, KubernetesClusterCreate, + KubernetesDeploymentSpec, + KubernetesNode, ) from src.services.kubernetes_manager import get_kubernetes_manager +from src.services.compose_registry import get_compose_registry from src.services.auth import get_current_user from src.models.user import User @@ -17,6 +22,24 @@ router = APIRouter() +# Request/Response models +class ScanInfraRequest(BaseModel): + namespace: str = "default" + + +class DeployServiceRequest(BaseModel): + service_id: str + namespace: str = "default" + node_name: Optional[str] = Field(None, description="Target K8s node name for deployment (uses nodeSelector)") + k8s_spec: Optional[KubernetesDeploymentSpec] = None + + +class CreateEnvmapRequest(BaseModel): + service_name: str + namespace: str = "default" + env_vars: Dict[str, str] + + @router.post("", response_model=KubernetesCluster) async def add_cluster( cluster_data: KubernetesClusterCreate, @@ -62,6 +85,29 @@ async def get_cluster( return cluster +@router.get("/{cluster_id}/nodes", response_model=List[KubernetesNode]) +async def list_cluster_nodes( + cluster_id: str, + current_user: User = 
Depends(get_current_user) +): + """ + List all nodes in a Kubernetes cluster. + + Returns node information including status, capacity, roles, and labels. + Useful for selecting target nodes for deployments. + """ + k8s_manager = await get_kubernetes_manager() + + try: + nodes = await k8s_manager.list_nodes(cluster_id) + return nodes + except ValueError as e: + raise HTTPException(status_code=404, detail=str(e)) + except Exception as e: + logger.error(f"Error listing nodes for cluster {cluster_id}: {e}") + raise HTTPException(status_code=500, detail="Failed to list nodes") + + @router.delete("/{cluster_id}") async def remove_cluster( cluster_id: str, @@ -76,3 +122,317 @@ async def remove_cluster( raise HTTPException(status_code=404, detail="Cluster not found") return {"success": True, "message": f"Cluster {cluster_id} removed"} + + +@router.get("/services/available") +async def get_available_services( + current_user: User = Depends(get_current_user) +): + """ + Get list of all available services from compose registry. + + Returns discovered services that can be deployed to Kubernetes. + """ + registry = get_compose_registry() + services = registry.get_services() + + # Convert to serializable format + return { + "services": [ + { + "service_id": svc.service_id, + "service_name": svc.service_name, + "display_name": svc.display_name or svc.service_name, + "description": svc.description, + "image": svc.image, + "requires": svc.requires, + "namespace": svc.namespace, + } + for svc in services + ] + } + + +@router.get("/services/infra") +async def get_infra_services( + current_user: User = Depends(get_current_user) +): + """ + Get list of infrastructure services. + + Returns services identified as infrastructure (databases, caches, etc.). 
+ """ + registry = get_compose_registry() + services = registry.get_services() + + # Filter for infrastructure services + # Infrastructure typically doesn't have 'requires' (it's what others require) + # Or it's in the infra namespace/has infra in the name + infra_services = [ + svc for svc in services + if (not svc.requires or # No dependencies = infrastructure + "infra" in svc.namespace.lower() if svc.namespace else False or + any(name in svc.service_name.lower() + for name in ["mongo", "redis", "postgres", "qdrant", "neo4j"])) + ] + + return { + "services": [ + { + "service_id": svc.service_id, + "service_name": svc.service_name, + "display_name": svc.display_name or svc.service_name, + "type": svc.service_name.split("-")[0] if "-" in svc.service_name else svc.service_name, + "image": svc.image, + } + for svc in infra_services + ] + } + + +@router.post("/{cluster_id}/scan-infra") +async def scan_cluster_for_infra( + cluster_id: str, + request: ScanInfraRequest, + current_user: User = Depends(get_current_user) +): + """ + Scan a Kubernetes cluster for existing infrastructure services. + + Checks if common infrastructure (mongo, redis, postgres, etc.) + is already running in the cluster. 
+ """ + k8s_manager = await get_kubernetes_manager() + + # Verify cluster exists + cluster = await k8s_manager.get_cluster(cluster_id) + if not cluster: + raise HTTPException(status_code=404, detail="Cluster not found") + + results = await k8s_manager.scan_cluster_for_infra_services( + cluster_id, + request.namespace + ) + + # Store scan results in cluster document for caching + await k8s_manager.update_cluster_infra_scan( + cluster_id, + request.namespace, + results + ) + + return { + "cluster_id": cluster_id, + "namespace": request.namespace, + "infra_services": results + } + + +@router.post("/{cluster_id}/envmap") +async def create_or_update_envmap( + cluster_id: str, + request: CreateEnvmapRequest, + current_user: User = Depends(get_current_user) +): + """ + Create or update ConfigMap and Secret for a service. + + Takes environment variables and automatically separates: + - Sensitive values (keys, passwords) → Kubernetes Secret + - Non-sensitive values → Kubernetes ConfigMap + """ + k8s_manager = await get_kubernetes_manager() + + # Verify cluster exists + cluster = await k8s_manager.get_cluster(cluster_id) + if not cluster: + raise HTTPException(status_code=404, detail="Cluster not found") + + try: + configmap_name, secret_name = await k8s_manager.get_or_create_envmap( + cluster_id, + request.namespace, + request.service_name, + request.env_vars + ) + + return { + "success": True, + "configmap": configmap_name or None, + "secret": secret_name or None, + "namespace": request.namespace + } + except Exception as e: + logger.error(f"Failed to create envmap: {e}") + raise HTTPException(status_code=500, detail=str(e)) + + +@router.post("/{cluster_id}/deploy") +async def deploy_service_to_cluster( + cluster_id: str, + request: DeployServiceRequest, + current_user: User = Depends(get_current_user) +): + """ + Deploy a service to a Kubernetes cluster. 
+ + Uses centralized resolution via deployment_manager to ensure all variables + are resolved before generating K8s manifests. + + Supports targeting specific nodes via node_name parameter. + """ + from src.services.deployment_manager import get_deployment_manager + + k8s_manager = await get_kubernetes_manager() + deployment_manager = get_deployment_manager() + + # Verify cluster exists + cluster = await k8s_manager.get_cluster(cluster_id) + if not cluster: + raise HTTPException(status_code=404, detail="Cluster not found") + + # Resolve service with all variables substituted + try: + resolved_service = await deployment_manager.resolve_service_for_deployment( + request.service_id + ) + except ValueError as e: + logger.error(f"Failed to resolve service {request.service_id}: {e}") + raise HTTPException(status_code=400, detail=f"Service resolution failed: {e}") + + # Convert ResolvedServiceDefinition to dict format for kubernetes_manager + service_def = { + "service_id": resolved_service.service_id, + "name": resolved_service.name, + "image": resolved_service.image, + "environment": resolved_service.environment, + "ports": resolved_service.ports, # Already in ["3002:3000"] format + } + + # Add node selector if node_name specified + k8s_spec = request.k8s_spec or KubernetesDeploymentSpec() + if request.node_name: + # Add node selector to ensure pod runs on specific node + if not k8s_spec.labels: + k8s_spec.labels = {} + k8s_spec.labels["kubernetes.io/hostname"] = request.node_name + logger.info(f"Targeting node: {request.node_name}") + + # Deploy + success, message = await k8s_manager.deploy_to_kubernetes( + cluster_id, + service_def, + request.namespace, + k8s_spec + ) + + if not success: + raise HTTPException(status_code=500, detail=message) + + return { + "success": True, + "message": message, + "service_id": resolved_service.service_id, + "namespace": request.namespace, + "node_name": request.node_name + } + + +@router.get("/{cluster_id}/pods") +async def list_pods( 
+ cluster_id: str, + namespace: str = "ushadow", + current_user: User = Depends(get_current_user) +): + """ + List all pods in a namespace. + + Returns pod name, status, restarts, age, and labels. + """ + k8s_manager = await get_kubernetes_manager() + + # Verify cluster exists + cluster = await k8s_manager.get_cluster(cluster_id) + if not cluster: + raise HTTPException(status_code=404, detail="Cluster not found") + + try: + pods = await k8s_manager.list_pods(cluster_id, namespace) + return {"pods": pods, "namespace": namespace} + except Exception as e: + logger.error(f"Failed to list pods: {e}") + raise HTTPException(status_code=500, detail=str(e)) + + +@router.get("/{cluster_id}/pods/{pod_name}/logs") +async def get_pod_logs( + cluster_id: str, + pod_name: str, + namespace: str = "ushadow", + previous: bool = False, + tail_lines: int = 100, + current_user: User = Depends(get_current_user) +): + """ + Get logs from a pod. + + Args: + cluster_id: The cluster ID + pod_name: Name of the pod + namespace: Kubernetes namespace (default: ushadow) + previous: Get logs from previous (crashed) container + tail_lines: Number of lines to return from end of logs + """ + k8s_manager = await get_kubernetes_manager() + + # Verify cluster exists + cluster = await k8s_manager.get_cluster(cluster_id) + if not cluster: + raise HTTPException(status_code=404, detail="Cluster not found") + + try: + logs = await k8s_manager.get_pod_logs( + cluster_id, + pod_name, + namespace, + previous=previous, + tail_lines=tail_lines + ) + return { + "pod_name": pod_name, + "namespace": namespace, + "previous": previous, + "logs": logs + } + except Exception as e: + logger.error(f"Failed to get pod logs: {e}") + raise HTTPException(status_code=500, detail=str(e)) + + +@router.get("/{cluster_id}/pods/{pod_name}/events") +async def get_pod_events( + cluster_id: str, + pod_name: str, + namespace: str = "ushadow", + current_user: User = Depends(get_current_user) +): + """ + Get events for a pod (useful for 
debugging why pod won't start). + """ + k8s_manager = await get_kubernetes_manager() + + # Verify cluster exists + cluster = await k8s_manager.get_cluster(cluster_id) + if not cluster: + raise HTTPException(status_code=404, detail="Cluster not found") + + try: + events = await k8s_manager.get_pod_events(cluster_id, pod_name, namespace) + return { + "pod_name": pod_name, + "namespace": namespace, + "events": events + } + except Exception as e: + logger.error(f"Failed to get pod events: {e}") + raise HTTPException(status_code=500, detail=str(e)) diff --git a/ushadow/backend/src/services/deployment_backends.py b/ushadow/backend/src/services/deployment_backends.py new file mode 100644 index 00000000..33cde746 --- /dev/null +++ b/ushadow/backend/src/services/deployment_backends.py @@ -0,0 +1,565 @@ +"""Deployment backend implementations for different target types.""" + +from abc import ABC, abstractmethod +from typing import Dict, Any, Optional, List +import logging +import httpx +from datetime import datetime + +from src.models.deployment import ResolvedServiceDefinition, Deployment, DeploymentStatus +from src.models.unode import UNode, UNodeType +from src.services.kubernetes_manager import KubernetesManager +import docker + +logger = logging.getLogger(__name__) + + +class DeploymentBackend(ABC): + """Base class for deployment backends.""" + + @abstractmethod + async def deploy( + self, + unode: UNode, + resolved_service: ResolvedServiceDefinition, + deployment_id: str, + namespace: Optional[str] = None, + ) -> Deployment: + """ + Deploy a service to this backend. 
+ + Args: + unode: The target unode (Docker host or K8s cluster) + resolved_service: Fully resolved service definition + deployment_id: Unique deployment identifier + namespace: Optional namespace (K8s only) + + Returns: + Deployment object with status and metadata + """ + pass + + @abstractmethod + async def get_status( + self, + unode: UNode, + deployment: Deployment + ) -> DeploymentStatus: + """Get current status of a deployment.""" + pass + + @abstractmethod + async def stop( + self, + unode: UNode, + deployment: Deployment + ) -> bool: + """Stop a running deployment.""" + pass + + @abstractmethod + async def remove( + self, + unode: UNode, + deployment: Deployment + ) -> bool: + """Remove a deployment completely.""" + pass + + @abstractmethod + async def get_logs( + self, + unode: UNode, + deployment: Deployment, + tail: int = 100 + ) -> List[str]: + """Get logs from a deployment.""" + pass + + +class DockerDeploymentBackend(DeploymentBackend): + """Deployment backend for Docker hosts (traditional unodes).""" + + UNODE_MANAGER_PORT = 8444 + + def _is_local_deployment(self, unode: UNode) -> bool: + """Check if this is a local deployment (same host as backend).""" + import os + env_name = os.getenv("COMPOSE_PROJECT_NAME", "").strip() or "ushadow" + return unode.hostname == env_name or unode.hostname == "localhost" + + def _get_target_ip(self, unode: UNode) -> str: + """Get target IP for unode (localhost for local, tailscale IP for remote).""" + if self._is_local_deployment(unode): + return "localhost" + elif unode.tailscale_ip: + return unode.tailscale_ip + else: + raise ValueError(f"Unode {unode.hostname} has no Tailscale IP configured") + + async def _deploy_local( + self, + unode: UNode, + resolved_service: ResolvedServiceDefinition, + deployment_id: str, + container_name: str + ) -> Deployment: + """Deploy directly to local Docker (bypasses unode manager).""" + try: + docker_client = docker.from_env() + + # Parse ports to Docker format + port_bindings = {} 
+ exposed_ports = {} + for port_str in resolved_service.ports: + if ":" in port_str: + host_port, container_port = port_str.split(":") + port_key = f"{container_port}/tcp" + port_bindings[port_key] = int(host_port) + exposed_ports[port_key] = {} + else: + port_key = f"{port_str}/tcp" + exposed_ports[port_key] = {} + + # Create container + logger.info(f"Creating container {container_name} from image {resolved_service.image}") + container = docker_client.containers.run( + image=resolved_service.image, + name=container_name, + environment=resolved_service.environment, + ports=port_bindings, + volumes=resolved_service.volumes if resolved_service.volumes else None, + command=resolved_service.command, + restart_policy={"Name": resolved_service.restart_policy or "unless-stopped"}, + network=resolved_service.network or "bridge", + detach=True, + remove=False, + ) + + logger.info(f"Container {container_name} created: {container.id[:12]}") + + # Extract exposed port + exposed_port = None + if resolved_service.ports: + first_port = resolved_service.ports[0] + if ":" in first_port: + exposed_port = int(first_port.split(":")[0]) + else: + exposed_port = int(first_port) + + # Build deployment object + deployment = Deployment( + id=deployment_id, + service_id=resolved_service.service_id, + unode_hostname=unode.hostname, + status=DeploymentStatus.RUNNING, + container_id=container.id, + container_name=container_name, + deployed_config={ + "image": resolved_service.image, + "ports": resolved_service.ports, + "environment": resolved_service.environment, + }, + exposed_port=exposed_port, + backend_type="docker", + backend_metadata={ + "container_id": container.id, + "local_deployment": True, + } + ) + + logger.info(f"✅ Local Docker deployment successful: {container_name}") + return deployment + + except docker.errors.ImageNotFound as e: + logger.error(f"Image not found: {resolved_service.image}") + raise ValueError(f"Docker image not found: {resolved_service.image}") + except 
docker.errors.APIError as e: + logger.error(f"Docker API error: {e}") + raise ValueError(f"Docker deployment failed: {str(e)}") + except Exception as e: + logger.error(f"Local deployment error: {e}", exc_info=True) + raise ValueError(f"Local deployment error: {str(e)}") + + async def deploy( + self, + unode: UNode, + resolved_service: ResolvedServiceDefinition, + deployment_id: str, + namespace: Optional[str] = None, + ) -> Deployment: + """Deploy to a Docker host via unode manager API or local Docker.""" + logger.info(f"Deploying {resolved_service.service_id} to Docker host {unode.hostname}") + + # Generate container name + container_name = f"{resolved_service.compose_service_name}-{deployment_id[:8]}" + + # Check if this is a local deployment + if self._is_local_deployment(unode): + # Use Docker directly for local deployments + logger.info("Using local Docker for deployment") + return await self._deploy_local(unode, resolved_service, deployment_id, container_name) + + # Build deploy payload for remote unode manager + payload = { + "service_id": resolved_service.service_id, + "container_name": container_name, + "image": resolved_service.image, + "ports": resolved_service.ports, + "environment": resolved_service.environment, + "volumes": resolved_service.volumes, + "command": resolved_service.command, + "restart_policy": resolved_service.restart_policy, + "network": resolved_service.network, + "health_check_path": resolved_service.health_check_path, + } + + # Get target IP (tailscale IP for remote) + target_ip = self._get_target_ip(unode) + logger.info(f"Deploying to remote unode via {target_ip}") + + # Send deploy command to unode manager + url = f"http://{target_ip}:{self.UNODE_MANAGER_PORT}/api/deploy" + + async with httpx.AsyncClient(timeout=300.0) as client: + try: + response = await client.post(url, json=payload) + response.raise_for_status() + result = response.json() + + # Build deployment object + deployment = Deployment( + id=deployment_id, + 
service_id=resolved_service.service_id, + unode_hostname=unode.hostname, + status=DeploymentStatus.RUNNING, + container_id=result.get("container_id"), + container_name=container_name, + deployed_config={ + "image": resolved_service.image, + "ports": resolved_service.ports, + "environment": resolved_service.environment, + }, + access_url=result.get("access_url"), + exposed_port=result.get("exposed_port"), + backend_type="docker", + backend_metadata={ + "container_id": result.get("container_id"), + "unode_manager_port": self.UNODE_MANAGER_PORT, + } + ) + + logger.info(f"✅ Docker deployment successful: {container_name}") + return deployment + + except httpx.HTTPStatusError as e: + logger.error(f"Deploy failed: {e.response.text}") + raise ValueError(f"Deploy failed: {e.response.text}") + except Exception as e: + logger.error(f"Deploy error: {str(e)}") + raise ValueError(f"Deploy error: {str(e)}") + + async def get_status( + self, + unode: UNode, + deployment: Deployment + ) -> DeploymentStatus: + """Get container status from Docker host.""" + target_ip = self._get_target_ip(unode) + url = f"http://{target_ip}:{self.UNODE_MANAGER_PORT}/api/status/{deployment.container_name}" + + async with httpx.AsyncClient(timeout=10.0) as client: + try: + response = await client.get(url) + response.raise_for_status() + result = response.json() + + status_map = { + "running": DeploymentStatus.RUNNING, + "exited": DeploymentStatus.STOPPED, + "dead": DeploymentStatus.FAILED, + "paused": DeploymentStatus.STOPPED, + } + + return status_map.get(result.get("status", ""), DeploymentStatus.FAILED) + + except Exception as e: + logger.error(f"Failed to get status: {e}") + return DeploymentStatus.FAILED + + async def stop( + self, + unode: UNode, + deployment: Deployment + ) -> bool: + """Stop a Docker container.""" + target_ip = self._get_target_ip(unode) + url = f"http://{target_ip}:{self.UNODE_MANAGER_PORT}/api/stop/{deployment.container_name}" + + async with httpx.AsyncClient(timeout=30.0) as 
client: + try: + response = await client.post(url) + response.raise_for_status() + return True + except Exception as e: + logger.error(f"Failed to stop container: {e}") + return False + + async def remove( + self, + unode: UNode, + deployment: Deployment + ) -> bool: + """Remove a Docker container.""" + target_ip = self._get_target_ip(unode) + url = f"http://{target_ip}:{self.UNODE_MANAGER_PORT}/api/remove/{deployment.container_name}" + + async with httpx.AsyncClient(timeout=30.0) as client: + try: + response = await client.delete(url) + response.raise_for_status() + return True + except Exception as e: + logger.error(f"Failed to remove container: {e}") + return False + + async def get_logs( + self, + unode: UNode, + deployment: Deployment, + tail: int = 100 + ) -> List[str]: + """Get Docker container logs.""" + target_ip = self._get_target_ip(unode) + url = f"http://{target_ip}:{self.UNODE_MANAGER_PORT}/api/logs/{deployment.container_name}?tail={tail}" + + async with httpx.AsyncClient(timeout=30.0) as client: + try: + response = await client.get(url) + response.raise_for_status() + result = response.json() + return result.get("logs", []) + except Exception as e: + logger.error(f"Failed to get logs: {e}") + return [f"Error getting logs: {str(e)}"] + + +class KubernetesDeploymentBackend(DeploymentBackend): + """Deployment backend for Kubernetes clusters.""" + + def __init__(self, k8s_manager: KubernetesManager): + self.k8s_manager = k8s_manager + + async def deploy( + self, + unode: UNode, + resolved_service: ResolvedServiceDefinition, + deployment_id: str, + namespace: Optional[str] = None, + ) -> Deployment: + """Deploy to a Kubernetes cluster.""" + logger.info(f"Deploying {resolved_service.service_id} to K8s cluster {unode.hostname}") + + # Use unode.hostname as cluster_id for K8s unodes + cluster_id = unode.hostname + namespace = namespace or unode.metadata.get("default_namespace", "default") + + # Use kubernetes_manager.deploy_to_kubernetes + result = await 
self.k8s_manager.deploy_to_kubernetes( + cluster_id=cluster_id, + service_id=resolved_service.service_id, + namespace=namespace, + ) + + # Build deployment object + deployment = Deployment( + id=deployment_id, + service_id=resolved_service.service_id, + unode_hostname=unode.hostname, + status=DeploymentStatus.RUNNING, + container_id=None, # K8s uses pod names, not container IDs + container_name=result["deployment_name"], + deployed_config={ + "image": resolved_service.image, + "namespace": namespace, + }, + backend_type="kubernetes", + backend_metadata={ + "cluster_id": cluster_id, + "namespace": namespace, + "deployment_name": result["deployment_name"], + "instance_id": result["instance_id"], + } + ) + + logger.info(f"✅ K8s deployment successful: {result['deployment_name']}") + return deployment + + async def get_status( + self, + unode: UNode, + deployment: Deployment + ) -> DeploymentStatus: + """Get pod status from Kubernetes.""" + cluster_id = unode.hostname + namespace = deployment.backend_metadata.get("namespace", "default") + deployment_name = deployment.backend_metadata.get("deployment_name") + + try: + # Get deployment status from K8s + client = await self.k8s_manager.get_client(cluster_id) + apps_v1 = client.AppsV1Api() + + k8s_deployment = apps_v1.read_namespaced_deployment( + name=deployment_name, + namespace=namespace + ) + + # Check replicas + if k8s_deployment.status.ready_replicas and k8s_deployment.status.ready_replicas > 0: + return DeploymentStatus.RUNNING + elif k8s_deployment.status.replicas == 0: + return DeploymentStatus.STOPPED + else: + return DeploymentStatus.DEPLOYING + + except Exception as e: + logger.error(f"Failed to get K8s status: {e}") + return DeploymentStatus.FAILED + + async def stop( + self, + unode: UNode, + deployment: Deployment + ) -> bool: + """Scale K8s deployment to 0 replicas.""" + cluster_id = unode.hostname + namespace = deployment.backend_metadata.get("namespace", "default") + deployment_name = 
deployment.backend_metadata.get("deployment_name") + + try: + client = await self.k8s_manager.get_client(cluster_id) + apps_v1 = client.AppsV1Api() + + # Scale to 0 + body = {"spec": {"replicas": 0}} + apps_v1.patch_namespaced_deployment_scale( + name=deployment_name, + namespace=namespace, + body=body + ) + + logger.info(f"Scaled K8s deployment {deployment_name} to 0 replicas") + return True + + except Exception as e: + logger.error(f"Failed to stop K8s deployment: {e}") + return False + + async def remove( + self, + unode: UNode, + deployment: Deployment + ) -> bool: + """Delete K8s deployment, service, and configmaps.""" + cluster_id = unode.hostname + namespace = deployment.backend_metadata.get("namespace", "default") + deployment_name = deployment.backend_metadata.get("deployment_name") + + try: + client = await self.k8s_manager.get_client(cluster_id) + apps_v1 = client.AppsV1Api() + core_v1 = client.CoreV1Api() + + # Delete deployment + apps_v1.delete_namespaced_deployment( + name=deployment_name, + namespace=namespace + ) + + # Delete service (same name as deployment) + try: + core_v1.delete_namespaced_service( + name=deployment_name, + namespace=namespace + ) + except: + pass # Service might not exist + + # Delete configmaps (named with deployment prefix) + try: + configmaps = core_v1.list_namespaced_config_map( + namespace=namespace, + label_selector=f"app.kubernetes.io/instance={deployment_name}" + ) + for cm in configmaps.items: + core_v1.delete_namespaced_config_map( + name=cm.metadata.name, + namespace=namespace + ) + except: + pass + + logger.info(f"Deleted K8s deployment {deployment_name}") + return True + + except Exception as e: + logger.error(f"Failed to remove K8s deployment: {e}") + return False + + async def get_logs( + self, + unode: UNode, + deployment: Deployment, + tail: int = 100 + ) -> List[str]: + """Get logs from K8s pods.""" + cluster_id = unode.hostname + namespace = deployment.backend_metadata.get("namespace", "default") + 
deployment_name = deployment.backend_metadata.get("deployment_name") + + try: + client = await self.k8s_manager.get_client(cluster_id) + core_v1 = client.CoreV1Api() + + # Find pods for this deployment + pods = core_v1.list_namespaced_pod( + namespace=namespace, + label_selector=f"app.kubernetes.io/name={deployment_name}" + ) + + if not pods.items: + return [f"No pods found for deployment {deployment_name}"] + + # Get logs from first pod + pod_name = pods.items[0].metadata.name + logs = core_v1.read_namespaced_pod_log( + name=pod_name, + namespace=namespace, + tail_lines=tail + ) + + return logs.split("\n") + + except Exception as e: + logger.error(f"Failed to get K8s logs: {e}") + return [f"Error getting logs: {str(e)}"] + + +def get_deployment_backend(unode: UNode, k8s_manager: Optional[KubernetesManager] = None) -> DeploymentBackend: + """ + Factory function to get the appropriate deployment backend for a unode. + + Args: + unode: The target unode + k8s_manager: KubernetesManager instance (required for K8s backends) + + Returns: + Appropriate DeploymentBackend implementation + """ + if unode.type == UNodeType.KUBERNETES: + if not k8s_manager: + raise ValueError("KubernetesManager required for K8s deployments") + return KubernetesDeploymentBackend(k8s_manager) + else: + return DockerDeploymentBackend() diff --git a/ushadow/backend/src/services/deployment_manager.py b/ushadow/backend/src/services/deployment_manager.py index d0e1e27d..bc9f06bd 100644 --- a/ushadow/backend/src/services/deployment_manager.py +++ b/ushadow/backend/src/services/deployment_manager.py @@ -16,8 +16,11 @@ ServiceDefinitionUpdate, Deployment, DeploymentStatus, + ResolvedServiceDefinition, ) +from src.models.unode import UNode from src.services.compose_registry import get_compose_registry +from src.services.deployment_backends import get_deployment_backend logger = logging.getLogger(__name__) @@ -80,10 +83,31 @@ async def initialize(self): await self.deployments_collection.create_index("id", 
unique=True) await self.deployments_collection.create_index("service_id") await self.deployments_collection.create_index("unode_hostname") - await self.deployments_collection.create_index( - [("service_id", 1), ("unode_hostname", 1)], - unique=True - ) + + # Handle compound index with potential conflicts from old versions + try: + await self.deployments_collection.create_index( + [("service_id", 1), ("unode_hostname", 1)], + unique=True + ) + except Exception as e: + # If index exists with different spec (e.g., with partialFilterExpression), + # drop it and recreate + if "IndexKeySpecsConflict" in str(e) or "index has the same name" in str(e): + logger.warning("Dropping conflicting index 'service_id_1_unode_hostname_1' and recreating") + try: + await self.deployments_collection.drop_index("service_id_1_unode_hostname_1") + await self.deployments_collection.create_index( + [("service_id", 1), ("unode_hostname", 1)], + unique=True + ) + except Exception as drop_error: + logger.error(f"Failed to drop and recreate index: {drop_error}") + # Index might not exist or other issue, continue anyway + else: + # Re-raise if it's a different error + raise + logger.info("DeploymentManager initialized") async def _get_session(self) -> aiohttp.ClientSession: @@ -99,6 +123,223 @@ async def close(self): if self._http_session and not self._http_session.closed: await self._http_session.close() + # ========================================================================= + # Centralized Service Resolution + # ========================================================================= + + async def resolve_service_for_deployment( + self, + service_id: str + ) -> "ResolvedServiceDefinition": + """ + Resolve all variables for a service using docker-compose config. + + This is the single source of truth for variable resolution across all + deployment targets (local docker, remote unode, kubernetes). + + Steps: + 1. Get service from compose registry + 2. 
Get user's saved env configuration + 3. Run `docker-compose -f config ` with resolved env vars + 4. Parse the resolved YAML output (all ${VAR:-default} substituted) + 5. Return ResolvedServiceDefinition with clean values + + Args: + service_id: Service identifier (e.g., "openmemory-compose:mem0-ui") + + Returns: + ResolvedServiceDefinition with all variables resolved + + Raises: + ValueError: If service not found or resolution fails + """ + import subprocess + import yaml + from pathlib import Path + from src.models.deployment import ResolvedServiceDefinition + + compose_registry = get_compose_registry() + + # Get service from compose registry + service = compose_registry.get_service(service_id) + if not service: + raise ValueError(f"Service not found: {service_id}") + + # Get user's saved env configuration (same as docker_manager does) + from src.services.docker_manager import get_docker_manager + docker_manager = get_docker_manager() + + # Build environment variables with user configuration + subprocess_env, container_env = await docker_manager._build_env_vars_for_service( + service.service_name + ) + + # Get compose file path (DiscoveredService has compose_file as direct attribute) + compose_file = str(service.compose_file) + if not compose_file: + raise ValueError(f"Service {service_id} has no compose_file") + + # Translate to container paths (same logic as docker_manager) + if compose_file.startswith("compose/"): + compose_path = Path("/") / compose_file + elif compose_file.startswith("docker-compose"): + compose_path = Path("/config").parent / compose_file + if not compose_path.exists(): + compose_path = Path(".") / compose_file + else: + compose_path = Path(compose_file) + + if not compose_path.exists(): + raise ValueError(f"Compose file not found: {compose_path}") + + compose_dir = compose_path.parent if compose_path.parent.exists() else Path(".") + + # Determine project name (namespace) + project_name = service.namespace if service.namespace else None + if 
not project_name: + project_name = subprocess_env.get("COMPOSE_PROJECT_NAME", "ushadow") + + # Run docker-compose config to resolve all variables + cmd = ["docker", "compose", "-f", str(compose_path)] + if project_name: + cmd.extend(["-p", project_name]) + cmd.append("config") + + logger.info( + f"Resolving service {service_id} with docker-compose config: " + f"{' '.join(cmd)}" + ) + + try: + result = subprocess.run( + cmd, + env=subprocess_env, # All env vars for ${VAR} substitution + cwd=str(compose_dir), + capture_output=True, + text=True, + timeout=30 + ) + + if result.returncode != 0: + logger.error(f"docker-compose config failed: {result.stderr}") + raise ValueError(f"Failed to resolve compose file: {result.stderr}") + + # Parse the resolved YAML + resolved_compose = yaml.safe_load(result.stdout) + services = resolved_compose.get("services", {}) + + # Get our specific service + resolved_service = services.get(service.service_name) + if not resolved_service: + raise ValueError( + f"Service {service.service_name} not found in resolved compose output" + ) + + # Extract resolved values + image = resolved_service.get("image", "") + if not image: + raise ValueError(f"Service {service.service_name} has no image defined") + + # Parse ports from resolved compose + ports = [] + for port_def in resolved_service.get("ports", []): + if isinstance(port_def, dict): + # Long format: {target: 3000, published: 3002} + target = port_def.get("target") + published = port_def.get("published") + if target and published: + ports.append(f"{published}:{target}") + elif target: + ports.append(str(target)) + else: + # Short format: "3002:3000" or "3000" + ports.append(str(port_def)) + + # Get resolved environment + environment = resolved_service.get("environment", {}) + if isinstance(environment, list): + # Convert list format ["KEY=value"] to dict + env_dict = {} + for env_item in environment: + if "=" in env_item: + key, value = env_item.split("=", 1) + env_dict[key] = value + 
environment = env_dict + + # Get other fields - handle volumes (can be list of strings or dicts) + volumes_raw = resolved_service.get("volumes", []) + volumes = [] + for vol in volumes_raw: + if isinstance(vol, str): + # Already in string format: "/host:/container" + volumes.append(vol) + elif isinstance(vol, dict): + # Long format: {"type": "volume", "source": "name", "target": "/path"} + # or {"type": "bind", "source": "/host/path", "target": "/container/path"} + vol_type = vol.get("type", "volume") + source = vol.get("source", "") + target = vol.get("target", "") + read_only = vol.get("read_only", False) + + if source and target: + vol_str = f"{source}:{target}" + if read_only: + vol_str += ":ro" + volumes.append(vol_str) + elif target: + # Anonymous volume + volumes.append(target) + + command = resolved_service.get("command") + if isinstance(command, list): + command = " ".join(command) + + restart_policy = resolved_service.get("restart", "unless-stopped") + + # Handle networks (can be list or dict) + networks = resolved_service.get("networks", {}) + if isinstance(networks, list): + network = networks[0] if networks else None + elif isinstance(networks, dict): + # Dict format: {"infra-network": null} - get first key + network = list(networks.keys())[0] if networks else None + else: + network = None + + # Create ResolvedServiceDefinition + resolved = ResolvedServiceDefinition( + service_id=service_id, + name=service.service_name, + image=image, + ports=ports, + environment=environment, + volumes=volumes, + command=command, + restart_policy=restart_policy, + network=network, + compose_file=str(compose_path), + compose_service_name=service.service_name, + description=service.description, + namespace=service.namespace, + requires=service.requires # Direct attribute on DiscoveredService + ) + + logger.info( + f"Resolved service {service_id}: image={image}, " + f"ports={ports}, env_vars={len(environment)}" + ) + + return resolved + + except subprocess.TimeoutExpired: 
+ raise ValueError("docker-compose config timed out") + except Exception as e: + import traceback + logger.error(f"Failed to resolve service {service_id}: {e}") + logger.error(f"Exception type: {type(e).__name__}") + logger.error(f"Traceback: {traceback.format_exc()}") + raise ValueError(f"Service resolution failed: {e}") + # ========================================================================= # Service Definition CRUD # ========================================================================= @@ -198,38 +439,38 @@ async def delete_service(self, service_id: str) -> bool: async def deploy_service( self, service_id: str, - unode_hostname: str + unode_hostname: str, + namespace: Optional[str] = None ) -> Deployment: - """Deploy a service to a u-node.""" - # Get service definition from deployment definitions (MongoDB) - service = await self.get_service(service_id) - - # If not in MongoDB, try compose registry and create ServiceDefinition - if not service: - compose_registry = get_compose_registry() - discovered = compose_registry.get_service(service_id) - if discovered: - # Convert DiscoveredService to ServiceDefinition - service = ServiceDefinition( - service_id=discovered.service_id, - name=discovered.service_name, - description=discovered.description or "", - image=discovered.image or "", - ports={}, # Ports are handled by compose file - environment={}, # Environment is handled by compose file - ) - logger.info(f"Using compose registry service: {service_id}") - else: - raise ValueError(f"Service not found: {service_id}") + """ + Deploy a service to any deployment target (Docker unode or K8s cluster). + + Uses centralized resolution via resolve_service_for_deployment() to ensure + all variables are resolved before sending to target. 
+ + Args: + service_id: Service to deploy + unode_hostname: Target unode hostname (Docker host or K8s cluster ID) + namespace: Optional K8s namespace (only used for K8s deployments) + """ + # Resolve service with all variables substituted + try: + resolved_service = await self.resolve_service_for_deployment(service_id) + except ValueError as e: + logger.error(f"Failed to resolve service {service_id}: {e}") + raise ValueError(f"Service resolution failed: {e}") # Get u-node - unode = await self.unodes_collection.find_one({"hostname": unode_hostname}) - if not unode: + unode_dict = await self.unodes_collection.find_one({"hostname": unode_hostname}) + if not unode_dict: raise ValueError(f"U-node not found: {unode_hostname}") - if unode.get("status") != "online": + if unode_dict.get("status") != "online": raise ValueError(f"U-node is not online: {unode_hostname}") + # Convert to UNode model + unode = UNode(**unode_dict) + # Check if already deployed existing = await self.deployments_collection.find_one({ "service_id": service_id, @@ -243,20 +484,78 @@ async def deploy_service( f"Service {service_id} already deployed to {unode_hostname}" ) - # Create deployment record + # Create deployment ID deployment_id = str(uuid.uuid4())[:8] - container_name = f"{service.service_id}-{deployment_id}" - now = datetime.now(timezone.utc) - deployment = Deployment( - id=deployment_id, - service_id=service_id, - unode_hostname=unode_hostname, - status=DeploymentStatus.DEPLOYING, - container_name=container_name, - created_at=now, - deployed_config=service.model_dump(), - ) + # Get appropriate deployment backend + k8s_manager = None + from src.models.unode import UNodeType + if unode.type == UNodeType.KUBERNETES: + from src.services.kubernetes_manager import get_kubernetes_manager + k8s_manager = await get_kubernetes_manager() + + backend = get_deployment_backend(unode, k8s_manager) + + # Deploy using the backend + try: + deployment = await backend.deploy( + unode=unode, + 
resolved_service=resolved_service, + deployment_id=deployment_id, + namespace=namespace + ) + + # For Docker deployments, update tailscale serve routes + if deployment.backend_type == "docker": + is_local = _is_local_deployment(unode_hostname) + if is_local and deployment.exposed_port: + _update_tailscale_serve_route( + service_id, + deployment.container_name, + deployment.exposed_port, + add=True + ) + + # Set access URL using tailscale helper + if deployment.exposed_port: + from src.services.tailscale_serve import get_service_access_url + access_url = get_service_access_url( + unode_hostname, + deployment.exposed_port, + is_local=is_local + ) + if access_url: + if is_local: + # Local services have path-based routing + deployment.access_url = f"{access_url}/{service_id}" + else: + deployment.access_url = access_url + + deployment.deployed_at = datetime.now(timezone.utc) + + except Exception as e: + logger.error(f"Deploy failed for {service_id} on {unode_hostname}: {e}") + # Create failed deployment record + deployment = Deployment( + id=deployment_id, + service_id=service_id, + unode_hostname=unode_hostname, + status=DeploymentStatus.FAILED, + created_at=datetime.now(timezone.utc), + deployed_config=resolved_service.model_dump(), + error=str(e), + backend_type=unode.type.value + ) + + # Upsert failed deployment record + await self.deployments_collection.replace_one( + {"service_id": service_id, "unode_hostname": unode_hostname}, + deployment.model_dump(), + upsert=True + ) + + # Re-raise exception so API returns proper error status + raise # Upsert deployment (replace if exists) await self.deployments_collection.replace_one( @@ -319,20 +618,31 @@ async def stop_deployment(self, deployment_id: str) -> Deployment: if not deployment: raise ValueError(f"Deployment not found: {deployment_id}") - unode = await self.unodes_collection.find_one({ + unode_dict = await self.unodes_collection.find_one({ "hostname": deployment.unode_hostname }) - if not unode: + if not 
unode_dict: raise ValueError(f"U-node not found: {deployment.unode_hostname}") + unode = UNode(**unode_dict) + + # Get appropriate backend + k8s_manager = None + from src.models.unode import UNodeType + if unode.type == UNodeType.KUBERNETES: + from src.services.kubernetes_manager import get_kubernetes_manager + k8s_manager = await get_kubernetes_manager() + + backend = get_deployment_backend(unode, k8s_manager) + try: - result = await self._send_stop_command(unode, deployment.container_name) + success = await backend.stop(unode, deployment) - if result.get("success"): + if success: deployment.status = DeploymentStatus.STOPPED deployment.stopped_at = datetime.now(timezone.utc) else: - deployment.error = result.get("error", "Stop failed") + deployment.error = "Stop failed" except Exception as e: logger.error(f"Stop failed for deployment {deployment_id}: {e}") @@ -376,23 +686,33 @@ async def restart_deployment(self, deployment_id: str) -> Deployment: return deployment async def remove_deployment(self, deployment_id: str) -> bool: - """Remove a deployment (stop container and delete record).""" + """Remove a deployment (stop and delete).""" deployment = await self.get_deployment(deployment_id) if not deployment: return False - unode = await self.unodes_collection.find_one({ + unode_dict = await self.unodes_collection.find_one({ "hostname": deployment.unode_hostname }) - if unode: + if unode_dict: + unode = UNode(**unode_dict) + + # Get appropriate backend + k8s_manager = None + if unode.type.value == "kubernetes": + from src.services.kubernetes_manager import get_kubernetes_manager + k8s_manager = await get_kubernetes_manager() + + backend = get_deployment_backend(unode, k8s_manager) + try: - await self._send_remove_command(unode, deployment.container_name) + await backend.remove(unode, deployment) except Exception as e: - logger.warning(f"Failed to remove container on node: {e}") + logger.warning(f"Failed to remove deployment on node: {e}") - # Remove tailscale serve 
route for local deployments - if _is_local_deployment(deployment.unode_hostname): + # Remove tailscale serve route for local Docker deployments + if deployment.backend_type == "docker" and _is_local_deployment(deployment.unode_hostname): _update_tailscale_serve_route(deployment.service_id, "", 0, add=False) await self.deployments_collection.delete_one({"id": deployment_id}) @@ -434,24 +754,29 @@ async def get_deployment_logs( if not deployment: return None - unode = await self.unodes_collection.find_one({ + unode_dict = await self.unodes_collection.find_one({ "hostname": deployment.unode_hostname }) - if not unode: + if not unode_dict: return None + unode = UNode(**unode_dict) + + # Get appropriate backend + k8s_manager = None + from src.models.unode import UNodeType + if unode.type == UNodeType.KUBERNETES: + from src.services.kubernetes_manager import get_kubernetes_manager + k8s_manager = await get_kubernetes_manager() + + backend = get_deployment_backend(unode, k8s_manager) + try: - result = await self._send_logs_command( - unode, - deployment.container_name, - tail - ) - if result.get("success"): - return result.get("logs", "") + logs = await backend.get_logs(unode, deployment, tail) + return "\n".join(logs) except Exception as e: logger.error(f"Failed to get logs for {deployment_id}: {e}") - - return None + return None # ========================================================================= # Node Communication @@ -487,28 +812,50 @@ async def _get_node_secret(self, unode: Dict[str, Any]) -> str: async def _send_deploy_command( self, unode: Dict[str, Any], - service: ServiceDefinition, + resolved_service: ResolvedServiceDefinition, container_name: str ) -> Dict[str, Any]: - """Send deploy command to a u-node.""" + """ + Send deploy command to a u-node. 
+ + Args: + unode: U-node document + resolved_service: Fully resolved service definition (all vars substituted) + container_name: Container name to use + + Returns: + Deploy result from remote node + """ session = await self._get_session() url = await self._get_node_url(unode) secret = await self._get_node_secret(unode) + # Convert port list ["3002:3000", "8080"] to Docker API format {3000: 3002, 8080: 8080} + ports_dict = {} + for port_str in resolved_service.ports: + if ":" in port_str: + host_port, container_port = port_str.split(":", 1) + ports_dict[f"{container_port}/tcp"] = int(host_port) + else: + ports_dict[f"{port_str}/tcp"] = int(port_str) + payload = { "container_name": container_name, - "image": service.image, - "ports": service.ports, - "environment": service.environment, - "volumes": service.volumes, - "restart_policy": service.restart_policy, - "network": service.network, - "command": service.command, + "image": resolved_service.image, + "ports": ports_dict, + "environment": resolved_service.environment, + "volumes": resolved_service.volumes, + "restart_policy": resolved_service.restart_policy, + "network": resolved_service.network, + "command": resolved_service.command, } headers = {"X-Node-Secret": secret} - logger.info(f"Deploying {container_name} to {unode.get('hostname')} (secret length={len(secret)})") + logger.info( + f"Deploying {container_name} to {unode.get('hostname')}: " + f"image={resolved_service.image}, ports={resolved_service.ports}" + ) async with session.post( f"{url}/deploy", diff --git a/ushadow/backend/src/services/instance_manager.py b/ushadow/backend/src/services/instance_manager.py index 7d905e7f..5915954f 100644 --- a/ushadow/backend/src/services/instance_manager.py +++ b/ushadow/backend/src/services/instance_manager.py @@ -477,15 +477,18 @@ async def deploy_instance(self, instance_id: str) -> tuple[bool, str]: instance.status = InstanceStatus.DEPLOYING self._save_instances() + # Use service_name (not template_id) for 
orchestrator calls + service_name = compose_service.service_name + try: - result = await orchestrator.start_service(instance.template_id) + result = await orchestrator.start_service(service_name, instance_id=instance_id) if result.success: # Get the service status to find access URL - status_info = await orchestrator.get_service_status(instance.template_id) + status_info = await orchestrator.get_service_status(service_name) access_url = None if status_info and status_info.get("status") == "running": # Try to get the access URL from docker details - details = await orchestrator.get_docker_details(instance.template_id) + details = await orchestrator.get_docker_details(service_name) if details and details.ports: # Use first mapped port for port_info in details.ports: @@ -543,11 +546,14 @@ async def undeploy_instance(self, instance_id: str) -> tuple[bool, str]: from src.services.service_orchestrator import get_service_orchestrator orchestrator = get_service_orchestrator() + # Use service_name (not template_id) for orchestrator calls + service_name = compose_service.service_name + try: - result = await orchestrator.stop_service(instance.template_id) + result = await orchestrator.stop_service(service_name, instance_id=instance_id) if result.success: self.update_instance_status(instance_id, InstanceStatus.STOPPED) - return True, f"Service {instance.template_id} stopped" + return True, f"Service {service_name} stopped" else: return False, result.message except Exception as e: diff --git a/ushadow/backend/src/services/kubernetes_manager.py b/ushadow/backend/src/services/kubernetes_manager.py index 5e82d6ce..903b4938 100644 --- a/ushadow/backend/src/services/kubernetes_manager.py +++ b/ushadow/backend/src/services/kubernetes_manager.py @@ -7,8 +7,9 @@ import os import secrets import tempfile +import yaml from pathlib import Path -from typing import Dict, List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple from cryptography.fernet import Fernet, 
InvalidToken @@ -170,6 +171,141 @@ async def get_cluster(self, cluster_id: str) -> Optional[KubernetesCluster]: return KubernetesCluster(**doc) return None + async def list_nodes(self, cluster_id: str) -> List["KubernetesNode"]: + """ + List all nodes in a Kubernetes cluster. + + Args: + cluster_id: The cluster ID + + Returns: + List of KubernetesNode objects + + Raises: + ValueError: If cluster not found or API call fails + """ + from src.models.kubernetes import KubernetesNode + + # Verify cluster exists + cluster = await self.get_cluster(cluster_id) + if not cluster: + raise ValueError(f"Cluster not found: {cluster_id}") + + try: + core_api, _ = self._get_kube_client(cluster_id) + + # List all nodes + nodes_list = core_api.list_node() + + k8s_nodes = [] + for node in nodes_list.items: + # Extract node status + conditions = node.status.conditions or [] + ready = False + status = "Unknown" + for condition in conditions: + if condition.type == "Ready": + ready = condition.status == "True" + status = "Ready" if ready else "NotReady" + break + + # Extract node roles from labels + labels = node.metadata.labels or {} + roles = [] + if "node-role.kubernetes.io/control-plane" in labels or "node-role.kubernetes.io/master" in labels: + roles.append("control-plane") + if not roles or "node-role.kubernetes.io/worker" in labels: + roles.append("worker") + + # Extract addresses + addresses = node.status.addresses or [] + internal_ip = None + external_ip = None + hostname = None + for addr in addresses: + if addr.type == "InternalIP": + internal_ip = addr.address + elif addr.type == "ExternalIP": + external_ip = addr.address + elif addr.type == "Hostname": + hostname = addr.address + + # Extract node info + node_info = node.status.node_info + kubelet_version = node_info.kubelet_version if node_info else None + os_image = node_info.os_image if node_info else None + kernel_version = node_info.kernel_version if node_info else None + container_runtime = 
node_info.container_runtime_version if node_info else None + + # Extract capacity and allocatable + capacity = node.status.capacity or {} + allocatable = node.status.allocatable or {} + + # Extract taints + taints = [] + for taint in (node.spec.taints or []): + taints.append({ + "key": taint.key, + "value": taint.value or "", + "effect": taint.effect + }) + + k8s_node = KubernetesNode( + name=node.metadata.name, + cluster_id=cluster_id, + status=status, + ready=ready, + kubelet_version=kubelet_version, + os_image=os_image, + kernel_version=kernel_version, + container_runtime=container_runtime, + cpu_capacity=capacity.get("cpu"), + memory_capacity=capacity.get("memory"), + cpu_allocatable=allocatable.get("cpu"), + memory_allocatable=allocatable.get("memory"), + roles=roles, + internal_ip=internal_ip, + external_ip=external_ip, + hostname=hostname, + taints=taints, + labels=labels + ) + k8s_nodes.append(k8s_node) + + logger.info(f"Listed {len(k8s_nodes)} nodes for cluster {cluster_id}") + return k8s_nodes + + except Exception as e: + logger.error(f"Error listing nodes for cluster {cluster_id}: {e}") + raise ValueError(f"Failed to list nodes: {e}") + + async def update_cluster_infra_scan( + self, + cluster_id: str, + namespace: str, + scan_results: Dict[str, Dict] + ) -> bool: + """ + Update cached infrastructure scan results for a cluster namespace. 
+ + Args: + cluster_id: The cluster ID + namespace: The namespace that was scanned + scan_results: The scan results from scan_cluster_for_infra_services + + Returns: + True if update was successful + """ + try: + result = await self.clusters_collection.update_one( + {"cluster_id": cluster_id}, + {"$set": {f"infra_scans.{namespace}": scan_results}} + ) + return result.modified_count > 0 or result.matched_count > 0 + except Exception as e: + logger.error(f"Error updating cluster infra scan: {e}") + return False + async def remove_cluster(self, cluster_id: str) -> bool: """Remove a cluster and its kubeconfig.""" # Delete encrypted kubeconfig file @@ -222,6 +358,44 @@ def _get_kube_client(self, cluster_id: str) -> Tuple[client.CoreV1Api, client.Ap else: raise FileNotFoundError(f"Kubeconfig not found for cluster {cluster_id}") + def _resolve_image_variables(self, image: str, environment: Dict[str, str]) -> str: + """ + Resolve environment variables in Docker image names. + + Handles Docker Compose variable syntax like: + - ${VAR} + - ${VAR:-default} + - ${VAR-default} + + Args: + image: Image name possibly containing variables + environment: Environment variables to use for resolution + + Returns: + Resolved image name + """ + import re + + def replace_var(match): + var_expr = match.group(1) + + # Handle ${VAR:-default} or ${VAR-default} + if ":-" in var_expr: + var_name, default = var_expr.split(":-", 1) + elif "-" in var_expr and not var_expr.startswith("-"): + var_name, default = var_expr.split("-", 1) + else: + var_name = var_expr + default = "" + + # Look up in environment, fall back to OS env, then default + value = environment.get(var_name) or os.environ.get(var_name) or default + return value + + # Replace ${...} patterns + resolved = re.sub(r'\$\{([^}]+)\}', replace_var, image) + return resolved + async def compile_service_to_k8s( self, service_def: Dict, @@ -245,19 +419,48 @@ async def compile_service_to_k8s( image = service_def.get("image", "") environment = 
service_def.get("environment", {}) ports = service_def.get("ports", []) + volumes = service_def.get("volumes", []) + + # Resolve any environment variables in the image name + image = self._resolve_image_variables(image, environment) + + # Sanitize service_id for use as Kubernetes label value + # K8s labels can only contain alphanumeric, '-', '_', '.' + # Replace colons and other invalid chars with hyphens + safe_service_id = service_id.replace(":", "-").replace("/", "-") # Use provided spec or defaults spec = k8s_spec or KubernetesDeploymentSpec() # Parse ports (Docker format: "8080:8080" or "8080") - container_port = 8000 # default + # Support multiple ports with unique names + container_ports = [] if ports: - port_str = ports[0] - if ":" in port_str: - _, container_port = port_str.split(":") - container_port = int(container_port) - else: - container_port = int(port_str) + for idx, port in enumerate(ports): + port_str = str(port) + # Skip if port is None or empty + if not port_str or port_str.lower() in ('none', ''): + continue + + try: + if ":" in port_str: + _, port_num = port_str.split(":", 1) + port_num = int(port_num) + else: + port_num = int(port_str) + + # Generate unique port name (http, http-2, http-3, etc.) 
+ port_name = "http" if idx == 0 else f"http-{idx + 1}" + container_ports.append({ + "name": port_name, + "port": port_num + }) + except (ValueError, TypeError) as e: + logger.warning(f"Invalid port format '{port_str}', skipping: {e}") + + # Default to port 8000 if no valid ports found + if not container_ports: + container_ports = [{"name": "http", "port": 8000}] # Separate sensitive from non-sensitive env vars # Pattern: anything with SECRET, KEY, PASSWORD, TOKEN in name @@ -273,10 +476,81 @@ async def compile_service_to_k8s( else: config_data[key] = str(value) + # Parse volumes - separate config files from persistent volumes + # Volumes can be: + # - Bind mounts: "/host/path:/container/path:ro" or "${VAR}/path:/container/path" + # - Named volumes: "volume_name:/container/path" + config_files = {} # Files to include in ConfigMap + volume_mounts = [] # Volume mounts for container + k8s_volumes = [] # Volume definitions for pod + + for volume_def in volumes: + if isinstance(volume_def, str): + # Parse "source:dest" or "source:dest:options" format + parts = volume_def.split(":") + if len(parts) >= 2: + source, dest = parts[0], parts[1] + is_readonly = len(parts) > 2 and 'ro' in parts[2] + + # Resolve environment variables in source path + import os + source = os.path.expandvars(source) + + # Check if source is a file (for config files) or directory (for data volumes) + from pathlib import Path + source_path = Path(source) + + if source_path.is_file(): + # Config file - add to ConfigMap + try: + with open(source_path, 'r') as f: + file_content = f.read() + file_name = source_path.name + config_files[file_name] = file_content + logger.info(f"Adding config file {file_name} to ConfigMap (source: {source})") + + # Add volume mount for this file + volume_mounts.append({ + "name": "config-files", + "mountPath": dest, + "subPath": file_name, + "readOnly": is_readonly + }) + except Exception as e: + logger.warning(f"Could not read config file {source}: {e}") + + elif 
source_path.is_dir() or not source_path.exists(): + # Directory or named volume - create emptyDir for now + # TODO: Could support PVCs for persistent storage in future + volume_name = source_path.name.replace("_", "-") if source_path.name else "data" + + # Only add volume definition if not already added + if not any(v.get("name") == volume_name for v in k8s_volumes): + k8s_volumes.append({ + "name": volume_name, + "emptyDir": {} + }) + + volume_mounts.append({ + "name": volume_name, + "mountPath": dest, + "readOnly": is_readonly + }) + logger.info(f"Adding emptyDir volume {volume_name} mounted at {dest}") + + # Add config-files volume if we have config files + if config_files: + k8s_volumes.append({ + "name": "config-files", + "configMap": { + "name": f"{name}-files" + } + }) + # Generate manifests matching friend-lite pattern labels = { "app.kubernetes.io/name": name, - "app.kubernetes.io/instance": service_id, + "app.kubernetes.io/instance": safe_service_id, "app.kubernetes.io/managed-by": "ushadow", **spec.labels } @@ -310,6 +584,19 @@ async def compile_service_to_k8s( "data": secret_data } + # ConfigMap for config files (separate from env var ConfigMap) + if config_files: + manifests["config_files_map"] = { + "apiVersion": "v1", + "kind": "ConfigMap", + "metadata": { + "name": f"{name}-files", + "namespace": namespace, + "labels": labels + }, + "data": config_files + } + # Deployment manifests["deployment"] = { "apiVersion": "apps/v1", @@ -324,14 +611,14 @@ async def compile_service_to_k8s( "selector": { "matchLabels": { "app.kubernetes.io/name": name, - "app.kubernetes.io/instance": service_id + "app.kubernetes.io/instance": safe_service_id } }, "template": { "metadata": { "labels": { "app.kubernetes.io/name": name, - "app.kubernetes.io/instance": service_id + "app.kubernetes.io/instance": safe_service_id }, "annotations": spec.annotations }, @@ -340,45 +627,68 @@ async def compile_service_to_k8s( "name": name, "image": image, "imagePullPolicy": "Always", - 
"ports": [{ - "name": "http", - "containerPort": container_port, - "protocol": "TCP" - }], + "ports": [ + { + "name": port_info["name"], + "containerPort": port_info["port"], + "protocol": "TCP" + } + for port_info in container_ports + ], # Use envFrom like friend-lite pattern **({"envFrom": [ *([{"configMapRef": {"name": f"{name}-config"}}] if config_data else []), *([{"secretRef": {"name": f"{name}-secrets"}}] if secret_data else []) ]} if (config_data or secret_data) else {}), - "livenessProbe": { - "httpGet": { - "path": "/health", - "port": "http" - }, - "initialDelaySeconds": 30, - "periodSeconds": 60 - }, - "readinessProbe": { - "httpGet": { - "path": "/health", - "port": "http" + # Only add health probes if health_check_path is provided + **({ + "livenessProbe": { + "httpGet": { + "path": spec.health_check_path or "/health", + "port": "http" + }, + "initialDelaySeconds": 30, + "periodSeconds": 60, + "failureThreshold": 3 }, - "initialDelaySeconds": 10, - "periodSeconds": 30 - }, + "readinessProbe": { + "httpGet": { + "path": spec.health_check_path or "/health", + "port": "http" + }, + "initialDelaySeconds": 10, + "periodSeconds": 30, + "failureThreshold": 3 + } + } if spec.health_check_path is not None else {}), **({"resources": spec.resources} if spec.resources else { "resources": { "limits": {"cpu": "500m", "memory": "512Mi"}, "requests": {"cpu": "100m", "memory": "128Mi"} } - }) - }] + }), + # Add volumeMounts if any volumes are defined + **({"volumeMounts": volume_mounts} if volume_mounts else {}) + }], + # Add volumes to pod spec if any are defined + **({"volumes": k8s_volumes} if k8s_volumes else {}) } } } } # Service (NodePort by default, matching friend-lite pattern) + # Create service ports for each container port + service_ports = [ + { + "port": port_info["port"], + "targetPort": port_info["name"], + "protocol": "TCP", + "name": port_info["name"] + } + for port_info in container_ports + ] + manifests["service"] = { "apiVersion": "v1", "kind": 
"Service", @@ -389,15 +699,10 @@ async def compile_service_to_k8s( }, "spec": { "type": spec.service_type, - "ports": [{ - "port": container_port, - "targetPort": "http", - "protocol": "TCP", - "name": "http" - }], + "ports": service_ports, "selector": { "app.kubernetes.io/name": name, - "app.kubernetes.io/instance": service_id + "app.kubernetes.io/instance": safe_service_id } } } @@ -433,7 +738,7 @@ async def compile_service_to_k8s( "backend": { "service": { "name": name, - "port": {"number": container_port} + "port": {"number": container_ports[0]["port"]} } } }] @@ -444,6 +749,405 @@ async def compile_service_to_k8s( return manifests + async def ensure_namespace_exists( + self, + cluster_id: str, + namespace: str + ) -> bool: + """ + Ensure a namespace exists in the cluster, creating it if necessary. + + Returns True if namespace exists or was created successfully. + """ + try: + core_api, _ = self._get_kube_client(cluster_id) + + # Check if namespace exists + try: + core_api.read_namespace(name=namespace) + logger.info(f"Namespace {namespace} already exists") + return True + except ApiException as e: + if e.status == 404: + # Namespace doesn't exist, create it + namespace_manifest = { + "apiVersion": "v1", + "kind": "Namespace", + "metadata": { + "name": namespace, + "labels": { + "app.kubernetes.io/managed-by": "ushadow" + } + } + } + core_api.create_namespace(body=namespace_manifest) + logger.info(f"Created namespace {namespace}") + return True + else: + # Some other error occurred + raise + + except Exception as e: + logger.error(f"Error ensuring namespace exists: {e}") + raise + + async def scan_cluster_for_infra_services( + self, + cluster_id: str, + namespace: str = "ushadow" + ) -> Dict[str, Dict]: + """ + Scan a Kubernetes cluster for running infrastructure services. + + Looks for common infra services: mongo, redis, postgres, qdrant, neo4j. + Scans across multiple common namespaces (default, kube-system, infra, target namespace). 
+ Returns dict mapping service_name -> {found: bool, endpoints: [], type: str, namespace: str} + """ + try: + core_api, _ = self._get_kube_client(cluster_id) + + # Infrastructure services we look for + infra_services = { + "mongo": {"names": ["mongo", "mongodb"], "port": 27017}, + "redis": {"names": ["redis"], "port": 6379}, + "postgres": {"names": ["postgres", "postgresql"], "port": 5432}, + "qdrant": {"names": ["qdrant"], "port": 6333}, + "neo4j": {"names": ["neo4j"], "port": 7687}, + } + + # Common namespaces where infrastructure might be deployed + # Check target namespace first, then common infra namespaces + namespaces_to_scan = [namespace, "default", "kube-system", "infra", "infrastructure"] + # Remove duplicates while preserving order + namespaces_to_scan = list(dict.fromkeys(namespaces_to_scan)) + + results = {} + + # Scan each namespace for infrastructure services + for ns in namespaces_to_scan: + try: + services = core_api.list_namespaced_service(namespace=ns) + except ApiException: + # Namespace might not exist, skip it + continue + + # Check each infra service + for infra_name, config in infra_services.items(): + # Skip if we already found this service in a previous namespace + if results.get(infra_name, {}).get("found"): + continue + + for svc in services.items: + svc_name_lower = svc.metadata.name.lower() + + # Match by name patterns + if any(pattern in svc_name_lower for pattern in config["names"]): + # Found it! 
Extract connection info + endpoints = [] + ports = [p.port for p in svc.spec.ports] + + # Build connection strings using the actual namespace where service was found + for port in ports: + if svc.spec.type == "ClusterIP": + endpoints.append(f"{svc.metadata.name}.{ns}.svc.cluster.local:{port}") + elif svc.spec.type == "NodePort": + endpoints.append(f":{port}") + elif svc.spec.type == "LoadBalancer": + if svc.status.load_balancer.ingress: + lb_ip = svc.status.load_balancer.ingress[0].ip + endpoints.append(f"{lb_ip}:{port}") + + results[infra_name] = { + "found": True, + "endpoints": endpoints, + "type": infra_name, + "namespace": ns, # Track which namespace it was found in + "default_port": config["port"] + } + break # Found this service, move to next infra type + + # Fill in "not found" for any missing services + for infra_name in infra_services.keys(): + if infra_name not in results: + results[infra_name] = { + "found": False, + "endpoints": [], + "type": infra_name, + "namespace": None, + "default_port": infra_services[infra_name]["port"] + } + + return results + + except Exception as e: + logger.error(f"Error scanning cluster for infra services: {e}") + return {name: {"found": False, "endpoints": [], "type": name, "error": str(e)} + for name in ["mongo", "redis", "postgres", "qdrant", "neo4j"]} + + async def list_pods(self, cluster_id: str, namespace: str = "ushadow") -> List[Dict[str, Any]]: + """ + List all pods in a namespace. + + Returns list of pods with name, status, restarts, age, and labels. 
+ """ + try: + core_api, _ = self._get_kube_client(cluster_id) + pods_list = core_api.list_namespaced_pod(namespace=namespace) + + pods = [] + for pod in pods_list.items: + # Get pod status + status = "Unknown" + restarts = 0 + if pod.status.container_statuses: + # Count total restarts + restarts = sum(cs.restart_count for cs in pod.status.container_statuses) + + # Determine overall status + if pod.status.phase == "Running": + all_ready = all(cs.ready for cs in pod.status.container_statuses) + status = "Running" if all_ready else "Starting" + else: + status = pod.status.phase + + # Check for specific error states + for cs in pod.status.container_statuses: + if cs.state.waiting: + status = cs.state.waiting.reason or "Waiting" + elif cs.state.terminated: + status = cs.state.terminated.reason or "Terminated" + else: + status = pod.status.phase or "Pending" + + # Calculate age + age = "" + if pod.metadata.creation_timestamp: + from datetime import datetime, timezone + age_seconds = (datetime.now(timezone.utc) - pod.metadata.creation_timestamp).total_seconds() + if age_seconds < 60: + age = f"{int(age_seconds)}s" + elif age_seconds < 3600: + age = f"{int(age_seconds / 60)}m" + elif age_seconds < 86400: + age = f"{int(age_seconds / 3600)}h" + else: + age = f"{int(age_seconds / 86400)}d" + + pods.append({ + "name": pod.metadata.name, + "namespace": pod.metadata.namespace, + "status": status, + "restarts": restarts, + "age": age, + "labels": pod.metadata.labels or {}, + "node": pod.spec.node_name or "N/A" + }) + + return pods + + except ApiException as e: + logger.error(f"Failed to list pods: {e}") + raise Exception(f"Failed to list pods: {e.reason}") + except Exception as e: + logger.error(f"Error listing pods: {e}") + raise + + async def get_pod_logs( + self, + cluster_id: str, + pod_name: str, + namespace: str = "ushadow", + previous: bool = False, + tail_lines: int = 100 + ) -> str: + """ + Get logs from a pod. 
+ + Args: + cluster_id: The cluster ID + pod_name: Name of the pod + namespace: Kubernetes namespace + previous: Get logs from previous (crashed) container + tail_lines: Number of lines to return from end of logs + + Returns: + Pod logs as a string + """ + try: + core_api, _ = self._get_kube_client(cluster_id) + + logs = core_api.read_namespaced_pod_log( + name=pod_name, + namespace=namespace, + previous=previous, + tail_lines=tail_lines + ) + + return logs + + except ApiException as e: + if e.status == 404: + raise Exception(f"Pod '{pod_name}' not found in namespace '{namespace}'") + elif e.status == 400: + # Pod might not have started yet or logs not available + raise Exception(f"Logs not available for pod '{pod_name}': {e.reason}") + else: + logger.error(f"Failed to get pod logs: {e}") + raise Exception(f"Failed to get pod logs: {e.reason}") + except Exception as e: + logger.error(f"Error getting pod logs: {e}") + raise + + async def get_pod_events( + self, + cluster_id: str, + pod_name: str, + namespace: str = "ushadow" + ) -> List[Dict[str, Any]]: + """ + Get events for a specific pod. + + This is useful for debugging why a pod won't start. + Shows events like ImagePullBackOff, CrashLoopBackOff, etc. + + Returns: + List of events with type, reason, message, and timestamp + """ + try: + core_api, _ = self._get_kube_client(cluster_id) + + # Get events for this pod + field_selector = f"involvedObject.name={pod_name},involvedObject.namespace={namespace}" + events_list = core_api.list_namespaced_event( + namespace=namespace, + field_selector=field_selector + ) + + events = [] + for event in events_list.items: + events.append({ + "type": event.type, # Normal, Warning, Error + "reason": event.reason, # BackOff, Failed, Pulled, etc. 
+ "message": event.message, + "count": event.count, + "first_timestamp": event.first_timestamp.isoformat() if event.first_timestamp else None, + "last_timestamp": event.last_timestamp.isoformat() if event.last_timestamp else None, + }) + + # Sort by last timestamp, most recent first + events.sort(key=lambda e: e["last_timestamp"] or "", reverse=True) + + return events + + except ApiException as e: + logger.error(f"Failed to get pod events: {e}") + raise Exception(f"Failed to get pod events: {e.reason}") + except Exception as e: + logger.error(f"Error getting pod events: {e}") + raise + + async def get_or_create_envmap( + self, + cluster_id: str, + namespace: str, + service_name: str, + env_vars: Dict[str, str] + ) -> Tuple[str, str]: + """ + Get or create ConfigMap and Secret for service environment variables. + + Separates sensitive (keys, passwords) from non-sensitive values. + Returns tuple of (configmap_name, secret_name). + """ + try: + # Ensure namespace exists first + await self.ensure_namespace_exists(cluster_id, namespace) + + core_api, _ = self._get_kube_client(cluster_id) + + # Separate sensitive from non-sensitive + sensitive_patterns = ('SECRET', 'KEY', 'PASSWORD', 'TOKEN', 'PASS', 'CREDENTIALS') + config_data = {} + secret_data = {} + + for key, value in env_vars.items(): + if any(pattern in key.upper() for pattern in sensitive_patterns): + # Base64 encode for Secret + import base64 + secret_data[key] = base64.b64encode(str(value).encode()).decode() + else: + config_data[key] = str(value) + + configmap_name = f"{service_name}-config" + secret_name = f"{service_name}-secrets" + + # Create or update ConfigMap + if config_data: + configmap = { + "apiVersion": "v1", + "kind": "ConfigMap", + "metadata": { + "name": configmap_name, + "namespace": namespace, + "labels": { + "app.kubernetes.io/name": service_name, + "app.kubernetes.io/managed-by": "ushadow" + } + }, + "data": config_data + } + + try: + 
core_api.create_namespaced_config_map(namespace=namespace, body=configmap) + logger.info(f"Created ConfigMap {configmap_name}") + except ApiException as e: + if e.status == 409: # Already exists + core_api.patch_namespaced_config_map( + name=configmap_name, + namespace=namespace, + body=configmap + ) + logger.info(f"Updated ConfigMap {configmap_name}") + else: + raise + + # Create or update Secret + if secret_data: + secret = { + "apiVersion": "v1", + "kind": "Secret", + "type": "Opaque", + "metadata": { + "name": secret_name, + "namespace": namespace, + "labels": { + "app.kubernetes.io/name": service_name, + "app.kubernetes.io/managed-by": "ushadow" + } + }, + "data": secret_data + } + + try: + core_api.create_namespaced_secret(namespace=namespace, body=secret) + logger.info(f"Created Secret {secret_name}") + except ApiException as e: + if e.status == 409: + core_api.patch_namespaced_secret( + name=secret_name, + namespace=namespace, + body=secret + ) + logger.info(f"Updated Secret {secret_name}") + else: + raise + + return configmap_name if config_data else "", secret_name if secret_data else "" + + except Exception as e: + logger.error(f"Error creating envmap: {e}") + raise + async def deploy_to_kubernetes( self, cluster_id: str, @@ -457,9 +1161,30 @@ async def deploy_to_kubernetes( Compiles the service definition to K8s manifests and applies them. 
""" try: + service_name = service_def.get("name", "unknown") + logger.info(f"Starting deployment of {service_name} to cluster {cluster_id}, namespace {namespace}") + logger.info(f"Service definition: image={service_def.get('image')}, ports={service_def.get('ports')}") + + # Ensure namespace exists first + await self.ensure_namespace_exists(cluster_id, namespace) + # Compile manifests manifests = await self.compile_service_to_k8s(service_def, namespace, k8s_spec) + # Log generated manifests for debugging + logger.info(f"Generated manifests for {service_name}:") + for manifest_type, manifest in manifests.items(): + logger.debug(f"{manifest_type}:\n{yaml.dump(manifest, default_flow_style=False)}") + + # Optionally save manifests to disk for debugging + manifest_dir = Path("/tmp/k8s-manifests") / cluster_id / namespace + manifest_dir.mkdir(parents=True, exist_ok=True) + for manifest_type, manifest in manifests.items(): + manifest_file = manifest_dir / f"{service_name}-{manifest_type}.yaml" + with open(manifest_file, 'w') as f: + yaml.dump(manifest, f, default_flow_style=False) + logger.info(f"Manifests saved to {manifest_dir}") + # Get API clients core_api, apps_api = self._get_kube_client(cluster_id) networking_api = client.NetworkingV1Api() @@ -500,6 +1225,26 @@ async def deploy_to_kubernetes( else: raise + # Apply ConfigMap for config files + if "config_files_map" in manifests: + try: + core_api.create_namespaced_config_map( + namespace=namespace, + body=manifests["config_files_map"] + ) + logger.info(f"Created ConfigMap for config files") + except ApiException as e: + if e.status == 409: # Already exists, update it + name = manifests["config_files_map"]["metadata"]["name"] + core_api.patch_namespaced_config_map( + name=name, + namespace=namespace, + body=manifests["config_files_map"] + ) + logger.info(f"Updated ConfigMap for config files") + else: + raise + # Apply Deployment deployment_name = manifests["deployment"]["metadata"]["name"] try: @@ -558,13 +1303,31 @@ 
async def deploy_to_kubernetes( else: raise - return True, f"Deployed to {namespace}/{deployment_name}" + # Log success and return details + deployed_resources = [] + if "config_map" in manifests: + deployed_resources.append(f"ConfigMap/{manifests['config_map']['metadata']['name']}") + if "secret" in manifests: + deployed_resources.append(f"Secret/{manifests['secret']['metadata']['name']}") + if "config_files_map" in manifests: + deployed_resources.append(f"ConfigMap/{manifests['config_files_map']['metadata']['name']}") + deployed_resources.append(f"Deployment/{deployment_name}") + deployed_resources.append(f"Service/{service_name}") + if "ingress" in manifests: + deployed_resources.append(f"Ingress/{manifests['ingress']['metadata']['name']}") + + result_msg = f"Successfully deployed {deployment_name} to {namespace}. Resources: {', '.join(deployed_resources)}" + logger.info(result_msg) + return True, result_msg except ApiException as e: logger.error(f"K8s API error during deployment: {e}") + logger.error(f"Response body: {e.body if hasattr(e, 'body') else 'N/A'}") return False, f"Deployment failed: {e.reason}" except Exception as e: logger.error(f"Error deploying to K8s: {e}") + import traceback + logger.error(traceback.format_exc()) return False, str(e) diff --git a/ushadow/frontend/src/components/DeployToK8sModal.tsx b/ushadow/frontend/src/components/DeployToK8sModal.tsx new file mode 100644 index 00000000..03ae1404 --- /dev/null +++ b/ushadow/frontend/src/components/DeployToK8sModal.tsx @@ -0,0 +1,499 @@ +import { useState, useEffect } from 'react' +import { CheckCircle, Loader, ChevronRight } from 'lucide-react' +import Modal from './Modal' +import EnvVarEditor from './EnvVarEditor' +import { kubernetesApi, servicesApi, instancesApi, KubernetesCluster, EnvVarInfo, EnvVarConfig } from '../services/api' + +interface DeployToK8sModalProps { + isOpen: boolean + onClose: () => void + cluster?: KubernetesCluster // Optional - if not provided, show cluster selection + 
availableClusters?: KubernetesCluster[] // For cluster selection + infraServices?: Record + preselectedServiceId?: string // If provided, skip service selection step +} + +interface ServiceOption { + service_id: string + service_name: string + display_name: string + description?: string + image?: string + requires?: string[] +} + +export default function DeployToK8sModal({ isOpen, onClose, cluster: initialCluster, availableClusters = [], infraServices: initialInfraServices = {}, preselectedServiceId }: DeployToK8sModalProps) { + const [step, setStep] = useState<'cluster' | 'select' | 'configure' | 'deploying' | 'complete'>( + !initialCluster && availableClusters.length > 1 ? 'cluster' : + preselectedServiceId ? 'configure' : 'select' + ) + const [selectedCluster, setSelectedCluster] = useState(initialCluster || null) + const [infraServices, setInfraServices] = useState>(initialInfraServices) + + // Sync infra services from prop to state when it changes + useEffect(() => { + if (isOpen) { + console.log('🚀 DeployToK8sModal infra services updated:', initialInfraServices) + setInfraServices(initialInfraServices) + } + }, [isOpen, initialInfraServices]) + const [services, setServices] = useState([]) + const [selectedService, setSelectedService] = useState(null) + const [namespace, setNamespace] = useState('ushadow') + const [envVars, setEnvVars] = useState([]) + const [envConfigs, setEnvConfigs] = useState>({}) + const [error, setError] = useState(null) + const [deploymentResult, setDeploymentResult] = useState(null) + + useEffect(() => { + if (isOpen) { + loadServices() + } + }, [isOpen]) + + // Auto-select service if preselected + useEffect(() => { + if (preselectedServiceId && services.length > 0 && !selectedService) { + const service = services.find(s => s.service_id === preselectedServiceId) + if (service) { + handleSelectService(service) + } + } + }, [preselectedServiceId, services, selectedService]) + + const loadServices = async () => { + try { + const response 
= await kubernetesApi.getAvailableServices() + setServices(response.data.services) + } catch (err: any) { + console.error('Failed to load services:', err) + setError('Failed to load services') + } + } + + const formatError = (err: any): string => { + if (typeof err === 'string') return err + + // Handle Pydantic validation errors (array of error objects) + if (Array.isArray(err)) { + return err.map(e => e.msg || JSON.stringify(e)).join(', ') + } + + // Handle error response from API + const detail = err.response?.data?.detail + if (detail) { + if (typeof detail === 'string') return detail + if (Array.isArray(detail)) { + return detail.map(e => e.msg || JSON.stringify(e)).join(', ') + } + return JSON.stringify(detail) + } + + return err.message || 'An error occurred' + } + + const handleSelectService = async (service: ServiceOption) => { + setSelectedService(service) + setError(null) + + try { + console.log('📦 Selected service:', service.service_id) + console.log('🔧 Current infraServices state:', infraServices) + + // Load environment variable schema with suggestions from settingsStore + const envResponse = await servicesApi.getEnvConfig(service.service_id) + const envData = envResponse.data + + // Initialize env vars and configs (EXACT same pattern as ServicesPage) + const allEnvVars = [...envData.required_env_vars, ...envData.optional_env_vars] + setEnvVars(allEnvVars) + + // Use API response data directly (backend already did smart mapping) + // ONLY override with infrastructure detection for K8s-specific values + const initialConfigs: Record = {} + allEnvVars.forEach(envVar => { + const infraValue = getInfraValueForEnvVar(envVar.name, infraServices) + console.log(`🔍 Checking env var ${envVar.name}:`, { infraValue, infraServices }) + + if (infraValue) { + // Override with infrastructure value for K8s cluster-specific endpoints + // Mark as locked so user can't edit + initialConfigs[envVar.name] = { + name: envVar.name, + source: 'new_setting', + value: 
infraValue, + new_setting_path: `api_keys.${envVar.name.toLowerCase()}`, + setting_path: undefined, + locked: true, + provider_name: 'K8s Infrastructure' + } + } else { + // Use data from API response (backend already mapped to settings) + initialConfigs[envVar.name] = { + name: envVar.name, + source: (envVar.source as 'setting' | 'new_setting' | 'literal' | 'default') || 'default', + setting_path: envVar.setting_path, + value: envVar.value, + new_setting_path: undefined + } + } + }) + + setEnvConfigs(initialConfigs) + setStep('configure') + } catch (err: any) { + console.error('Failed to load env config:', err) + setError(`Failed to load environment configuration: ${formatError(err)}`) + } + } + + // Helper to get infrastructure endpoint for common env vars + const getInfraValueForEnvVar = (envVarName: string, infraServices: Record): string | null => { + const upperName = envVarName.toUpperCase() + + // MongoDB - be specific about which env vars get which values + if (upperName === 'MONGODB_DATABASE') { + return 'ushadow' // Just the database name + } + if (upperName.includes('MONGO') || upperName.includes('MONGODB')) { + if (infraServices.mongo?.found && infraServices.mongo.endpoints.length > 0) { + return `mongodb://${infraServices.mongo.endpoints[0]}/ushadow` + } + } + + // Redis + if (upperName.includes('REDIS')) { + if (infraServices.redis?.found && infraServices.redis.endpoints.length > 0) { + return `redis://${infraServices.redis.endpoints[0]}/0` + } + } + + // Postgres + if (upperName.includes('POSTGRES') || upperName.includes('DATABASE_URL')) { + if (infraServices.postgres?.found && infraServices.postgres.endpoints.length > 0) { + return `postgresql://ushadow:ushadow@${infraServices.postgres.endpoints[0]}/ushadow` + } + } + + // Qdrant - be specific about port vs base URL + if (upperName === 'QDRANT_PORT') { + return '6333' // Just the port number + } + if (upperName.includes('QDRANT')) { + if (infraServices.qdrant?.found && 
infraServices.qdrant.endpoints.length > 0) { + return `http://${infraServices.qdrant.endpoints[0]}` + } + } + + return null + } + + const handleDeploy = async () => { + if (!selectedService || !selectedCluster) return + + try { + setStep('deploying') + setError(null) + + // Generate instance ID for this deployment target (only lowercase, numbers, hyphens) + const sanitizedServiceId = selectedService.service_id.replace(/[^a-z0-9-]/g, '-') + const instanceId = `${sanitizedServiceId}-k8s-${selectedCluster.cluster_id}-${namespace}` + const deploymentTarget = `k8s://${selectedCluster.cluster_id}/${namespace}` + + // Convert env configs to instance config format + const configValues: Record = {} + Object.entries(envConfigs).forEach(([name, config]) => { + if (config.source === 'setting' && config.setting_path) { + configValues[name] = { _from_setting: config.setting_path } + } else if (config.source === 'new_setting' && config.value) { + configValues[name] = config.value + // Also save to settings if new_setting_path is specified + if (config.new_setting_path) { + configValues[`_save_${name}`] = config.new_setting_path + } + } else if (config.value) { + configValues[name] = config.value + } + }) + + // Step 1: Create or update instance with this configuration + try { + // Try to get existing instance + await instancesApi.getInstance(instanceId) + // Instance exists - update it + await instancesApi.updateInstance(instanceId, { + name: `${selectedService.display_name} (${selectedCluster.name}/${namespace})`, + description: `K8s deployment to ${selectedCluster.name} in ${namespace} namespace`, + config: configValues, + deployment_target: deploymentTarget + }) + } catch { + // Instance doesn't exist - create it + await instancesApi.createInstance({ + id: instanceId, + template_id: selectedService.service_id, + name: `${selectedService.display_name} (${selectedCluster.name}/${namespace})`, + description: `K8s deployment to ${selectedCluster.name} in ${namespace} namespace`, + 
config: configValues, + deployment_target: deploymentTarget + }) + } + + // Step 2: Deploy the instance to K8s + // The backend will use centralized resolution which reads from the instance config + const deployResponse = await kubernetesApi.deployService( + selectedCluster.cluster_id, + { + service_id: selectedService.service_id, + namespace: namespace, + instance_id: instanceId + } + ) + + setDeploymentResult(deployResponse.data.message) + setStep('complete') + } catch (err: any) { + console.error('Deployment failed:', err) + setError(`Deployment failed: ${formatError(err)}`) + setStep('configure') + } + } + + const handleEnvConfigChange = (envVarName: string, updates: Partial) => { + setEnvConfigs(prev => ({ + ...prev, + [envVarName]: { ...(prev[envVarName] || { name: envVarName }), ...updates } as EnvVarConfig + })) + } + + const handleClusterSelection = async (cluster: KubernetesCluster) => { + setSelectedCluster(cluster) + setError(null) + + // Use cached infrastructure scan results from cluster + // Infrastructure is cluster-wide, so use any available namespace scan + let infraData = {} + if (cluster.infra_scans && Object.keys(cluster.infra_scans).length > 0) { + // Use the first available scan (infra is typically accessible cluster-wide) + const firstNamespace = Object.keys(cluster.infra_scans)[0] + infraData = cluster.infra_scans[firstNamespace] || {} + console.log(`🔍 Using cached K8s infrastructure from namespace '${firstNamespace}':`, infraData) + } else { + console.warn('No cached infrastructure scan found for cluster') + } + setInfraServices(infraData) + + setStep('select') + } + + const renderClusterSelection = () => ( +
+

+ Select a Kubernetes cluster for deployment +

+ +
+ {availableClusters.map((cluster) => ( + + ))} +
+
+ ) + + const renderSelectService = () => ( +
+

+ Select a service to deploy to {selectedCluster?.name} in namespace {namespace} +

+ +
+ {services.map((service) => ( + + ))} +
+
+ ) + + const renderConfigureEnvVars = () => ( +
+
+

+ {selectedService?.display_name} +

+

+ Configure deployment settings for this service +

+
+ + {error && ( +
+ {error} +
+ )} + + {/* Namespace input */} +
+ + setNamespace(e.target.value)} + placeholder="default" + className="w-full px-3 py-2 rounded border border-neutral-300 dark:border-neutral-600 bg-white dark:bg-neutral-700 text-neutral-900 dark:text-neutral-100" + data-testid="deploy-namespace-input" + /> +

+ Kubernetes namespace where the service will be deployed +

+
+ + {/* Environment Variables */} +
+ +
+ {envVars.map((envVar) => { + const config = envConfigs[envVar.name] || { + name: envVar.name, + source: 'default', + value: undefined, + setting_path: undefined, + new_setting_path: undefined + } + + return ( + handleEnvConfigChange(envVar.name, updates)} + /> + ) + })} +
+
+ +
+ + +
+
+ ) + + const renderDeploying = () => ( +
+ +

+ Deploying {selectedService?.display_name}... +

+

+ Creating ConfigMap, Secret, Deployment, and Service +

+
+ ) + + const renderComplete = () => ( +
+ +

+ Deployment Successful! +

+

+ {deploymentResult} +

+
+

+ Check deployment status: +

+ + kubectl get pods -n {namespace} + +
+ +
+ ) + + return ( + + {step === 'cluster' && renderClusterSelection()} + {step === 'select' && renderSelectService()} + {step === 'configure' && renderConfigureEnvVars()} + {step === 'deploying' && renderDeploying()} + {step === 'complete' && renderComplete()} + + ) +} diff --git a/ushadow/frontend/src/components/EnvVarEditor.tsx b/ushadow/frontend/src/components/EnvVarEditor.tsx new file mode 100644 index 00000000..f98e87ce --- /dev/null +++ b/ushadow/frontend/src/components/EnvVarEditor.tsx @@ -0,0 +1,194 @@ +import { useState } from 'react' +import { Pencil, Lock } from 'lucide-react' +import { EnvVarInfo, EnvVarConfig } from '../services/api' + +interface EnvVarEditorProps { + envVar: EnvVarInfo + config: EnvVarConfig + onChange: (updates: Partial) => void +} + +/** + * Shared component for editing environment variable configuration. + * + * Supports: + * - Mapping to existing settings (via dropdown of suggestions) + * - Manual value entry (auto-creates new settings) + * - Default values + * - Secret masking + * - Locked fields (provider-supplied values) + * + * Used by: + * - ServicesPage (for Docker service configuration) + * - DeployToK8sModal (for K8s deployment configuration) + * - InstancesPage (for instance configuration) + */ +export default function EnvVarEditor({ envVar, config, onChange }: EnvVarEditorProps) { + const [editing, setEditing] = useState(false) + const [showMapping, setShowMapping] = useState(config.source === 'setting' && !config.locked) + + const isSecret = envVar.name.includes('KEY') || envVar.name.includes('SECRET') || envVar.name.includes('PASSWORD') + const hasDefault = envVar.has_default && envVar.default_value + const isUsingDefault = config.source === 'default' || (!config.value && !config.setting_path && hasDefault) + const isLocked = config.locked || false + + // Generate setting path from env var name for auto-creating settings + const autoSettingPath = () => { + const name = envVar.name.toLowerCase() + if 
(name.includes('api_key') || name.includes('key') || name.includes('secret') || name.includes('token')) { + return `api_keys.${name}` + } + return `settings.${name}` + } + + // Handle value input - auto-create setting + const handleValueChange = (value: string) => { + if (value) { + onChange({ source: 'new_setting', new_setting_path: autoSettingPath(), value, setting_path: undefined }) + } else { + onChange({ source: 'default', value: undefined, setting_path: undefined, new_setting_path: undefined }) + } + } + + // Check if there's a matching suggestion for auto-mapping + const matchingSuggestion = envVar.suggestions.find((s) => { + const envName = envVar.name.toLowerCase() + const pathParts = s.path.toLowerCase().split('.') + const lastPart = pathParts[pathParts.length - 1] + return envName.includes(lastPart) || lastPart.includes(envName.replace(/_/g, '')) + }) + + // Auto-map if matching and not yet configured + const effectiveSettingPath = config.setting_path || (matchingSuggestion?.has_value ? matchingSuggestion.path : undefined) + + // Locked fields - provided by wired providers or infrastructure + if (isLocked) { + const displayValue = config.value || '' + const isMaskedSecret = isSecret && displayValue.length > 0 + const maskedValue = isMaskedSecret ? '•'.repeat(Math.min(displayValue.length, 20)) : displayValue + + return ( +
+ {/* Label */} + + {envVar.name} + {envVar.is_required && *} + + + {/* Padlock icon */} +
+ +
+ + {/* Value display */} +
+ + {maskedValue} + + + {config.provider_name || 'infrastructure'} + +
+
+ ) + } + + return ( +
+ {/* Label */} + + {envVar.name} + {envVar.is_required && *} + + + {/* Map button - LEFT of input */} + + + {/* Input area */} +
+ {showMapping ? ( + // Mapping mode - styled dropdown + + ) : hasDefault && isUsingDefault && !editing ? ( + // Default value display + <> + + {envVar.default_value} + + default + + + ) : ( + // Value input + handleValueChange(e.target.value)} + placeholder="enter value" + className="flex-1 px-2 py-1.5 text-xs rounded border-0 bg-neutral-700/50 text-neutral-200 focus:outline-none focus:ring-1 focus:ring-primary-500 placeholder:text-neutral-500" + autoFocus={editing} + onBlur={() => { + if (!config.value && hasDefault) setEditing(false) + }} + data-testid={`value-input-${envVar.name}`} + /> + )} +
+
+ ) +} diff --git a/ushadow/frontend/src/components/chronicle/ChronicleConversations.tsx b/ushadow/frontend/src/components/chronicle/ChronicleConversations.tsx index 8cdefbf4..f7216423 100644 --- a/ushadow/frontend/src/components/chronicle/ChronicleConversations.tsx +++ b/ushadow/frontend/src/components/chronicle/ChronicleConversations.tsx @@ -572,7 +572,7 @@ export default function ChronicleConversations({ onAuthRequired }: ChronicleConv {openDropdown === (conversation.conversation_id || conversation.audio_uuid) && ( -
+
+ {showDeployMenu && menuPosition && createPortal( +
+ + + +
, + document.body + )} + + )} {onEdit && (
- {getStatusIcon(node.status)} +
+ {node.role === 'leader' && ( + + )} + {getStatusIcon(node.status)} +
{/* Node IP */} @@ -679,7 +706,20 @@ export default function ClusterPage() { return (
{ + if (deployment.status === 'failed') { + e.stopPropagation() + handleViewLogs(deployment.id) + } + }} + title={deployment.status === 'failed' ? 'Click to view logs and error details' : ''} >
{isNodeOffline && ( @@ -702,6 +742,7 @@ export default function ClusterPage() { href={deployment.access_url} target="_blank" rel="noopener noreferrer" + onClick={(e) => e.stopPropagation()} className="p-1 text-neutral-500 hover:text-primary-600 rounded" title={`Open ${deployment.access_url}`} > @@ -710,7 +751,10 @@ export default function ClusterPage() { )} {deployment.status === 'running' ? ( ) : deployment.status === 'stopped' ? ( ) : null}
+ {availableToAdd.length > 0 && (
)} - {/* Templates - Compose */} + {/* Templates - Compose Services with Instances */} {composeTemplates.length > 0 && (

@@ -1499,16 +1979,151 @@ export default function InstancesPage() { Compose Services

-
- {composeTemplates.map((template) => ( - toggleTemplate(template.id)} - onCreate={() => openCreateInstanceModal(template)} - /> - ))} +
+ {composeTemplates.map((template) => { + const templateInstances = instancesByTemplate[template.id] || [] + const isExpanded = expandedTemplates.has(template.id) + + return ( +
+ {/* Service Template Row */} +
toggleTemplate(template.id)} + data-testid={`service-template-${template.id}`} + > + + + + +
+

+ {template.display_name || template.id} +

+ {template.description && ( +

+ {template.description} +

+ )} +
+ + {templateInstances.length > 0 && ( + + {templateInstances.length} {templateInstances.length === 1 ? 'instance' : 'instances'} + + )} + + +
+ + {/* Service Instances (indented) */} + {isExpanded && templateInstances.length > 0 && ( +
+ {templateInstances.map((instance) => { + const details = instanceDetails[instance.id] + const isRunning = details?.status === 'running' + + return ( +
+ + +
+
+ + {instance.name} + + {details && getStatusBadge(details.status)} +
+
+ {instance.deployment_target || 'local'} + {details?.outputs?.access_url && ( + <> + + e.stopPropagation()} + > + {details.outputs.access_url} + + + )} +
+
+ +
+ {isRunning && ( + + )} + + + + +
+
+ ) + })} +
+ )} +
+ ) + })}
@@ -1592,6 +2207,7 @@ export default function InstancesPage() { onEditConsumer={handleEditConsumer} onStartConsumer={handleStartConsumer} onStopConsumer={handleStopConsumer} + onDeployConsumer={handleDeployConsumer} />
@@ -1746,24 +2362,66 @@ export default function InstancesPage() {

- {/* Config fields */} - {editingProvider.template.config_schema && editingProvider.template.config_schema.length > 0 && ( -
- {editingProvider.template.config_schema.map((field: any) => ( - - setEditConfig((prev) => ({ - ...prev, - [field.key]: value, - })) - } - /> - ))} -
- )} + {/* Config fields - providers use config_schema */} + {editingProvider.template.source === 'provider' && + editingProvider.template.config_schema && + editingProvider.template.config_schema.length > 0 && ( +
+ {editingProvider.template.config_schema.map((field: any) => ( + + setEditConfig((prev) => ({ + ...prev, + [field.key]: value, + })) + } + /> + ))} +
+ )} + + {/* Environment variables - compose services (both templates and instances) */} + {editingProvider.template.source === 'compose' && + (loadingEnvConfig ? ( +
+ + Loading configuration... +
+ ) : envVars.length > 0 ? ( +
+ +
+ {envVars.map((envVar) => { + const config = envConfigs[envVar.name] || { + name: envVar.name, + source: 'default', + value: undefined, + setting_path: undefined, + new_setting_path: undefined, + } + + return ( + { + setEnvConfigs((prev) => ({ + ...prev, + [envVar.name]: { ...prev[envVar.name], ...updates } as EnvVarConfig, + })) + }} + /> + ) + })} +
+
+ ) : null)} {/* Footer */}
@@ -1847,6 +2505,117 @@ export default function InstancesPage() { onConfirm={confirmDeleteInstance} onCancel={() => setConfirmDialog({ isOpen: false, instanceId: null })} /> + + {/* Deploy to Kubernetes Modal */} + {deployModalState.isOpen && deployModalState.targetType === 'kubernetes' && ( + setDeployModalState({ isOpen: false, serviceId: null, targetType: null })} + cluster={deployModalState.selectedClusterId ? kubernetesClusters.find((c) => c.cluster_id === deployModalState.selectedClusterId) : undefined} + availableClusters={kubernetesClusters} + infraServices={deployModalState.infraServices || {}} + preselectedServiceId={deployModalState.serviceId || undefined} + /> + )} + + {/* Service Catalog Modal */} + setShowCatalog(false)} + title="Service Catalog" + maxWidth="2xl" + testId="catalog-modal" + > + {catalogLoading ? ( +
+ +
+ ) : ( +
+ {catalogServices.map(service => ( +
+
+
+
+ +
+
+

+ {service.service_name.split('-').map((w: string) => w.charAt(0).toUpperCase() + w.slice(1)).join(' ')} +

+

+ {service.description || service.image || 'Docker service'} +

+
+
+ +
+ {/* Capabilities */} + {service.requires && service.requires.length > 0 && ( +
+ {service.requires.map((cap: string) => ( + + {cap} + + ))} +
+ )} + + {/* Install/Uninstall Button */} + {service.installed ? ( + + ) : ( + + )} +
+
+
+ ))} + + {catalogServices.length === 0 && ( +
+ +

No services found in the catalog

+
+ )} +
+ )} +
+ + {/* TODO: Add Deploy to uNode modals for local and remote */}
) } @@ -2132,3 +2901,179 @@ function TemplateCard({ template, isExpanded, onToggle, onCreate, onRemove }: Te ) } + +// ============================================================================= +// Env Var Row Component (matches ServicesPage env var editor) +// ============================================================================= + +interface EnvVarRowProps { + envVar: EnvVarInfo + config: EnvVarConfig + onChange: (updates: Partial) => void +} + +function EnvVarRow({ envVar, config, onChange }: EnvVarRowProps) { + const [editing, setEditing] = useState(false) + const [showMapping, setShowMapping] = useState(config.source === 'setting' && !config.locked) + + const isSecret = envVar.name.includes('KEY') || envVar.name.includes('SECRET') || envVar.name.includes('PASSWORD') + const hasDefault = envVar.has_default && envVar.default_value + const isUsingDefault = config.source === 'default' || (!config.value && !config.setting_path && hasDefault) + const isLocked = config.locked || false + + // Generate setting path from env var name for auto-creating settings + const autoSettingPath = () => { + const name = envVar.name.toLowerCase() + if (name.includes('api_key') || name.includes('key') || name.includes('secret') || name.includes('token')) { + return `api_keys.${name}` + } + return `settings.${name}` + } + + // Handle value input - auto-create setting + const handleValueChange = (value: string) => { + if (value) { + onChange({ source: 'new_setting', new_setting_path: autoSettingPath(), value, setting_path: undefined }) + } else { + onChange({ source: 'default', value: undefined, setting_path: undefined, new_setting_path: undefined }) + } + } + + // Check if there's a matching suggestion for auto-mapping + const matchingSuggestion = envVar.suggestions.find((s) => { + const envName = envVar.name.toLowerCase() + const pathParts = s.path.toLowerCase().split('.') + const lastPart = pathParts[pathParts.length - 1] + return envName.includes(lastPart) || 
lastPart.includes(envVar.name.replace(/_/g, '')) + }) + + // Auto-map if matching and not yet configured + const effectiveSettingPath = config.setting_path || (matchingSuggestion?.has_value ? matchingSuggestion.path : undefined) + + // Locked fields - provided by wired providers or infrastructure + if (isLocked) { + const displayValue = config.value || '' + const isMaskedSecret = isSecret && displayValue.length > 0 + const maskedValue = isMaskedSecret ? '•'.repeat(Math.min(displayValue.length, 20)) : displayValue + + return ( +
+ {/* Label */} + + {envVar.name} + {envVar.is_required && *} + + + {/* Padlock icon */} +
+ +
+ + {/* Value display */} +
+ + {maskedValue} + + + {config.provider_name || 'infrastructure'} + +
+
+ ) + } + + return ( +
+ {/* Label */} + + {envVar.name} + {envVar.is_required && *} + + + {/* Map button - LEFT of input */} + + + {/* Input area */} +
+ {showMapping ? ( + // Mapping mode - styled dropdown + + ) : hasDefault && isUsingDefault && !editing ? ( + // Default value display + <> + + {envVar.default_value} + + default + + + ) : ( + // Value input + handleValueChange(e.target.value)} + placeholder="enter value" + className="flex-1 px-2 py-1.5 text-xs rounded border-0 bg-neutral-700/50 text-neutral-200 focus:outline-none focus:ring-1 focus:ring-primary-500 placeholder:text-neutral-500" + autoFocus={editing} + onBlur={() => { + if (!config.value && hasDefault) setEditing(false) + }} + data-testid={`value-input-${envVar.name}`} + /> + )} +
+
+ ) +} diff --git a/ushadow/frontend/src/pages/KubernetesClustersPage.tsx b/ushadow/frontend/src/pages/KubernetesClustersPage.tsx index 1c20f1ea..5abdc6b9 100644 --- a/ushadow/frontend/src/pages/KubernetesClustersPage.tsx +++ b/ushadow/frontend/src/pages/KubernetesClustersPage.tsx @@ -1,7 +1,24 @@ import { useState, useEffect } from 'react' import { createPortal } from 'react-dom' -import { Server, Plus, RefreshCw, Trash2, CheckCircle, XCircle, Clock, Upload, X } from 'lucide-react' +import { Server, Plus, RefreshCw, Trash2, CheckCircle, XCircle, Clock, Upload, X, Search, Database, AlertCircle, Rocket } from 'lucide-react' import { kubernetesApi, KubernetesCluster } from '../services/api' +import Modal from '../components/Modal' +import ConfirmDialog from '../components/ConfirmDialog' +import DeployToK8sModal from '../components/DeployToK8sModal' + +interface InfraService { + found: boolean + endpoints: string[] + type: string + default_port: number + error?: string +} + +interface InfraScanResults { + cluster_id: string + namespace: string + infra_services: Record +} export default function KubernetesClustersPage() { const [clusters, setClusters] = useState([]) @@ -9,6 +26,17 @@ export default function KubernetesClustersPage() { const [showAddModal, setShowAddModal] = useState(false) const [adding, setAdding] = useState(false) + // Infrastructure scanning + const [scanningClusterId, setScanningClusterId] = useState(null) + const [scanResults, setScanResults] = useState>({}) + const [showScanResults, setShowScanResults] = useState(null) + const [showNamespaceSelector, setShowNamespaceSelector] = useState(null) + const [scanNamespace, setScanNamespace] = useState('') + + // Deployment + const [showDeployModal, setShowDeployModal] = useState(false) + const [selectedClusterForDeploy, setSelectedClusterForDeploy] = useState(null) + // Form state const [clusterName, setClusterName] = useState('') const [kubeconfig, setKubeconfig] = useState('') @@ -24,6 +52,21 @@ export 
default function KubernetesClustersPage() { setError(null) const response = await kubernetesApi.listClusters() setClusters(response.data) + + // Load cached scan results from clusters + const cachedScans: Record = {} + response.data.forEach((cluster: KubernetesCluster) => { + if (cluster.infra_scans) { + Object.entries(cluster.infra_scans).forEach(([namespace, scanData]) => { + cachedScans[`${cluster.cluster_id}-${namespace}`] = { + cluster_id: cluster.cluster_id, + namespace: namespace, + infra_services: scanData as Record + } + }) + } + }) + setScanResults(cachedScans) } catch (err: any) { console.error('Error loading clusters:', err) setError(err.response?.data?.detail || 'Failed to load clusters') @@ -68,12 +111,55 @@ export default function KubernetesClustersPage() { try { await kubernetesApi.removeCluster(clusterId) + // Remove scan results for this cluster + const newScanResults = { ...scanResults } + delete newScanResults[clusterId] + setScanResults(newScanResults) loadClusters() } catch (err: any) { alert(`Failed to remove cluster: ${err.response?.data?.detail || err.message}`) } } + const handleScanInfrastructure = async (clusterId: string, namespace?: string) => { + try { + setScanningClusterId(clusterId) + setError(null) + + const cluster = clusters.find(c => c.cluster_id === clusterId) + const namespaceToScan = namespace || cluster?.namespace || 'default' + + const response = await kubernetesApi.scanInfraServices(clusterId, namespaceToScan) + + // Store scan results + setScanResults(prev => ({ + ...prev, + [`${clusterId}-${namespaceToScan}`]: response.data + })) + + // Show results modal + setShowScanResults(`${clusterId}-${namespaceToScan}`) + setShowNamespaceSelector(null) + setScanNamespace('') + } catch (err: any) { + console.error('Error scanning infrastructure:', err) + alert(`Failed to scan infrastructure: ${err.response?.data?.detail || err.message}`) + } finally { + setScanningClusterId(null) + } + } + + const handleOpenNamespaceSelector = 
(clusterId: string) => { + const cluster = clusters.find(c => c.cluster_id === clusterId) + setScanNamespace(cluster?.namespace || 'ushadow') + setShowNamespaceSelector(clusterId) + } + + const handleOpenDeployModal = (cluster: KubernetesCluster) => { + setSelectedClusterForDeploy(cluster) + setShowDeployModal(true) + } + const handleFileUpload = (e: React.ChangeEvent) => { const file = e.target.files?.[0] if (file) { @@ -105,14 +191,122 @@ export default function KubernetesClustersPage() { } } + const renderInfraScanResults = (clusterId: string) => { + const results = scanResults[clusterId] + if (!results) return null + + const foundServices = Object.entries(results.infra_services).filter(([_, service]) => service.found) + const notFoundServices = Object.entries(results.infra_services).filter(([_, service]) => !service.found) + + return ( + setShowScanResults(null)} + title="Infrastructure Scan Results" + maxWidth="lg" + testId="infra-scan-results-modal" + > +
+
+

+ Scanned namespace: {results.namespace} +

+
+ + {/* Found Services */} + {foundServices.length > 0 && ( +
+

+ + Found Infrastructure ({foundServices.length}) +

+
+ {foundServices.map(([name, service]) => ( +
+
+ + {name} + + + Running + +
+ {service.endpoints.length > 0 && ( +
+

Connection endpoints:

+ {service.endpoints.map((endpoint, idx) => ( + + {endpoint} + + ))} +
+ )} +
+ ))} +
+
+ )} + + {/* Not Found Services */} + {notFoundServices.length > 0 && ( +
+

+ + Not Found ({notFoundServices.length}) +

+
+ {notFoundServices.map(([name, service]) => ( +
+ {name} +

+ Not running in {results.namespace} +

+
+ ))} +
+
+ )} + + {/* Help Text */} +
+

+ Next steps: You can use existing infrastructure services when deploying applications, + or deploy your own infrastructure using the unified deployment UI. +

+
+ + {/* Actions */} +
+ +
+
+
+ ) + } + return ( -
+
{/* Header */}

Kubernetes Clusters

- Manage Kubernetes clusters for service deployment + Configure clusters and scan infrastructure for deployments

+
+
+ ) + })} +
+ )} + + {/* Labels */} + {Object.keys(cluster.labels).length > 0 && ( +
+ {Object.entries(cluster.labels).map(([key, value]) => ( + + {key}: {value} + + ))} +
+ )} + + {/* Actions */} +
+
+
- )} + {scanningClusterId === cluster.cluster_id ? ( + <> + + Scanning... + + ) : ( + <> + + Scan + + )} + + + +
- {/* Actions */} -
- + +
- - ))} + ) + })} )} @@ -235,6 +498,78 @@ export default function KubernetesClustersPage() { )} + {/* Namespace Selector Modal */} + {showNamespaceSelector && ( + setShowNamespaceSelector(null)} + title="Select Namespace to Scan" + maxWidth="md" + testId="namespace-selector-modal" + > +
+

+ Choose which namespace to scan for infrastructure services. +

+ +
+ + setScanNamespace(e.target.value)} + placeholder="e.g., ushadow, default, kube-system" + className="w-full px-4 py-2 rounded-lg border border-neutral-300 dark:border-neutral-600 bg-white dark:bg-neutral-700 text-neutral-900 dark:text-neutral-100" + data-testid="scan-namespace-input" + /> +

+ Common namespaces: ushadow, default, kube-system +

+
+ +
+ + +
+
+
+ )} + + {/* Scan Results Modal */} + {showScanResults && renderInfraScanResults(showScanResults)} + + {/* Deploy to K8s Modal */} + {showDeployModal && selectedClusterForDeploy && ( + { + setShowDeployModal(false) + setSelectedClusterForDeploy(null) + }} + cluster={selectedClusterForDeploy} + infraServices={ + Object.keys(scanResults).find(key => key.startsWith(selectedClusterForDeploy.cluster_id)) + ? scanResults[Object.keys(scanResults).find(key => key.startsWith(selectedClusterForDeploy.cluster_id))!].infra_services + : undefined + } + /> + )} + {/* Add Cluster Modal */} {showAddModal && createPortal(
@@ -291,10 +626,13 @@ export default function KubernetesClustersPage() { type="text" value={namespace} onChange={(e) => setNamespace(e.target.value)} - placeholder="default" + placeholder="ushadow" className="w-full px-4 py-2 rounded-lg border border-neutral-300 dark:border-neutral-600 bg-white dark:bg-neutral-700 text-neutral-900 dark:text-neutral-100" data-testid="namespace-input" /> +

+ Recommended: ushadow +

{/* Kubeconfig Upload */} diff --git a/ushadow/frontend/src/services/api.ts b/ushadow/frontend/src/services/api.ts index 8382766e..4214f304 100644 --- a/ushadow/frontend/src/services/api.ts +++ b/ushadow/frontend/src/services/api.ts @@ -317,6 +317,8 @@ export interface EnvVarConfig { setting_path?: string // For source='setting' - existing setting to map new_setting_path?: string // For source='new_setting' - new setting path to create value?: string // For source='literal' or 'new_setting' + locked?: boolean // For provider-supplied values that cannot be edited + provider_name?: string // Name of the provider supplying this value } export interface EnvVarSuggestion { @@ -541,6 +543,7 @@ export interface KubernetesCluster { node_count?: number namespace: string labels: Record + infra_scans?: Record } export const kubernetesApi = { @@ -552,6 +555,43 @@ export const kubernetesApi = { api.get(`/api/kubernetes/${clusterId}`), removeCluster: (clusterId: string) => api.delete(`/api/kubernetes/${clusterId}`), + + // Service management + getAvailableServices: () => + api.get<{ services: any[] }>('/api/kubernetes/services/available'), + getInfraServices: () => + api.get<{ services: any[] }>('/api/kubernetes/services/infra'), + + // Cluster operations + scanInfraServices: (clusterId: string, namespace: string = 'default') => + api.post<{ cluster_id: string; namespace: string; infra_services: Record }>( + `/api/kubernetes/${clusterId}/scan-infra`, + { namespace } + ), + createEnvmap: (clusterId: string, data: { service_name: string; namespace?: string; env_vars: Record }) => + api.post<{ success: boolean; configmap: string | null; secret: string | null; namespace: string }>( + `/api/kubernetes/${clusterId}/envmap`, + { namespace: 'default', ...data } + ), + deployService: (clusterId: string, data: { service_id: string; namespace?: string; k8s_spec?: any; instance_id?: string }) => + api.post<{ success: boolean; message: string; service_id: string; namespace: string }>( + 
`/api/kubernetes/${clusterId}/deploy`, + { namespace: 'default', ...data } + ), + + // Pod operations + listPods: (clusterId: string, namespace: string = 'ushadow') => + api.get<{ pods: Array<{ name: string; namespace: string; status: string; restarts: number; age: string; labels: Record; node: string }>; namespace: string }>( + `/api/kubernetes/${clusterId}/pods?namespace=${namespace}` + ), + getPodLogs: (clusterId: string, podName: string, namespace: string = 'ushadow', previous: boolean = false, tailLines: number = 100) => + api.get<{ pod_name: string; namespace: string; previous: boolean; logs: string }>( + `/api/kubernetes/${clusterId}/pods/${podName}/logs?namespace=${namespace}&previous=${previous}&tail_lines=${tailLines}` + ), + getPodEvents: (clusterId: string, podName: string, namespace: string = 'ushadow') => + api.get<{ pod_name: string; namespace: string; events: Array<{ type: string; reason: string; message: string; count: number; first_timestamp: string | null; last_timestamp: string | null }> }>( + `/api/kubernetes/${clusterId}/pods/${podName}/events?namespace=${namespace}` + ), } // Service Definition and Deployment types @@ -1061,6 +1101,7 @@ export interface Template { tags: string[] configured: boolean // Whether required config fields are set (for providers) available: boolean // Whether local service is running (for local providers) + installed: boolean // Whether service is installed (for compose services) } /** Instance config values */ From 442886311749f8fd59238a27af5b404b32574042 Mon Sep 17 00:00:00 2001 From: Stu Alexander Date: Fri, 16 Jan 2026 11:26:00 +0000 Subject: [PATCH 02/45] wip with instances --- docs/ENABLE_IPV6.md | 216 ++++++++++++ docs/IPV6_DNS_FIX.md | 280 ++++++++++++++++ docs/KUBELET_SYSCTL_CONFIGURATION.md | 312 ++++++++++++++++++ docs/KUBERNETES_VOLUME_MOUNTING.md | 296 +++++++++++++++++ scripts/k8s-helpers.sh | 267 +++++++++++++++ ushadow/backend/src/models/deployment.py | 1 + .../src/services/deployment_backends.py | 7 
+- .../src/services/deployment_manager.py | 66 +++- .../backend/src/services/instance_manager.py | 167 +++++++--- 9 files changed, 1563 insertions(+), 49 deletions(-) create mode 100644 docs/ENABLE_IPV6.md create mode 100644 docs/IPV6_DNS_FIX.md create mode 100644 docs/KUBELET_SYSCTL_CONFIGURATION.md create mode 100644 docs/KUBERNETES_VOLUME_MOUNTING.md create mode 100755 scripts/k8s-helpers.sh diff --git a/docs/ENABLE_IPV6.md b/docs/ENABLE_IPV6.md new file mode 100644 index 00000000..7b17c3d8 --- /dev/null +++ b/docs/ENABLE_IPV6.md @@ -0,0 +1,216 @@ +# Enable IPv6 on MicroK8s Cluster + +## Problem +The cluster nodes have IPv6 disabled, but your network has full IPv6 connectivity (confirmed from your Mac). This causes applications like Chronicle (using `uv`) to fail when trying to access PyPI over IPv6. + +## Solution +Enable IPv6 on all cluster nodes and restart MicroK8s. + +--- + +## Step 1: Enable IPv6 on Each Node + +Run these commands on **each node** (anubis, babel, ra): + +```bash +# SSH to node +ssh anubis # or babel, or ra + +# Enable IPv6 +sudo sysctl -w net.ipv6.conf.all.disable_ipv6=0 +sudo sysctl -w net.ipv6.conf.default.disable_ipv6=0 +sudo sysctl -w net.ipv6.conf.lo.disable_ipv6=0 + +# Make persistent +echo "net.ipv6.conf.all.disable_ipv6=0" | sudo tee -a /etc/sysctl.conf +echo "net.ipv6.conf.default.disable_ipv6=0" | sudo tee -a /etc/sysctl.conf + +# Wait for IPv6 address (SLAAC from router) +sleep 3 + +# Verify IPv6 is working +ip -6 addr show | grep inet6 | grep -v "scope host" +ping6 -c 2 google.com +``` + +**Expected output:** +- You should see a global IPv6 address (starts with 2xxx:xxxx:...) 
+- Ping to google.com should succeed + +--- + +## Step 2: Restart MicroK8s on Each Node + +After enabling IPv6 on all nodes, restart MicroK8s: + +```bash +# On each node: +ssh anubis +microk8s stop +sleep 3 +microk8s start +exit + +# Repeat for babel and ra +``` + +Or restart all at once (from your Mac): + +```bash +ssh anubis "microk8s stop && sleep 3 && microk8s start" & +ssh babel "microk8s stop && sleep 3 && microk8s start" & +# Wait for both to complete +wait + +# Note: ra appears to be down, might need manual intervention +``` + +--- + +## Step 3: Verify Cluster Status + +```bash +# Check nodes are ready +kubectl get nodes + +# Check pods can use IPv6 +kubectl run test-ipv6 --image=busybox --restart=Never --rm -i -- ping6 -c 2 google.com + +# Should see successful ping responses +``` + +--- + +## Step 4: Remove IPv6 Sysctl from kubernetes_manager.py + +Once IPv6 is working, remove the sysctl code we added (since it's not working anyway): + +**File**: `ushadow/backend/src/services/kubernetes_manager.py` +**Lines**: 675-684 + +Remove: +```python +# Disable IPv6 at pod level to prevent DNS resolution issues +# with tools like uv that don't gracefully fall back to IPv4 +"securityContext": { + "sysctls": [ + { + "name": "net.ipv6.conf.all.disable_ipv6", + "value": "1" + } + ] +} +``` + +Replace with just: +```python +# Pod runs with default security context +``` + +--- + +## Step 5: Update Calico IPv6 Configuration + +Check if Calico has IPv6 enabled: + +```bash +kubectl get ippool -o yaml | grep cidr +``` + +If you only see IPv4 CIDRs, you may need to enable Calico IPv6: + +```bash +# Create IPv6 IP pool +cat </dev/null; then + echo "Adding to /etc/sysctl.conf for persistence..." 
+ nsenter -t 1 -m -u -n -i -- sh -c 'echo "net.ipv6.conf.all.disable_ipv6=1" >> /etc/sysctl.conf' + nsenter -t 1 -m -u -n -i -- sh -c 'echo "net.ipv6.conf.default.disable_ipv6=1" >> /etc/sysctl.conf' + fi + + echo "Done" + securityContext: + privileged: true + containers: + - name: pause + image: gcr.io/google_containers/pause:3.1 + tolerations: + - effect: NoSchedule + operator: Exists + - key: CriticalAddonsOnly + operator: Exists + - effect: NoExecute + operator: Exists +``` + +**Apply the DaemonSet:** + +```bash +kubectl apply -f /tmp/disable-ipv6-daemonset.yaml +``` + +## How This Works + +1. **DaemonSet runs on ALL nodes** - One pod per node (anubis, babel, ra) +2. **Privileged access** - Uses `nsenter` to break into host namespace +3. **Kernel-level disable** - Sets `net.ipv6.conf.all.disable_ipv6=1` on the host +4. **Persistent across reboots** - Writes settings to `/etc/sysctl.conf` + +This is a **cluster-wide fix** that affects: +- All nodes in the cluster +- All pods on those nodes +- All network connections from pods +- Persists after node reboots + +## Verification + +### Check DaemonSet is Running on All Nodes + +```bash +kubectl get daemonset -n kube-system disable-ipv6 +``` + +Expected output: +``` +NAME DESIRED CURRENT READY UP-TO-DATE AVAILABLE +disable-ipv6 3 3 3 3 3 +``` + +### Verify IPv6 is Disabled on Each Node + +```bash +kubectl get pods -n kube-system -l name=disable-ipv6 +``` + +Check logs from any pod: +```bash +kubectl logs -n kube-system disable-ipv6-XXXXX -c disable-ipv6 +``` + +Should show: +``` +✅ IPv6 disabled on host +Current IPv6 status: +net.ipv6.conf.all.disable_ipv6 = 1 +``` + +### Test Application DNS Resolution + +Create a test pod to verify applications can now resolve DNS: + +```bash +kubectl run dns-test --image=busybox --restart=Never -- sh -c "nslookup github.com && nslookup pypi.org" +kubectl logs dns-test +kubectl delete pod dns-test +``` + +## Previous Attempts (Did Not Fully Solve the Issue) + +1. 
✅ **CoreDNS template to block AAAA queries** - Helped but didn't solve the root issue + - Some applications bypass CoreDNS or use cached DNS + +2. ✅ **gai.conf modification for IPv4 preference** - Helped for some tools + - Modified `/etc/gai.conf` with `precedence ::ffff:0:0/96 100` + - Not respected by all applications (especially Rust-based tools) + +3. ❌ **Calico FELIX_IPV6SUPPORT setting** - Was already set correctly + - `FELIX_IPV6SUPPORT=true` (for IPv6 readiness, not routing) + - Didn't prevent IPv6 address assignment issues + +## Current Status + +**What Works:** +- ✅ DNS queries return IPv4 addresses only (no AAAA records) +- ✅ Busybox/curl/wget can resolve DNS +- ✅ Pods get IPv4-only addresses (no IPv6 from Calico pool) + +**What Still Fails:** +- ❌ Applications with IPv6 enabled (like Rust-based `uv`) still try IPv6 +- ❌ **Cannot disable IPv6 in pod network namespaces** - Kubernetes forbids the required sysctl +- ❌ Chronicle `no-spacy` image **still downloads packages at runtime** despite the name + +## The Real Problem + +The fundamental issue is **NOT just IPv6** - it's that: + +1. **Chronicle images download dependencies at runtime** instead of having them baked in +2. The Rust-based `uv` package manager doesn't gracefully fall back from IPv6 to IPv4 +3. 
Kubernetes security policy prevents using `sysctls` to disable IPv6 in pods + +**Why Host-Level Disable Doesn't Help:** +- Each pod has its own isolated network namespace +- Host sysctls don't propagate to pods +- Test shows: `cat /proc/sys/net/ipv6/conf/all/disable_ipv6` returns `0` (enabled) in pods + +## Solutions Going Forward + +### Option 1: Use a Pre-Built Chronicle Image (RECOMMENDED) +Find or build a Chronicle image that has ALL dependencies pre-installed: +```bash +# Image should include spacy, all Python packages +# No runtime downloads needed +``` + +### Option 2: PyPI Mirror/Proxy +Set up a local PyPI mirror that's reachable via IPv4: +```bash +# devpi, bandersnatch, or pypi-mirror +# Configure UV_INDEX_URL to point to local mirror +``` + +### Option 3: Force IPv4 in Application +Try environment variables to force IPv4 (limited success): +```yaml +env: +- name: FORCE_IPV4 + value: "1" +# May not work with all Rust networking stacks +``` + +## Impact of Current Fixes + +**Partial Success:** +- Standard tools (curl, wget, nslookup) work fine +- DNS resolution returns IPv4 only +- Host networking improved + +**Still Broken:** +- Rust-based tools like `uv` that don't fall back to IPv4 +- Any application that tries IPv6 connections despite DNS returning IPv4 + +## Maintenance + +- **DaemonSet is permanent** - Runs continuously to ensure IPv6 stays disabled +- **Automatic on new nodes** - Any new node added to cluster will automatically get IPv6 disabled +- **Survives node reboots** - Settings are written to `/etc/sysctl.conf` + +## Rollback (If Needed) + +To re-enable IPv6: + +```bash +# Delete the DaemonSet +kubectl delete daemonset disable-ipv6 -n kube-system + +# On each node, manually re-enable IPv6: +# ssh to node and run: +sudo sysctl -w net.ipv6.conf.all.disable_ipv6=0 +sudo sysctl -w net.ipv6.conf.default.disable_ipv6=0 + +# Remove from sysctl.conf +sudo sed -i '/disable_ipv6/d' /etc/sysctl.conf + +# Recreate the IPv6 IP pool (if desired) +# You 
would need the original IPv6 pool configuration +``` + +## Date Applied + +- **2026-01-14** - IPv6 IP pool deleted +- **2026-01-14** - DaemonSet deployed and verified on all 3 nodes (anubis, babel, ra) + +## Nodes Affected + +- anubis (192.168.1.42) +- babel (192.168.1.43) +- ra (192.168.1.44) + +All nodes now have IPv6 disabled at the kernel level. diff --git a/docs/KUBELET_SYSCTL_CONFIGURATION.md b/docs/KUBELET_SYSCTL_CONFIGURATION.md new file mode 100644 index 00000000..f5e7161b --- /dev/null +++ b/docs/KUBELET_SYSCTL_CONFIGURATION.md @@ -0,0 +1,312 @@ +# Kubelet Unsafe Sysctl Configuration - Status Report + +**Date**: 2026-01-14 +**Goal**: Enable `net.ipv6.conf.all.disable_ipv6` sysctl in Kubernetes pods to fix Chronicle DNS issues + +--- + +## Current Status: ❌ NOT WORKING + +Pods attempting to use the `net.ipv6.conf.all.disable_ipv6` sysctl still fail with: +``` +Status: SysctlForbidden +``` + +## The Problem We're Solving + +Chronicle uses `uv` (Rust-based package manager) which doesn't gracefully fall back from IPv6 to IPv4 when making HTTP connections to PyPI. This causes DNS resolution failures: + +``` +dns error: failed to lookup address information: Name has no usable address +``` + +**Root Cause**: The cluster nodes have no IPv6 routing to the internet, but `uv`'s HTTP client (reqwest) tries IPv6 first and fails instead of falling back to IPv4. + +**Solution**: Disable IPv6 at the pod level so `uv` never attempts IPv6 connections. + +--- + +## What We've Configured + +### 1. Added Kubelet Argument to All Nodes (✅ DONE) + +Used a DaemonSet to add the following line to `/var/snap/microk8s/current/args/kubelet` on all nodes: + +```bash +--allowed-unsafe-sysctls='net.ipv6.*' +``` + +**Verification on anubis**: +```bash +$ cat /var/snap/microk8s/current/args/kubelet | grep allowed +--allowed-unsafe-sysctls='net.ipv6.*' +``` + +✅ The configuration file has been updated correctly. + +### 2. 
Updated kubernetes_manager.py (✅ DONE) + +Added pod-level sysctl to deployment spec (lines 675-684): + +```python +"securityContext": { + "sysctls": [ + { + "name": "net.ipv6.conf.all.disable_ipv6", + "value": "1" + } + ] +} +``` + +✅ The code is ready to deploy pods with IPv6 disabled. + +--- + +## What We've Tried + +### Attempt 1: Deploy DaemonSet to Modify Kubelet Args +**Status**: ✅ Success +**Method**: Created `/tmp/allow-ipv6-sysctl-daemonset.yaml` +**Result**: Configuration added to `/var/snap/microk8s/current/args/kubelet` + +### Attempt 2: Restart MicroK8s Using `microk8s stop && microk8s start` +**Status**: ⚠️ Partial - Caused control plane downtime +**Method**: Privileged pod with nsenter to run `microk8s stop && start` +**Result**: Control plane went down (anubis is the master). User manually recovered cluster. + +### Attempt 3: Restart Individual Nodes (babel, ra) +**Status**: ✅ Completed +**Method**: Privileged pods with nsenter to restart worker nodes +**Result**: Nodes restarted successfully, cluster stable + +### Attempt 4: Restart Kubelet Daemon +**Status**: ❌ Failed +**Method**: `nsenter -t 1 ... snapctl restart microk8s.daemon-kubelet` +**Result**: Command executed but sysctl still forbidden + +### Attempt 5: Restart All MicroK8s Services Using `snap restart` +**Status**: ✅ Command succeeded +**Method**: `nsenter -t 1 ... 
snap restart microk8s` +**Result**: Command executed, cluster came back online, but sysctl still forbidden + +### Attempt 6: Test Pod with IPv6 Sysctl +**Status**: ❌ Failed (tested 3 times) +**Test Pods**: `test-ipv6-sysctl`, `test-ipv6-sysctl-v2`, `test-ipv6-sysctl-v3` +**Result**: All failed with `Status: SysctlForbidden` + +--- + +## Current Cluster State + +**Nodes**: All healthy +``` +NAME STATUS ROLES AGE VERSION +anubis Ready 141d v1.33.7 +babel Ready 140d v1.33.7 +ra Ready 47d v1.33.7 +``` + +**Core Services**: Running +- calico-node: Running on all nodes +- coredns: Running +- Cluster is stable and accepting workloads + +**Kubelet Args File**: Contains the correct configuration +```bash +--allowed-unsafe-sysctls='net.ipv6.*' +``` + +**Test Pod Status**: Failing +``` +NAME READY STATUS RESTARTS AGE +test-ipv6-sysctl-v3 0/1 SysctlForbidden 0 XXs +``` + +--- + +## Why It's Not Working + +### Theory 1: Kubelet Hasn't Reloaded the Args File + +Even though we modified `/var/snap/microk8s/current/args/kubelet`, the kubelet process may not have picked up the new configuration. + +**Evidence**: +- We modified the args file ✅ +- We restarted services ✅ +- But pods still fail with SysctlForbidden ❌ + +**Possible Cause**: The kubelet may need a different restart method, or the args file might not be read during restart. + +### Theory 2: MicroK8s Overrides the Args File + +MicroK8s might have a different mechanism for configuring kubelet that overrides the args file. + +**Possible Locations**: +- `/var/snap/microk8s/current/args/*` - Other arg files +- MicroK8s snap configuration +- systemd drop-in files + +### Theory 3: Wrong Restart Method + +We used `snap restart microk8s` which should restart all services, but maybe the kubelet needs a more forceful restart: +- Kill the kubelet process directly? +- Full node reboot? +- Different snap command? 
+ +--- + +## What We Haven't Tried Yet + +### Option 1: Check Actual Kubelet Process Arguments ⏭️ NEXT STEP + +Verify what arguments the running kubelet process actually has: +```bash +ps auxww | grep kubelet | grep -v grep +``` + +This will show if `--allowed-unsafe-sysctls` is actually being passed to the process. + +### Option 2: Full Node Reboot + +A complete node reboot would ensure all services start fresh with new configuration: +```bash +# On each node: +sudo reboot +``` + +**Risk**: Cluster downtime during reboots. + +### Option 3: Manually Kill and Restart Kubelet + +Force-kill the kubelet process and let snap restart it: +```bash +pkill -9 kubelet +# Snap should auto-restart it +``` + +### Option 4: Check MicroK8s Documentation + +Look for MicroK8s-specific way to configure kubelet: +- `microk8s kubectl` might have different config paths +- MicroK8s might use a different config mechanism + +### Option 5: Alternative: Use PodSecurityPolicy + +Instead of sysctls, use PodSecurityPolicy to allow the sysctl: +```yaml +apiVersion: policy/v1beta1 +kind: PodSecurityPolicy +metadata: + name: allow-ipv6-sysctl +spec: + allowedUnsafeSysctls: + - net.ipv6.conf.all.disable_ipv6 +``` + +**Note**: PodSecurityPolicy is deprecated in K8s 1.25+, replaced by Pod Security Standards. + +--- + +## Update: Root Cause Found + +**MicroK8s 1.33.7 uses kubelite architecture**: The cluster runs a single `kubelite` binary (not separate kubelet) that internally runs all K8s components. + +**Verification Results**: +✅ Args file contains correct flag: `--allowed-unsafe-sysctls='net.ipv6.*'` +❌ Running kubelite process does NOT have this flag +❌ Test pods still fail with `SysctlForbidden` + +**The Problem**: Even though the flag is in `/var/snap/microk8s/8596/args/kubelet`, kubelite's internal kubelet component is not honoring it. This may be: +1. A bug in kubelite's args file parsing +2. Version-specific limitation in MicroK8s 1.33.7 +3. 
Requires different restart method (full reboot) + +## Recommended Path Forward + +### Option 1: Enable IPv6 Internet Access (PREFERRED) + +Instead of disabling IPv6 in pods, fix the underlying issue - nodes don't have IPv6 internet connectivity. + +**Why This Is Better**: +- No security workarounds needed +- Works with all software (not just workarounds) +- Future-proof as more services require IPv6 + +**Implementation**: +1. Enable IPv6 internet routing on host nodes +2. Configure NAT64/DNS64 if needed for IPv4 services +3. Verify connectivity: `ping6 pypi.org` + +**Resources**: +- [MicroK8s Dual-Stack Configuration](https://microk8s.io/docs/explain-dual-stack) +- [IPv6 Masquerading for Egress on MicroK8s](https://www.checklyhq.com/blog/ipv6-masquerading-for-egress-on-microk8s-on-ec2/) +- [How to enable IPv6 when MicroK8s is already installed](https://discuss.kubernetes.io/t/how-can-i-enable-ipv6-when-microk8s-is-already-installed/25312) + +### Option 2: Full Node Reboot + +If you want to continue with the sysctl approach, try full node reboots: +```bash +# Reboot one at a time to maintain availability +ssh anubis "sudo reboot" +# Wait for anubis to come back +ssh babel "sudo reboot" +# Wait for babel to come back +ssh ra "sudo reboot" +``` + +**Risk**: May still not work due to kubelite limitation. 
+ +### Option 3: Use UV_NO_SYNC Workaround + +Modify Chronicle's startup command to skip package sync: +```yaml +command: + - /bin/bash + - -c + - | + export UV_NO_SYNC=1 + exec uv run --extra deepgram python src/advanced_omi_backend/main.py +``` + +**Downsides**: +- Only works if dependencies are pre-installed in image +- Doesn't fix the underlying IPv6 issue +- Other services may have similar problems + +--- + +## Resources Created + +### Files Modified +- `/var/snap/microk8s/current/args/kubelet` - Added `--allowed-unsafe-sysctls='net.ipv6.*'` on all nodes +- `/Users/stu/repos/worktrees/ushadow/purple/ushadow/backend/src/services/kubernetes_manager.py` - Added sysctl to pod spec (lines 675-684) + +### DaemonSets Deployed +- `allow-ipv6-sysctl` (namespace: kube-system) - Configured kubelet args +- `disable-ipv6` (namespace: kube-system) - Disabled IPv6 at host level (still running) + +### Test Pods Created +- `test-ipv6-sysctl` - Deleted +- `test-ipv6-sysctl-v2` - Deleted +- `test-ipv6-sysctl-v3` - Currently failing with SysctlForbidden + +--- + +## Related Documentation + +- [IPv6 DNS Fix](./IPV6_DNS_FIX.md) - Previous attempts at fixing IPv6 issues +- [Kubernetes Volume Mounting](./KUBERNETES_VOLUME_MOUNTING.md) - Volume mount implementation for config files + +--- + +## Summary + +We have successfully: +1. ✅ Modified kubelet args file to allow IPv6 sysctls +2. ✅ Updated code to deploy pods with IPv6 sysctl +3. ✅ Restarted MicroK8s services + +But the kubelet is still rejecting the sysctl. The configuration in the args file is correct, but the running kubelet process doesn't appear to be using it. + +**Immediate Action Required**: Verify the actual kubelet process arguments to determine if the configuration was loaded. 
diff --git a/docs/KUBERNETES_VOLUME_MOUNTING.md b/docs/KUBERNETES_VOLUME_MOUNTING.md new file mode 100644 index 00000000..2e89885e --- /dev/null +++ b/docs/KUBERNETES_VOLUME_MOUNTING.md @@ -0,0 +1,296 @@ +# Kubernetes Volume Mounting for Docker Compose Services + +## Overview + +The `kubernetes_manager.py` now automatically handles volume mounts from Docker Compose files when deploying services to Kubernetes. This allows services like Chronicle that require config files to work seamlessly in K8s without manual ConfigMap creation. + +## How It Works + +When deploying a service with volumes defined in its compose file: + +### 1. Volume Parsing + +The system parses volumes from the `volumes:` section of the compose service definition: + +```yaml +volumes: + - ${PROJECT_ROOT}/config/config.yml:/app/config.yml:ro + - ${PROJECT_ROOT}/config/defaults.yml:/app/config/defaults.yml:ro + - chronicle_data:/app/data + - chronicle_audio:/app/audio_chunks +``` + +### 2. Volume Type Detection + +**Config Files (Bind Mounts)**: +- If source is an existing **file** on the host: `${PROJECT_ROOT}/config/config.yml` +- Action: Read file contents and create a ConfigMap +- Result: File mounted into container via ConfigMap + +**Data Volumes (Named Volumes or Directories)**: +- If source is a **directory** or doesn't exist: `chronicle_data`, `/app/data` +- Action: Create an `emptyDir` volume +- Result: Ephemeral storage mounted into container + +### 3. Kubernetes Resources Created + +For a service with config files, the system creates: + +1. **ConfigMap for environment variables** (`{service}-config`) + - Non-sensitive environment variables + +2. **Secret for sensitive data** (`{service}-secrets`) + - API keys, passwords, tokens + +3. **ConfigMap for config files** (`{service}-files`) + - File contents from bind mounts + - Each file becomes a key in the ConfigMap + +4. 
**Deployment** with: + - `volumeMounts` referencing the config files + - `volumes` definitions for ConfigMaps and emptyDirs + +## Example: Chronicle Deployment + +### Compose File (chronicle-compose.yaml) + +```yaml +services: + chronicle-backend: + image: ghcr.io/ushadow-io/chronicle/backend:no-spacy + volumes: + - ${PROJECT_ROOT}/config/config.yml:/app/config.yml:ro + - chronicle_audio:/app/audio_chunks + - chronicle_data:/app/data +``` + +### Generated Kubernetes Resources + +**ConfigMap: chronicle-backend-files** +```yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: chronicle-backend-files + namespace: ushadow +data: + config.yml: | + # Full contents of config/config.yml + llm: + default: openai + providers: + - id: openai + ... +``` + +**Deployment: chronicle-backend** +```yaml +apiVersion: apps/v1 +kind: Deployment +spec: + template: + spec: + containers: + - name: chronicle-backend + volumeMounts: + - name: config-files + mountPath: /app/config.yml + subPath: config.yml + readOnly: true + - name: chronicle-audio + mountPath: /app/audio_chunks + - name: chronicle-data + mountPath: /app/data + volumes: + - name: config-files + configMap: + name: chronicle-backend-files + - name: chronicle-audio + emptyDir: {} + - name: chronicle-data + emptyDir: {} +``` + +## Supported Volume Formats + +### Bind Mount with Environment Variables +```yaml +- ${PROJECT_ROOT}/config/file.yml:/app/config/file.yml:ro +``` +- Environment variables are expanded using `os.path.expandvars()` +- `:ro` suffix makes the mount read-only + +### Named Volume +```yaml +- volume_name:/container/path +``` +- Creates an `emptyDir` volume +- Data is ephemeral (lost when pod restarts) + +### Absolute Path +```yaml +- /host/path:/container/path +``` +- If path is a file: Creates ConfigMap +- If path is a directory: Creates emptyDir + +## Configuration + +No additional configuration needed! The system automatically: + +1. **Detects environment variables** in volume source paths +2. 
**Resolves `${PROJECT_ROOT}`** to the backend's working directory +3. **Reads local files** and creates ConfigMaps +4. **Adds volumes** to the Deployment manifest + +## Limitations + +### Current Limitations + +1. **File Size**: ConfigMaps are limited to ~1MB per file + - For larger files, use PersistentVolumes instead + +2. **Data Persistence**: Named volumes use `emptyDir` (ephemeral) + - Data is lost when pod restarts + - Future: Could add PersistentVolumeClaim support + +3. **File Path Resolution**: Only works for files accessible from the backend + - Files must exist at deployment time + - Backend must have read permissions + +### Future Enhancements + +- [ ] Support for PersistentVolumeClaims for persistent data +- [ ] Binary file support (currently text files only) +- [ ] Directory mounting (currently file-level only) +- [ ] ConfigMap size validation and warnings + +## Deployment Process + +### From UI + +1. Navigate to K8s Clusters → Deploy Service +2. Select Chronicle (or any service with volumes) +3. Click Deploy +4. System automatically: + - Reads `config.yml` and `defaults.yml` from local filesystem + - Creates `chronicle-backend-files` ConfigMap + - Mounts files into pod at correct paths + +### From API + +```bash +curl -X POST http://localhost:8400/api/kubernetes/{cluster_id}/deploy \ + -H "Content-Type: application/json" \ + -d '{ + "service_id": "chronicle-compose:chronicle-backend", + "namespace": "ushadow" + }' +``` + +## Troubleshooting + +### Config File Not Found in Pod + +**Symptom**: Pod logs show "No config.yml found" + +**Check**: +1. Verify ConfigMap exists: + ```bash + kubectl get configmap {service}-files -n {namespace} + kubectl describe configmap {service}-files -n {namespace} + ``` + +2. Check file was read at deployment time: + ```bash + # Check backend logs + grep "Adding config file" /tmp/k8s-manifests/{cluster_id}/{namespace}/*.yaml + ``` + +3. 
Verify volume mount in pod: + ```bash + kubectl describe pod {pod-name} -n {namespace} | grep -A 5 "Mounts:" + ``` + +### ConfigMap Too Large + +**Symptom**: Deployment fails with "ConfigMap too large" + +**Solution**: ConfigMaps are limited to ~1MB. For larger files: +- Split into multiple smaller files +- Use PersistentVolume instead +- Store large files in the container image + +### File Not Updated After Changes + +**Symptom**: Changes to local config file don't appear in pod + +**Solution**: Redeploy the service to update the ConfigMap: +```bash +# Delete deployment +kubectl delete deployment {service} -n {namespace} + +# Redeploy via UI or API +``` + +ConfigMaps are immutable once created, so you need to recreate the deployment. + +## Code Implementation + +The volume mounting logic is implemented in: + +**File**: `ushadow/backend/src/services/kubernetes_manager.py` + +**Key Functions**: +- `compile_service_to_k8s()` - Parses volumes and creates manifests +- Lines 479-548 - Volume parsing logic +- Lines 587-598 - ConfigMap for config files creation +- Lines 670-674 - Volume mounts in Deployment + +**Key Variables**: +- `config_files` - Dict of filename → file content for ConfigMap +- `volume_mounts` - List of volumeMount specs for container +- `k8s_volumes` - List of volume definitions for pod + +## Examples + +### Multiple Config Files + +```yaml +volumes: + - ./config/app.yml:/app/config/app.yml:ro + - ./config/database.yml:/app/config/database.yml:ro + - ./config/features.yml:/app/config/features.yml:ro +``` + +All three files are added to the same ConfigMap (`{service}-files`) and mounted individually. + +### Mixed Volume Types + +```yaml +volumes: + - ./config.yml:/app/config.yml:ro # ConfigMap + - ./data:/app/data # emptyDir + - logs:/app/logs # emptyDir +``` + +Config files go to ConfigMap, directories become emptyDir volumes. 
+ +### Read-Only vs Read-Write + +```yaml +volumes: + - ./config.yml:/app/config.yml:ro # Read-only + - ./data:/app/data # Read-write +``` + +The `:ro` suffix is respected in the volumeMount's `readOnly` field. + +## Date Implemented + +**2026-01-14** - Volume mounting support added to kubernetes_manager.py + +## Related Documentation + +- [IPv6 DNS Fix](./IPV6_DNS_FIX.md) - DNS resolution issues in K8s +- [Kubernetes Integration](./KUBERNETES_INTEGRATION.md) - General K8s deployment diff --git a/scripts/k8s-helpers.sh b/scripts/k8s-helpers.sh new file mode 100755 index 00000000..67372d6b --- /dev/null +++ b/scripts/k8s-helpers.sh @@ -0,0 +1,267 @@ +#!/bin/bash +# Kubernetes Helper Scripts for Ushadow +# Common operations for managing Ushadow on Kubernetes + +set -e + +NAMESPACE="${NAMESPACE:-ushadow}" +K8S_DIR="${K8S_DIR:-k8s}" + +# Colors +GREEN='\033[0;32m' +BLUE='\033[0;34m' +YELLOW='\033[1;33m' +RED='\033[0;31m' +NC='\033[0m' + +print_info() { echo -e "${BLUE}ℹ${NC} $1"; } +print_success() { echo -e "${GREEN}✓${NC} $1"; } +print_warning() { echo -e "${YELLOW}⚠${NC} $1"; } +print_error() { echo -e "${RED}✗${NC} $1"; } + +# Function to check cluster connection +check_cluster() { + print_info "Checking Kubernetes cluster connection..." + if kubectl cluster-info &> /dev/null; then + print_success "Connected to cluster: $(kubectl config current-context)" + else + print_error "Cannot connect to Kubernetes cluster" + exit 1 + fi +} + +# Function to deploy infrastructure only +deploy_infra() { + print_info "Deploying infrastructure services (MongoDB, Redis, etc.)..." + kubectl apply -f "${K8S_DIR}/namespace.yaml" + kubectl apply -f "${K8S_DIR}/infra/" -n "${NAMESPACE}" + print_success "Infrastructure deployed" +} + +# Function to deploy application only +deploy_app() { + print_info "Deploying application services (backend, frontend)..." 
+ kubectl apply -f "${K8S_DIR}/namespace.yaml" + kubectl apply -f "${K8S_DIR}/configmap.yaml" + kubectl apply -f "${K8S_DIR}/secret.yaml" + kubectl apply -f "${K8S_DIR}/base/" -n "${NAMESPACE}" + print_success "Application deployed" +} + +# Function to deploy everything +deploy_all() { + print_info "Deploying all services..." + kubectl apply -k "${K8S_DIR}/" + print_success "All services deployed" +} + +# Function to get status +get_status() { + print_info "Getting status of all resources in namespace ${NAMESPACE}..." + echo "" + kubectl get all -n "${NAMESPACE}" + echo "" + print_info "Persistent Volume Claims:" + kubectl get pvc -n "${NAMESPACE}" +} + +# Function to get logs +get_logs() { + local service=$1 + if [ -z "$service" ]; then + print_error "Usage: $0 logs " + echo "Available services:" + kubectl get pods -n "${NAMESPACE}" -o jsonpath='{.items[*].metadata.labels.app}' | tr ' ' '\n' | sort -u + exit 1 + fi + + print_info "Getting logs for ${service}..." + kubectl logs -n "${NAMESPACE}" -l app="${service}" --tail=100 -f +} + +# Function to restart a service +restart_service() { + local service=$1 + if [ -z "$service" ]; then + print_error "Usage: $0 restart " + exit 1 + fi + + print_info "Restarting ${service}..." + kubectl rollout restart deployment/"${service}" -n "${NAMESPACE}" + print_success "Restart initiated for ${service}" +} + +# Function to scale a service +scale_service() { + local service=$1 + local replicas=$2 + + if [ -z "$service" ] || [ -z "$replicas" ]; then + print_error "Usage: $0 scale " + exit 1 + fi + + print_info "Scaling ${service} to ${replicas} replicas..." 
+ kubectl scale deployment/"${service}" --replicas="${replicas}" -n "${NAMESPACE}" + print_success "Scaled ${service} to ${replicas} replicas" +} + +# Function to port-forward to a service +port_forward() { + local service=$1 + local port=$2 + + if [ -z "$service" ] || [ -z "$port" ]; then + print_error "Usage: $0 port-forward " + echo "Example: $0 port-forward backend 8000:8000" + exit 1 + fi + + print_info "Port forwarding ${service} on ${port}..." + kubectl port-forward -n "${NAMESPACE}" "svc/${service}" "${port}" +} + +# Function to delete all resources +delete_all() { + print_warning "This will delete ALL resources in namespace ${NAMESPACE}" + read -p "Are you sure? (yes/no): " confirm + + if [ "$confirm" != "yes" ]; then + print_info "Deletion cancelled" + exit 0 + fi + + print_info "Deleting all resources..." + kubectl delete namespace "${NAMESPACE}" + print_success "All resources deleted" +} + +# Function to create a secret from .env file +create_secret_from_env() { + if [ ! -f .env ]; then + print_error ".env file not found" + exit 1 + fi + + print_info "Creating secret from .env file..." + kubectl create secret generic ushadow-env-secret \ + --from-env-file=.env \ + -n "${NAMESPACE}" \ + --dry-run=client -o yaml | kubectl apply -f - + + print_success "Secret created from .env" +} + +# Function to execute command in a pod +exec_pod() { + local service=$1 + shift + local cmd="$@" + + if [ -z "$service" ]; then + print_error "Usage: $0 exec " + echo "Example: $0 exec backend bash" + exit 1 + fi + + local pod=$(kubectl get pods -n "${NAMESPACE}" -l app="${service}" -o jsonpath='{.items[0].metadata.name}') + + if [ -z "$pod" ]; then + print_error "No pod found for service ${service}" + exit 1 + fi + + print_info "Executing in pod ${pod}..." 
+ kubectl exec -it -n "${NAMESPACE}" "${pod}" -- ${cmd} +} + +# Main menu +show_menu() { + echo "" + echo "╔════════════════════════════════════════════╗" + echo "║ Ushadow Kubernetes Helper Scripts ║" + echo "╚════════════════════════════════════════════╝" + echo "" + echo "Usage: $0 [args]" + echo "" + echo "Commands:" + echo " deploy-infra Deploy infrastructure only" + echo " deploy-app Deploy application only" + echo " deploy-all Deploy everything" + echo " status Get status of all resources" + echo " logs Tail logs for a service" + echo " restart Restart a service" + echo " scale Scale a service" + echo " port-forward Port forward to service" + echo " exec Execute command in pod" + echo " create-secret Create secret from .env file" + echo " delete-all Delete all resources (careful!)" + echo "" + echo "Environment Variables:" + echo " NAMESPACE=${NAMESPACE}" + echo " K8S_DIR=${K8S_DIR}" + echo "" +} + +# Main command dispatcher +main() { + local command=$1 + shift || true + + case "$command" in + check) + check_cluster + ;; + deploy-infra) + check_cluster + deploy_infra + ;; + deploy-app) + check_cluster + deploy_app + ;; + deploy-all) + check_cluster + deploy_all + ;; + status) + check_cluster + get_status + ;; + logs) + check_cluster + get_logs "$@" + ;; + restart) + check_cluster + restart_service "$@" + ;; + scale) + check_cluster + scale_service "$@" + ;; + port-forward) + check_cluster + port_forward "$@" + ;; + exec) + check_cluster + exec_pod "$@" + ;; + create-secret) + check_cluster + create_secret_from_env + ;; + delete-all) + check_cluster + delete_all + ;; + *) + show_menu + exit 1 + ;; + esac +} + +main "$@" diff --git a/ushadow/backend/src/models/deployment.py b/ushadow/backend/src/models/deployment.py index 1bf2a2a3..73c69083 100644 --- a/ushadow/backend/src/models/deployment.py +++ b/ushadow/backend/src/models/deployment.py @@ -152,6 +152,7 @@ class Deployment(BaseModel): id: str = Field(..., description="Unique deployment ID") service_id: str 
= Field(..., description="Reference to ServiceDefinition") unode_hostname: str = Field(..., description="Target u-node hostname") + instance_id: Optional[str] = Field(None, description="Instance ID (for instance-based deployments)") # Status status: DeploymentStatus = Field( diff --git a/ushadow/backend/src/services/deployment_backends.py b/ushadow/backend/src/services/deployment_backends.py index 33cde746..4643a4ad 100644 --- a/ushadow/backend/src/services/deployment_backends.py +++ b/ushadow/backend/src/services/deployment_backends.py @@ -198,7 +198,12 @@ async def deploy( if self._is_local_deployment(unode): # Use Docker directly for local deployments logger.info("Using local Docker for deployment") - return await self._deploy_local(unode, resolved_service, deployment_id, container_name) + return await self._deploy_local( + unode, + resolved_service, + deployment_id, + container_name + ) # Build deploy payload for remote unode manager payload = { diff --git a/ushadow/backend/src/services/deployment_manager.py b/ushadow/backend/src/services/deployment_manager.py index bc9f06bd..7ffe79f8 100644 --- a/ushadow/backend/src/services/deployment_manager.py +++ b/ushadow/backend/src/services/deployment_manager.py @@ -440,7 +440,8 @@ async def deploy_service( self, service_id: str, unode_hostname: str, - namespace: Optional[str] = None + namespace: Optional[str] = None, + instance_id: Optional[str] = None ) -> Deployment: """ Deploy a service to any deployment target (Docker unode or K8s cluster). 
@@ -452,6 +453,7 @@ async def deploy_service( service_id: Service to deploy unode_hostname: Target unode hostname (Docker host or K8s cluster ID) namespace: Optional K8s namespace (only used for K8s deployments) + instance_id: Optional instance ID (for instance-based deployments) """ # Resolve service with all variables substituted try: @@ -472,17 +474,28 @@ async def deploy_service( unode = UNode(**unode_dict) # Check if already deployed - existing = await self.deployments_collection.find_one({ + # If instance_id is provided, check for that specific instance + # Otherwise, check for any deployment of this service (legacy behavior) + query = { "service_id": service_id, "unode_hostname": unode_hostname - }) + } + if instance_id: + query["instance_id"] = instance_id + + existing = await self.deployments_collection.find_one(query) if existing and existing.get("status") in [ DeploymentStatus.RUNNING, DeploymentStatus.DEPLOYING ]: - raise ValueError( - f"Service {service_id} already deployed to {unode_hostname}" - ) + if instance_id: + raise ValueError( + f"Instance {instance_id} already deployed to {unode_hostname}" + ) + else: + raise ValueError( + f"Service {service_id} already deployed to {unode_hostname}" + ) # Create deployment ID deployment_id = str(uuid.uuid4())[:8] @@ -496,6 +509,43 @@ async def deploy_service( backend = get_deployment_backend(unode, k8s_manager) + # Check for port conflicts using the existing method (Docker only) + if unode.type != UNodeType.KUBERNETES: + from src.services.docker_manager import get_docker_manager + docker_mgr = get_docker_manager() + + # Get the service name from the resolved service + service_name = resolved_service.compose_service_name + + # Use existing port conflict checking method + conflicts = docker_mgr.check_port_conflicts(service_name) + + if conflicts: + logger.info(f"Found {len(conflicts)} port conflicts for {service_name}, remapping ports") + + # Remap ports in resolved_service to use suggested alternatives + 
updated_ports = [] + for port_str in resolved_service.ports: + if ":" in port_str: + host_port, container_port = port_str.split(":") + original_port = int(host_port) + + # Find if this port has a conflict + conflict = next((c for c in conflicts if c.port == original_port), None) + if conflict and conflict.suggested_port: + # Use suggested alternative port + updated_ports.append(f"{conflict.suggested_port}:{container_port}") + logger.info(f"Remapped port {original_port} -> {conflict.suggested_port}") + else: + updated_ports.append(port_str) + else: + updated_ports.append(port_str) + + # Update the resolved service with new ports + resolved_service.ports = updated_ports + else: + logger.info(f"No port conflicts detected for {service_name}") + # Deploy using the backend try: deployment = await backend.deploy( @@ -505,6 +555,9 @@ async def deploy_service( namespace=namespace ) + # Set instance_id on the deployment + deployment.instance_id = instance_id + # For Docker deployments, update tailscale serve routes if deployment.backend_type == "docker": is_local = _is_local_deployment(unode_hostname) @@ -540,6 +593,7 @@ async def deploy_service( id=deployment_id, service_id=service_id, unode_hostname=unode_hostname, + instance_id=instance_id, status=DeploymentStatus.FAILED, created_at=datetime.now(timezone.utc), deployed_config=resolved_service.model_dump(), diff --git a/ushadow/backend/src/services/instance_manager.py b/ushadow/backend/src/services/instance_manager.py index 5915954f..36909e87 100644 --- a/ushadow/backend/src/services/instance_manager.py +++ b/ushadow/backend/src/services/instance_manager.py @@ -130,6 +130,10 @@ def _load_instances(self) -> None: deployed_at=instance_data.get('deployed_at'), updated_at=instance_data.get('updated_at'), error=instance_data.get('error'), + # Deployment tracking + deployment_id=instance_data.get('deployment_id'), + container_id=instance_data.get('container_id'), + container_name=instance_data.get('container_name'), # 
Integration-specific fields integration_type=instance_data.get('integration_type'), sync_enabled=instance_data.get('sync_enabled'), @@ -196,7 +200,9 @@ def _save_instances(self) -> None: if instance.deployment_target: instance_data['deployment_target'] = instance.deployment_target if instance.status != InstanceStatus.PENDING: - instance_data['status'] = instance.status + # Handle both enum and string status values + status_value = instance.status.value if isinstance(instance.status, InstanceStatus) else instance.status + instance_data['status'] = status_value if instance.outputs.access_url or instance.outputs.env_vars: instance_data['outputs'] = {} if instance.outputs.access_url: @@ -210,6 +216,14 @@ def _save_instances(self) -> None: if instance.error: instance_data['error'] = instance.error + # Deployment tracking fields + if instance.deployment_id: + instance_data['deployment_id'] = instance.deployment_id + if instance.container_id: + instance_data['container_id'] = instance.container_id + if instance.container_name: + instance_data['container_name'] = instance.container_name + # Integration-specific fields if instance.integration_type is not None: instance_data['integration_type'] = instance.integration_type @@ -452,8 +466,10 @@ def update_instance_status( async def deploy_instance(self, instance_id: str) -> tuple[bool, str]: """Deploy/start an instance. 
- For compose services: starts the docker container - For cloud providers: marks as N/A (always available) + Routes deployment based on deployment_target: + - None: Local docker (ServiceOrchestrator) + - "cloud": Cloud provider (marks as N/A) + - hostname: Remote unode (DeploymentManager) """ self._ensure_loaded() @@ -469,54 +485,121 @@ async def deploy_instance(self, instance_id: str) -> tuple[bool, str]: compose_service = compose_registry.get_service(instance.template_id) if compose_service: - # This is a compose service - use ServiceOrchestrator - from src.services.service_orchestrator import get_service_orchestrator - orchestrator = get_service_orchestrator() - - # Update status to deploying - instance.status = InstanceStatus.DEPLOYING - self._save_instances() + # Check deployment target + if instance.deployment_target and instance.deployment_target != "cloud": + # Remote unode deployment - use DeploymentManager + from src.services.deployment_manager import get_deployment_manager + deployment_manager = get_deployment_manager() - # Use service_name (not template_id) for orchestrator calls - service_name = compose_service.service_name + # Update status to deploying + instance.status = InstanceStatus.DEPLOYING + self._save_instances() - try: - result = await orchestrator.start_service(service_name, instance_id=instance_id) - if result.success: - # Get the service status to find access URL - status_info = await orchestrator.get_service_status(service_name) - access_url = None - if status_info and status_info.get("status") == "running": - # Try to get the access URL from docker details - details = await orchestrator.get_docker_details(service_name) - if details and details.ports: - # Use first mapped port - for port_info in details.ports: - if port_info.get("host_port"): - access_url = f"http://localhost:{port_info['host_port']}" - break + try: + # Deploy via deployment manager (creates Deployment record) + deployment = await deployment_manager.deploy_service( + 
service_id=compose_service.service_id, + unode_hostname=instance.deployment_target, + instance_id=instance_id + ) + # Store deployment_id in instance + instance.deployment_id = deployment.id + instance.container_id = deployment.container_id + instance.container_name = deployment.container_name + + # Update instance status based on deployment + if deployment.status == "running": + self.update_instance_status( + instance_id, + InstanceStatus.RUNNING, + access_url=deployment.access_url, + ) + return True, f"Service deployed to {instance.deployment_target}" + else: + self.update_instance_status( + instance_id, + InstanceStatus.DEPLOYING, + ) + return True, f"Service deploying to {instance.deployment_target}" + + except Exception as e: + logger.exception(f"Failed to deploy instance {instance_id} to unode") self.update_instance_status( instance_id, - InstanceStatus.RUNNING, - access_url=access_url, + InstanceStatus.ERROR, + error=str(e), ) - return True, f"Service {instance.template_id} started" - else: + return False, str(e) + else: + # Local docker deployment - use ServiceOrchestrator + from src.services.service_orchestrator import get_service_orchestrator + from src.services.docker_manager import get_docker_manager + from src.config.omegaconf_settings import get_settings_store + + orchestrator = get_service_orchestrator() + docker_mgr = get_docker_manager() + settings_store = get_settings_store() + + # Update status to deploying + instance.status = InstanceStatus.DEPLOYING + self._save_instances() + + # Use service_name (not template_id) for orchestrator calls + service_name = compose_service.service_name + + # Check for port conflicts before deploying + conflicts = docker_mgr.check_port_conflicts(service_name) + if conflicts: + logger.info(f"Found {len(conflicts)} port conflicts for {service_name}, remapping to available ports") + + # Remap ports to suggested alternatives + for conflict in conflicts: + if conflict.env_var and conflict.suggested_port: + # Save port 
override in service preferences + # This matches the pattern from /api/services/{name}/port-override + pref_key = f"services.{service_name}.ports.{conflict.env_var}" + await settings_store.set(pref_key, conflict.suggested_port) + logger.info(f"Remapped {conflict.env_var}: {conflict.port} -> {conflict.suggested_port}") + + try: + result = await orchestrator.start_service(service_name, instance_id=instance_id) + if result.success: + # Get the service status to find access URL + status_info = await orchestrator.get_service_status(service_name) + access_url = None + if status_info and status_info.get("status") == "running": + # Try to get the access URL from docker details + details = await orchestrator.get_docker_details(service_name) + if details and details.ports: + # ports is Dict[str, str] where key is container port, value is host port + # e.g., {"8080/tcp": "32768"} + for container_port, host_port in details.ports.items(): + if host_port: + access_url = f"http://localhost:{host_port}" + break + + self.update_instance_status( + instance_id, + InstanceStatus.RUNNING, + access_url=access_url, + ) + return True, f"Service {service_name} started" + else: + self.update_instance_status( + instance_id, + InstanceStatus.ERROR, + error=result.message, + ) + return False, result.message + except Exception as e: + logger.exception(f"Failed to deploy instance {instance_id}") self.update_instance_status( instance_id, InstanceStatus.ERROR, - error=result.message, + error=str(e), ) - return False, result.message - except Exception as e: - logger.exception(f"Failed to deploy instance {instance_id}") - self.update_instance_status( - instance_id, - InstanceStatus.ERROR, - error=str(e), - ) - return False, str(e) + return False, str(e) else: # Cloud provider - mark as N/A (always available) self.update_instance_status(instance_id, InstanceStatus.NOT_APPLICABLE) @@ -550,7 +633,7 @@ async def undeploy_instance(self, instance_id: str) -> tuple[bool, str]: service_name = 
compose_service.service_name try: - result = await orchestrator.stop_service(service_name, instance_id=instance_id) + result = orchestrator.stop_service(service_name) if result.success: self.update_instance_status(instance_id, InstanceStatus.STOPPED) return True, f"Service {service_name} stopped" From f353c6822a163f528ba162567f1c06a0e446feed Mon Sep 17 00:00:00 2001 From: Stu Alexander Date: Fri, 16 Jan 2026 12:58:36 +0000 Subject: [PATCH 03/45] Pre-rename checkpoint: Port conflict work + architecture docs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Added port conflict detection to deployment flow - Created architecture documentation (ARCHITECTURE_OVERVIEW.md, UNIFIED_CONFIG_ARCHITECTURE.md) - Added rename script for Instance → ServiceConfig refactoring - K8s deployment files and configuration updates - Ready for automated renaming via scripts/rename_to_service_config.py --- .tmux-attach.sh | 3 + .tmux.conf | 31 + ARCHITECTURE_OVERVIEW.md | 212 ++++++ K8S_ARCHITECTURE.md | 342 +++++++++ KUBERNETES.md | 513 ++++++++++++++ KUBERNETES_INTEGRATION.md | 658 ++++++++++++++++++ REFACTORING_PLAN.md | 220 ++++++ UNIFIED_CONFIG_ARCHITECTURE.md | 271 ++++++++ compose/chronicle-compose.yaml | 4 +- compose/openmemory-compose.yaml | 6 +- config/defaults.yml | 96 +++ config/instances.yaml | 49 ++ config/kubeconfigs/003fd5798ebbea9f.enc | 1 + config/tailscale-serve.json | 23 + config/wiring.yaml | 17 + deploy.sh | 508 ++++++++++++++ k8s/base/backend-deployment.yaml | 36 + k8s/base/backend-service.yaml | 13 + k8s/base/webui-deployment.yaml | 29 + k8s/base/webui-service.yaml | 13 + k8s/configmap.yaml | 9 + k8s/gai-conf-fix.yaml | 47 ++ k8s/infra/mongo-deployment.yaml | 23 + k8s/infra/mongo-service.yaml | 13 + k8s/infra/postgres-deployment.yaml | 30 + k8s/infra/postgres-service.yaml | 13 + k8s/infra/qdrant-deployment.yaml | 25 + k8s/infra/qdrant-service.yaml | 16 + k8s/infra/redis-deployment.yaml | 27 + k8s/infra/redis-service.yaml | 13 
+ k8s/kustomization.yaml | 22 + k8s/namespace.yaml | 7 + k8s/secret.yaml | 12 + k8s/tweaks/README.md | 63 ++ k8s/tweaks/ingress-example.yaml | 32 + k8s/tweaks/mongo-statefulset-example.yaml | 54 ++ scripts/rename_to_service_config.py | 322 +++++++++ test_k8s_deploy.py | 210 ++++++ ushadow/backend/src/config/instances.yaml | 76 +- ushadow/backend/src/config/wiring.yaml | 14 +- .../backend/src/middleware/app_middleware.py | 27 + 41 files changed, 4054 insertions(+), 46 deletions(-) create mode 100755 .tmux-attach.sh create mode 100644 .tmux.conf create mode 100644 ARCHITECTURE_OVERVIEW.md create mode 100644 K8S_ARCHITECTURE.md create mode 100644 KUBERNETES.md create mode 100644 KUBERNETES_INTEGRATION.md create mode 100644 REFACTORING_PLAN.md create mode 100644 UNIFIED_CONFIG_ARCHITECTURE.md create mode 100644 config/defaults.yml create mode 100644 config/instances.yaml create mode 100644 config/kubeconfigs/003fd5798ebbea9f.enc create mode 100644 config/tailscale-serve.json create mode 100644 config/wiring.yaml create mode 100755 deploy.sh create mode 100644 k8s/base/backend-deployment.yaml create mode 100644 k8s/base/backend-service.yaml create mode 100644 k8s/base/webui-deployment.yaml create mode 100644 k8s/base/webui-service.yaml create mode 100644 k8s/configmap.yaml create mode 100644 k8s/gai-conf-fix.yaml create mode 100644 k8s/infra/mongo-deployment.yaml create mode 100644 k8s/infra/mongo-service.yaml create mode 100644 k8s/infra/postgres-deployment.yaml create mode 100644 k8s/infra/postgres-service.yaml create mode 100644 k8s/infra/qdrant-deployment.yaml create mode 100644 k8s/infra/qdrant-service.yaml create mode 100644 k8s/infra/redis-deployment.yaml create mode 100644 k8s/infra/redis-service.yaml create mode 100644 k8s/kustomization.yaml create mode 100644 k8s/namespace.yaml create mode 100644 k8s/secret.yaml create mode 100644 k8s/tweaks/README.md create mode 100644 k8s/tweaks/ingress-example.yaml create mode 100644 
k8s/tweaks/mongo-statefulset-example.yaml create mode 100755 scripts/rename_to_service_config.py create mode 100644 test_k8s_deploy.py diff --git a/.tmux-attach.sh b/.tmux-attach.sh new file mode 100755 index 00000000..5230080b --- /dev/null +++ b/.tmux-attach.sh @@ -0,0 +1,3 @@ +#!/bin/bash +# Auto-generated tmux attach script +tmux attach-session -t workmux:ushadow-purple diff --git a/.tmux.conf b/.tmux.conf new file mode 100644 index 00000000..9d63c296 --- /dev/null +++ b/.tmux.conf @@ -0,0 +1,31 @@ +# User-friendly tmux configuration for Ushadow environments + +# Enable mouse support (scroll, select, resize panes) +set -g mouse on + +# Increase scrollback buffer +set -g history-limit 50000 + +# Don't rename windows automatically +set -g allow-rename off + +# Start window numbering at 1 +set -g base-index 1 + +# Enable 256 colors +set -g default-terminal "screen-256color" + +# Faster command sequences +set -s escape-time 0 + +# Status bar styling +set -g status-style bg=default,fg=white +set -g status-left-length 40 +set -g status-right "#[fg=yellow]#S #[fg=white]%H:%M" + +# Pane border colors +set -g pane-border-style fg=colour238 +set -g pane-active-border-style fg=colour39 + +# Fix mouse scrolling in terminal applications +set -g terminal-overrides 'xterm*:smcup@:rmcup@' diff --git a/ARCHITECTURE_OVERVIEW.md b/ARCHITECTURE_OVERVIEW.md new file mode 100644 index 00000000..40e1fe20 --- /dev/null +++ b/ARCHITECTURE_OVERVIEW.md @@ -0,0 +1,212 @@ +# Architecture Overview: Services vs Instances vs Deployments + +## Core Concepts + +### 1. 
**Service** (Services Page) +**Model**: Managed by `DockerManager.MANAGEABLE_SERVICES` (dynamically built from compose registry) +**Location**: `ushadow/backend/src/services/docker_manager.py` + +- Represents a **Docker Compose service** that can be started/stopped +- Lives in docker-compose files (e.g., `compose/openmemory-compose.yaml`) +- Discovered and registered automatically by `ComposeServiceRegistry` +- **Single instance per service** - only one copy can run at a time +- Start/stop controls the actual Docker container directly +- Has port conflict checking with user dialog for port overrides +- Port overrides saved to: `services.{service_name}.ports.{ENV_VAR}` + +**UI**: Services Page (`ushadow/frontend/src/pages/ServicesPage.tsx`) +- Shows cards with Start/Stop buttons +- Port conflict flow: preflight check → dialog → port override → retry + +**APIs**: +- `POST /api/services/{name}/start` - Start service +- `POST /api/services/{name}/stop` - Stop service +- `GET /api/services/{name}/preflight` - Check for port conflicts +- `POST /api/services/{name}/port-override` - Set port override + +**Hook**: `useServiceStart` from `ushadow/frontend/src/hooks/useServiceStart.ts` + +--- + +### 2. 
**Instance** (Instances Page) +**Model**: `Instance` in `ushadow/backend/src/models/instance.py` +**Manager**: `InstanceManager` in `ushadow/backend/src/services/instance_manager.py` + +- Represents a **template + configuration + deployment target** +- Can have **multiple instances** of the same template (e.g., openai-1, openai-2) +- Has lifecycle: PENDING → DEPLOYING → RUNNING → STOPPED → ERROR +- Can be deployed to: + - Local Docker (deployment_target=None) + - Remote unode (deployment_target=hostname) + - Cloud provider (deployment_target="cloud", status="n/a") + +**Deployment Types**: +- **Local Docker**: Uses `ServiceOrchestrator` (compose services) or direct Docker +- **Remote unode**: Creates a `Deployment` record, uses `DeploymentManager` +- **Cloud**: No actual deployment, just config storage + +**Port Handling**: +- For LOCAL deployments via orchestrator: Has port conflict checking code in `instance_manager.py:551-563` +- For REMOTE deployments: No port conflict checking (just added in `deployment_manager.py:512-547`) +- **Problem**: No user dialog, just logs + auto-remap + +**UI**: Instances Page (`ushadow/frontend/src/pages/InstancesPage.tsx`) +- Shows cards with Start/Stop buttons (similar to services) +- Start button calls `handleDeployInstance()` +- Stop button calls `handleUndeployInstance()` +- **No port conflict dialog** - just fails or auto-remaps silently + +**APIs**: +- `POST /api/instances` - Create instance +- `POST /api/instances/{id}/deploy` - Deploy/start instance +- `POST /api/instances/{id}/undeploy` - Stop instance +- `DELETE /api/instances/{id}` - Delete instance + +--- + +### 3. 
**Deployment** (Database Record) +**Model**: `Deployment` in `ushadow/backend/src/models/deployment.py` +**Manager**: `DeploymentManager` in `ushadow/backend/src/services/deployment_manager.py` + +- Represents a **service deployed to a specific unode** +- Lower-level runtime record tracking container state +- Created when an instance is deployed to a remote unode +- Stores: container_id, container_name, status, access_url, exposed_port +- Has relationship to Instance: `Instance.deployment_id` → `Deployment.id` +- Also has `Deployment.instance_id` → `Instance.id` (bidirectional) + +**Backends**: +- `DockerDeploymentBackend` - Deploys to Docker hosts +- `KubernetesDeploymentBackend` - Deploys to K8s clusters + +**Not visible in UI** - only used internally for tracking remote deployments + +--- + +## Current Problems + +### 1. **Duplicate Port Conflict Logic** +- Services page: Full preflight + dialog + port override flow +- Instances page (local): Port conflict check + auto-remap to settings (no dialog) +- Instances page (remote): Port conflict check + auto-remap to resolved_service.ports (no dialog) +- **Different implementations** in 3 places! + +### 2. **No User Confirmation for Instances** +Services ask user: "Port 8765 is in use, switch to 8766?" +Instances: Just auto-remap (or fail silently before my changes) + +### 3. **Port Override Storage Inconsistency** +- Services: `services.{name}.ports.{ENV_VAR}` (service-level, shared) +- Instances (local): Also `services.{name}.ports.{ENV_VAR}` (conflicts with other instances!) +- Instances (remote): Only in `resolved_service.ports` (temporary, not persisted) + +### 4. **Instance Config Not Used for Ports** +Instances have a `config` field but ports aren't stored there per-instance + +--- + +## Proposed Unified Architecture + +### Goal: Reuse `useServiceStart` Pattern for Instances + +### Backend Changes + +#### 1. 
Add Preflight Check for Instances +```python +# /api/instances/{id}/preflight +# Returns same format as services preflight +{ + "can_start": false, + "port_conflicts": [ + { + "port": 8765, + "env_var": "MEM0_PORT", + "used_by": "Docker: mem0-abc123", + "suggested_port": 8766 + } + ] +} +``` + +#### 2. Add Port Override for Instances +```python +# /api/instances/{id}/port-override +# Sets port in instance.config (per-instance, not service-level) +instance.config.values["MEM0_PORT"] = 8766 +save_instances() +``` + +#### 3. Update Deploy Flow +```python +async def deploy_instance(instance_id): + # 1. Check ports using existing check_port_conflicts() + conflicts = docker_mgr.check_port_conflicts(service_name) + + # 2. If conflicts, return 409 with conflict info + # (Let frontend handle it) + + # 3. Apply instance.config port overrides to env vars + # before starting container +``` + +### Frontend Changes + +#### 1. Create `useInstanceDeploy` Hook +Similar to `useServiceStart` but for instances: +```typescript +export function useInstanceDeploy( + onSuccess?: (instanceId: string) => void, + onError?: (instanceId: string) => void +) { + // Call preflight check + // Show port conflict dialog if needed + // Call port override API + // Retry deploy +} +``` + +#### 2. Update InstancesPage +```typescript +// Replace handleDeployInstance with: +const instanceDeploy = useInstanceDeploy(...) 
+onClick={() => instanceDeploy.startInstance(instance.id)} + +// Render port conflict dialog + +``` + +--- + +## Code Reuse Plan + +### ✅ Already Shared +- `check_port_conflicts()` in `docker_manager.py` +- `PortConflictDialog` component (can reuse for instances) + +### ❌ Currently Duplicated +- Port conflict checking logic (3 implementations) +- Preflight check flow (services only) +- Port override storage (inconsistent) + +### 🎯 Should Be Shared +- Preflight check pattern (services + instances) +- Port conflict resolution dialog (same UI) +- Port override API pattern (adapt for instance config) + +--- + +## Next Steps + +1. **Add instance preflight endpoint** (`/api/instances/{id}/preflight`) +2. **Add instance port override endpoint** (`/api/instances/{id}/port-override`) +3. **Remove auto-remap logic** from deployment_manager.py (let frontend handle) +4. **Create useInstanceDeploy hook** (mirror useServiceStart) +5. **Add PortConflictDialog to InstancesPage** +6. **Store port overrides in instance.config** (not service-level settings) + +This unifies the UX while respecting the difference that instances are per-config vs services are singleton. diff --git a/K8S_ARCHITECTURE.md b/K8S_ARCHITECTURE.md new file mode 100644 index 00000000..c1e75095 --- /dev/null +++ b/K8S_ARCHITECTURE.md @@ -0,0 +1,342 @@ +# Kubernetes Deployment Architecture + +## Overview + +Ushadow supports deploying services to Kubernetes clusters in addition to Docker. This document describes the architecture, components, and deployment flow. 
+ +## Architecture Components + +``` +┌─────────────────────────────────────────────────────────────┐ +│ Frontend (React) │ +│ - KubernetesClustersPage: Cluster management UI │ +│ - DeployToK8sModal: Service deployment UI │ +└─────────────────┬───────────────────────────────────────────┘ + │ HTTPS API +┌─────────────────▼───────────────────────────────────────────┐ +│ Backend (FastAPI) │ +│ - routers/kubernetes.py: K8s API endpoints │ +│ - services/kubernetes_manager.py: K8s operations │ +│ - services/compose_registry.py: Service definitions │ +└─────────────────┬───────────────────────────────────────────┘ + │ Kubernetes Python Client +┌─────────────────▼───────────────────────────────────────────┐ +│ Kubernetes Cluster │ +│ - Namespace: ushadow (default) │ +│ - ConfigMaps: Non-sensitive env vars │ +│ - Secrets: Sensitive env vars │ +│ - Deployments: Service pods │ +│ - Services: Network endpoints │ +└──────────────────────────────────────────────────────────────┘ +``` + +## Deployment Strategy + +### 1. Direct Kubernetes Interface + +We chose **direct K8s API** over deploying unode-manager to K8s: + +**Benefits:** +- Simpler architecture (no additional pods to manage) +- Native K8s features (StatefulSets, Operators, CRDs) +- Better debugging (direct API errors) +- Can add unode-manager-in-k8s later if needed + +**Trade-offs:** +- Different code paths for Docker vs K8s deployments +- K8s-specific manifest generation + +### 2. Manifest Generation Flow + +``` +Service Definition (Compose YAML) + ↓ +ComposeRegistry (parse & register) + ↓ +kubernetes_manager.compile_service_to_k8s() + ↓ +Generated Manifests: + - ConfigMap (non-sensitive env vars) + - Secret (sensitive env vars: keys, passwords, tokens) + - Deployment (pods with envFrom references) + - Service (ClusterIP/NodePort/LoadBalancer) + - Ingress (optional) + ↓ +Apply to Kubernetes via Python client + ↓ +Running Pods +``` + +### 3. 
Environment Variable Handling + +**Separation Strategy:** +- **ConfigMap**: Non-sensitive configuration + - Database URLs (without credentials) + - Service endpoints + - Feature flags + - Public configuration + +- **Secret**: Sensitive data (base64 encoded) + - API keys (`*_API_KEY`, `*_KEY`) + - Passwords (`*_PASSWORD`, `*_PASS`) + - Tokens (`*_TOKEN`) + - Credentials (`*_CREDENTIALS`, `*_SECRET`) + +**Resolution Order:** +1. Manual value (from deployment UI) +2. settingsStore suggestion (from user settings) +3. Infrastructure discovery (from cluster scan) +4. Default value (from compose file) + +**Variable Substitution:** +Docker Compose variables like `${VAR:-default}` are resolved at deployment time: +- Check service env_config +- Check OS environment +- Use default value + +### 4. Port Handling + +**Multiple Ports Support:** +```yaml +# Service has multiple ports +ports: ['3002:3000', '8080:8080'] + +# Generated container ports with unique names +spec: + containers: + - ports: + - name: http + containerPort: 3000 + - name: http-2 + containerPort: 8080 +``` + +**Port Name Requirements:** +- Must be unique within a container +- Must match regex: `[a-z0-9]([-a-z0-9]*[a-z0-9])?` +- Max 15 characters + +### 5. Infrastructure Discovery + +**Scan Process:** +``` +1. User adds K8s cluster (with kubeconfig) +2. User clicks "Scan Infrastructure" +3. Backend scans namespace for services: + - mongo/mongodb + - redis + - postgres/postgresql + - qdrant + - neo4j +4. Results cached in cluster document +5. 
Auto-mapped to service env vars on deployment
+```
+
+**Connection String Formats:**
+- ClusterIP: `{service}.{namespace}.svc.cluster.local:{port}`
+- NodePort: `<node-ip>:{nodePort}`
+- LoadBalancer: `{lb-ip}:{port}`
+
+## Data Model
+
+### KubernetesCluster
+```python
+{
+  "cluster_id": str,       # Unique ID
+  "name": str,             # Display name
+  "context": str,          # Kubeconfig context
+  "server": str,           # API server URL
+  "status": "connected",   # connected | unreachable | unauthorized
+  "version": str,          # K8s version
+  "node_count": int,       # Number of nodes
+  "namespace": str,        # Default namespace
+  "infra_scans": {         # Cached scan results
+    "ushadow": {
+      "mongo": {
+        "found": true,
+        "endpoints": ["mongo.ushadow.svc.cluster.local:27017"]
+      },
+      ...
+    }
+  }
+}
+```
+
+### KubernetesDeploymentSpec
+```python
+{
+  "replicas": int,              # Pod replicas (default: 1)
+  "namespace": str,             # Target namespace
+  "resources": {                # Resource limits
+    "requests": {"cpu": "100m", "memory": "128Mi"},
+    "limits": {"cpu": "500m", "memory": "512Mi"}
+  },
+  "service_type": str,          # ClusterIP | NodePort | LoadBalancer
+  "health_check_path": str,     # Health probe path (None = disabled)
+  "ingress": {                  # Optional ingress config
+    "enabled": bool,
+    "host": str,
+    "path": str,
+    "tls": bool
+  },
+  "annotations": dict,          # Custom annotations
+  "labels": dict                # Custom labels
+}
+```
+
+## API Endpoints
+
+### Cluster Management
+- `POST /api/kubernetes/clusters` - Add cluster
+- `GET /api/kubernetes/clusters` - List clusters
+- `GET /api/kubernetes/clusters/{id}` - Get cluster
+- `DELETE /api/kubernetes/clusters/{id}` - Remove cluster
+
+### Infrastructure
+- `POST /api/kubernetes/{id}/scan-infra` - Scan for infrastructure
+- `GET /api/kubernetes/services/available` - List deployable services
+- `GET /api/kubernetes/services/infra` - List infrastructure services
+
+### Deployment
+- `POST /api/kubernetes/{id}/envmap` - Create ConfigMap/Secret
+- `POST /api/kubernetes/{id}/deploy` - Deploy service
+
+## Security Considerations
+
+### 1. Kubeconfig Storage +- Encrypted at rest using Fernet (derived from app secret key) +- Stored as `.enc` files in `/config/kubeconfigs/` +- Never sent to frontend +- Temporary files deleted after use + +### 2. RBAC Requirements +Minimum required permissions for service account: +```yaml +rules: +- apiGroups: [""] + resources: ["namespaces", "configmaps", "secrets", "services"] + verbs: ["get", "list", "create", "update", "patch"] +- apiGroups: ["apps"] + resources: ["deployments"] + verbs: ["get", "list", "create", "update", "patch"] +- apiGroups: ["networking.k8s.io"] + resources: ["ingresses"] + verbs: ["get", "list", "create", "update", "patch"] +``` + +### 3. Secret Management +- Sensitive env vars automatically separated +- Base64 encoded in Kubernetes Secrets +- Never logged in plain text +- Accessed via envFrom in pods + +## Debugging + +### 1. Generated Manifests +All manifests saved to: `/tmp/k8s-manifests/{cluster_id}/{namespace}/` + +```bash +docker exec ushadow-backend ls /tmp/k8s-manifests/ +docker exec ushadow-backend cat /tmp/k8s-manifests/{cluster-id}/{namespace}/mem0-ui-deployment.yaml +``` + +### 2. Logs +```bash +# Backend logs with full stack traces +docker logs ushadow-backend | grep -A 20 "deployment of" + +# K8s deployment status +kubectl get deployments,pods,services -n ushadow +kubectl describe deployment mem0-ui -n ushadow +kubectl logs -f deployment/mem0-ui -n ushadow +``` + +### 3. Common Issues + +**Image pull errors:** +```bash +kubectl describe pod {pod-name} -n ushadow | grep -A 5 "Events:" +``` + +**ConfigMap/Secret issues:** +```bash +kubectl get configmaps,secrets -n ushadow +kubectl describe configmap mem0-ui-config -n ushadow +``` + +**Port conflicts:** +```bash +# Check generated manifest +docker exec ushadow-backend cat /tmp/k8s-manifests/{cluster-id}/{namespace}/mem0-ui-deployment.yaml | grep -A 10 "ports:" +``` + +## Future Enhancements + +### 1. 
StatefulSets for Databases +Current: All services use Deployments +Future: Database services use StatefulSets with PVCs + +### 2. Helm Chart Generation +Current: Direct manifest application +Future: Optional Helm chart generation for complex services + +### 3. GitOps Integration +Current: Direct deployment +Future: ArgoCD/Flux integration with git-based workflows + +### 4. Multi-Cluster Deployments +Current: Single cluster per deployment +Future: Deploy to multiple clusters simultaneously + +### 5. Resource Autoscaling +Current: Fixed replica count +Future: HPA (Horizontal Pod Autoscaler) based on metrics + +## Troubleshooting Guide + +### Issue: "Duplicate port name" +**Symptom:** `spec.template.spec.containers[0].ports[1].name: Duplicate value: "http"` + +**Cause:** Multiple ports with same name in container spec + +**Fix:** Check generated manifest - each port must have unique name + +### Issue: "Image pull failed" +**Symptom:** `ErrImagePull` or `ImagePullBackOff` + +**Causes:** +1. Image doesn't exist +2. Registry authentication required +3. 
Network connectivity issues + +**Fix:** +```bash +# Check image +docker pull {image-name} + +# Add image pull secret +kubectl create secret docker-registry regcred \ + --docker-server={registry} \ + --docker-username={user} \ + --docker-password={password} +``` + +### Issue: "CrashLoopBackOff" +**Symptom:** Pod keeps restarting + +**Debug:** +```bash +kubectl logs {pod-name} -n ushadow --previous +kubectl describe pod {pod-name} -n ushadow +``` + +### Issue: "Liveness probe failed" +**Symptom:** Pod killed by liveness probe + +**Fix:** Set `health_check_path: null` in deployment spec to disable health checks for services without health endpoints + +## References + +- [Kubernetes Python Client](https://github.com/kubernetes-client/python) +- [Kubernetes API Reference](https://kubernetes.io/docs/reference/kubernetes-api/) +- [KUBERNETES_INTEGRATION.md](./KUBERNETES_INTEGRATION.md) - Implementation details diff --git a/KUBERNETES.md b/KUBERNETES.md new file mode 100644 index 00000000..4bf17403 --- /dev/null +++ b/KUBERNETES.md @@ -0,0 +1,513 @@ +# Ushadow Kubernetes Deployment Guide + +This guide covers deploying Ushadow to Kubernetes using automatically generated manifests from Docker Compose files. + +## Overview + +The deployment process uses **kompose** to convert your existing Docker Compose files into Kubernetes manifests, then applies production-ready tweaks for a robust deployment. + +## Prerequisites + +### Required Tools + +1. **kubectl** - Kubernetes CLI + ```bash + # macOS + brew install kubectl + + # Linux + curl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl" + chmod +x kubectl + sudo mv kubectl /usr/local/bin/ + + # Windows + choco install kubernetes-cli + ``` + +2. 
**kompose** - Compose to Kubernetes converter + ```bash + # macOS + brew install kompose + + # Linux + curl -L https://github.com/kubernetes/kompose/releases/download/v1.34.0/kompose-linux-amd64 -o kompose + chmod +x kompose + sudo mv kompose /usr/local/bin/ + + # Windows + choco install kubernetes-kompose + ``` + +3. **A Kubernetes Cluster** - One of: + - Minikube (local development) + - Docker Desktop with Kubernetes (local development) + - Cloud provider (EKS, GKE, AKS) + - On-premises cluster + +### Verify Cluster Connection + +```bash +kubectl cluster-info +kubectl get nodes +``` + +## Quick Start + +### 1. Generate Kubernetes Manifests + +Run the deployment script to convert Docker Compose files to Kubernetes manifests: + +```bash +./deploy.sh +``` + +This will: +- ✅ Check for kompose and kubectl +- ✅ Convert infrastructure services (MongoDB, Redis, Qdrant, etc.) +- ✅ Convert application services (backend, frontend) +- ✅ Generate namespace, ConfigMaps, and Secret templates +- ✅ Create production-ready examples and guides +- ✅ Generate kustomization file + +### 2. Review Generated Manifests + +```bash +ls -la k8s/ +``` + +Directory structure: +``` +k8s/ +├── namespace.yaml # Namespace definition +├── configmap.yaml # Configuration data +├── secret.yaml # Secrets template (NEEDS EDITING) +├── kustomization.yaml # Kustomize config +├── infra/ # Infrastructure services +│ ├── mongo-*.yaml +│ ├── redis-*.yaml +│ ├── qdrant-*.yaml +│ └── postgres-*.yaml +├── base/ # Application services +│ ├── backend-*.yaml +│ └── webui-*.yaml +└── tweaks/ # Examples and guides + ├── README.md + ├── ingress-example.yaml + └── mongo-statefulset-example.yaml +``` + +### 3. Update Secrets + +**IMPORTANT:** Edit `k8s/secret.yaml` with your actual secrets: + +```bash +# Edit the secret file +vim k8s/secret.yaml + +# Or create from .env file +./scripts/k8s-helpers.sh create-secret +``` + +### 4. 
Deploy to Kubernetes + +#### Option A: Deploy Everything with Kustomize (Recommended) + +```bash +kubectl apply -k k8s/ +``` + +#### Option B: Deploy Step-by-Step + +```bash +# 1. Create namespace +kubectl apply -f k8s/namespace.yaml + +# 2. Deploy infrastructure first +kubectl apply -f k8s/infra/ + +# 3. Wait for infrastructure to be ready +kubectl wait --for=condition=ready pod -l app=mongo -n ushadow --timeout=300s +kubectl wait --for=condition=ready pod -l app=redis -n ushadow --timeout=300s + +# 4. Deploy application +kubectl apply -f k8s/configmap.yaml +kubectl apply -f k8s/secret.yaml +kubectl apply -f k8s/base/ + +# 5. Check status +kubectl get all -n ushadow +``` + +#### Option C: Use Helper Script + +```bash +# Deploy infrastructure +./scripts/k8s-helpers.sh deploy-infra + +# Deploy application +./scripts/k8s-helpers.sh deploy-app + +# Or deploy everything +./scripts/k8s-helpers.sh deploy-all +``` + +## Management Operations + +### Get Status + +```bash +# Get all resources +./scripts/k8s-helpers.sh status + +# Or manually +kubectl get all -n ushadow +kubectl get pvc -n ushadow +``` + +### View Logs + +```bash +# Using helper script +./scripts/k8s-helpers.sh logs backend +./scripts/k8s-helpers.sh logs webui + +# Or manually +kubectl logs -n ushadow -l app=backend --tail=100 -f +kubectl logs -n ushadow -l app=webui --tail=100 -f +``` + +### Port Forwarding (Local Access) + +```bash +# Forward backend API +./scripts/k8s-helpers.sh port-forward backend 8000:8000 + +# Forward frontend +./scripts/k8s-helpers.sh port-forward webui 3000:80 + +# Or manually +kubectl port-forward -n ushadow svc/backend 8000:8000 +kubectl port-forward -n ushadow svc/webui 3000:80 +``` + +Then access: +- Frontend: http://localhost:3000 +- Backend API: http://localhost:8000 +- API Docs: http://localhost:8000/docs + +### Scale Services + +```bash +# Scale backend to 3 replicas +./scripts/k8s-helpers.sh scale backend 3 + +# Scale frontend to 2 replicas +./scripts/k8s-helpers.sh scale 
webui 2 + +# Or manually +kubectl scale deployment/backend --replicas=3 -n ushadow +``` + +### Restart Services + +```bash +# Restart backend +./scripts/k8s-helpers.sh restart backend + +# Or manually +kubectl rollout restart deployment/backend -n ushadow +``` + +### Execute Commands in Pods + +```bash +# Get a shell in backend pod +./scripts/k8s-helpers.sh exec backend bash + +# Run a command +./scripts/k8s-helpers.sh exec backend env + +# Or manually +kubectl exec -it -n ushadow deployment/backend -- bash +``` + +## Production-Ready Adjustments + +The auto-generated manifests need these adjustments for production. See `k8s/tweaks/README.md` for detailed instructions. + +### 1. StatefulSets for Databases + +Convert database Deployments to StatefulSets for stable storage: + +```bash +# Example provided in k8s/tweaks/mongo-statefulset-example.yaml +kubectl apply -f k8s/tweaks/mongo-statefulset-example.yaml +``` + +### 2. Add Resource Limits + +Edit deployments to add resource constraints: + +```yaml +resources: + requests: + memory: "256Mi" + cpu: "250m" + limits: + memory: "512Mi" + cpu: "500m" +``` + +### 3. Configure Ingress + +Edit and apply the Ingress example: + +```bash +# Edit with your domain +vim k8s/tweaks/ingress-example.yaml + +# Apply +kubectl apply -f k8s/tweaks/ingress-example.yaml +``` + +### 4. Add Persistent Storage + +For production, configure proper StorageClass and PVCs: + +```yaml +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: mongo-data + namespace: ushadow +spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 20Gi + storageClassName: standard # Use your cloud provider's storage class +``` + +### 5. 
Configure Monitoring
+
+Add Prometheus annotations to services:
+
+```yaml
+annotations:
+  prometheus.io/scrape: "true"
+  prometheus.io/port: "8000"
+  prometheus.io/path: "/metrics"
+```
+
+## Environment-Specific Deployments
+
+### Development Environment
+
+```bash
+# Use namespace for dev
+export NAMESPACE=ushadow-dev
+./deploy.sh
+kubectl apply -k k8s/
+```
+
+### Production Environment
+
+```bash
+# Use namespace for prod
+export NAMESPACE=ushadow-prod
+./deploy.sh
+
+# Review carefully before applying
+kubectl apply -k k8s/ --dry-run=client
+
+# Apply
+kubectl apply -k k8s/
+```
+
+## Troubleshooting
+
+### Pods Not Starting
+
+```bash
+# Check pod status
+kubectl get pods -n ushadow
+
+# Describe problematic pod
+kubectl describe pod <pod-name> -n ushadow
+
+# Check logs
+kubectl logs <pod-name> -n ushadow
+```
+
+### Storage Issues
+
+```bash
+# Check PVCs
+kubectl get pvc -n ushadow
+
+# Describe PVC
+kubectl describe pvc <pvc-name> -n ushadow
+
+# Check storage class
+kubectl get storageclass
+```
+
+### Network Issues
+
+```bash
+# Check services
+kubectl get svc -n ushadow
+
+# Test connectivity from a pod
+kubectl run -it --rm debug --image=busybox -n ushadow -- sh
+# Inside the pod:
+wget -O- http://backend:8000/health
+```
+
+### Image Pull Issues
+
+```bash
+# Check events
+kubectl get events -n ushadow --sort-by='.lastTimestamp'
+
+# If using private registry, create image pull secret
+kubectl create secret docker-registry regcred \
+  --docker-server=<registry> \
+  --docker-username=<username> \
+  --docker-password=<password> \
+  -n ushadow
+```
+
+## Cleanup
+
+### Delete All Resources
+
+```bash
+# Using helper script (prompts for confirmation)
+./scripts/k8s-helpers.sh delete-all
+
+# Or manually
+kubectl delete namespace ushadow
+
+# Delete with kustomize
+kubectl delete -k k8s/
+```
+
+### Regenerate Manifests
+
+```bash
+# Clean and regenerate
+rm -rf k8s/
+./deploy.sh
+```
+
+## Advanced Configuration
+
+### Using Kustomize Overlays
+
+For different environments, use overlays:
+
+```bash
+# Create 
overlay for production +mkdir -p k8s/overlays/prod + +cat > k8s/overlays/prod/kustomization.yaml <:{nodePort}` + - LoadBalancer: `{lb-ip}:{port}` + +##### `update_cluster_infra_scan(cluster_id, namespace, scan_results)` +Updates cached infrastructure scan results for a cluster namespace. + +- **Persistence:** Stores scan results in cluster's `infra_scans` field +- **Per-namespace caching:** Results keyed by namespace for multi-namespace support +- **Automatic:** Called automatically by scan endpoint +- **Returns:** True if update successful + +```python +results = await k8s_manager.scan_cluster_for_infra_services("cluster-123", "ushadow") +# Returns: { +# "mongo": { +# "found": True, +# "endpoints": ["mongo.ushadow.svc.cluster.local:27017"], +# "type": "mongo", +# "default_port": 27017 +# }, +# ... +# } +``` + +##### `get_or_create_envmap(cluster_id, namespace, service_name, env_vars)` +Creates or updates ConfigMap and Secret for service environment variables. + +- **Automatic namespace creation:** Creates namespace if it doesn't exist +- **Automatic separation:** Sensitive data → Secret, non-sensitive → ConfigMap +- **Sensitive patterns:** SECRET, KEY, PASSWORD, TOKEN, PASS, CREDENTIALS +- **Base64 encoding:** Automatic for Secret data +- **Idempotent:** Creates or patches if already exists +- **Returns:** Tuple of (configmap_name, secret_name) + +```python +configmap, secret = await k8s_manager.get_or_create_envmap( + "cluster-123", + "ushadow", + "my-service", + { + "DATABASE_URL": "postgres://...", # → ConfigMap + "API_KEY": "secret-key-123", # → Secret + } +) +# Returns: ("my-service-config", "my-service-secrets") +``` + +#### 2. 
Enhanced `kubernetes.py` Router +**Location:** `ushadow/backend/src/routers/kubernetes.py` + +**New Endpoints:** + +| Endpoint | Method | Description | +|----------|--------|-------------| +| `/api/kubernetes/services/available` | GET | Get all services from compose registry | +| `/api/kubernetes/services/infra` | GET | Get infrastructure services only | +| `/api/kubernetes/{cluster_id}/scan-infra` | POST | Scan cluster for existing infrastructure | +| `/api/kubernetes/{cluster_id}/envmap` | POST | Create/update ConfigMaps and Secrets | +| `/api/kubernetes/{cluster_id}/deploy` | POST | Deploy a service to cluster | + +**Request/Response Models:** +```python +class ScanInfraRequest(BaseModel): + namespace: str = "ushadow" + +class CreateEnvmapRequest(BaseModel): + service_name: str + namespace: str = "ushadow" + env_vars: Dict[str, str] + +class DeployServiceRequest(BaseModel): + service_id: str + namespace: str = "ushadow" + k8s_spec: Optional[KubernetesDeploymentSpec] = None +``` + +#### 3. Fixed `deployment_manager.py` +**Location:** `ushadow/backend/src/services/deployment_manager.py` + +**Issue:** MongoDB index conflict when upgrading from old version +**Solution:** Graceful handling of IndexKeySpecsConflict errors + +```python +# Detects conflicting index specs +# Drops old index +# Creates new index +# Continues initialization +``` + +### Frontend Components + +#### 4. 
Enhanced `KubernetesClustersPage.tsx` +**Location:** `ushadow/frontend/src/pages/KubernetesClustersPage.tsx` + +**New Features:** + +##### Infrastructure Scanning +- **"Scan Infrastructure" button** on each cluster card +- **Disabled** when cluster status is not "connected" +- **Loading state** with spinner during scan +- **Results modal** showing found/not-found services +- **Connection endpoints** displayed for discovered infrastructure +- **Persistent results** stored per cluster + +##### UI Improvements +- Changed default namespace to `ushadow` (recommended) +- Infrastructure status badge on cluster cards +- View scan results button for rescanned clusters +- Beautiful scan results modal with color-coded services +- Help text explaining next steps + +**Test IDs:** +- `kubernetes-page` +- `scan-infra-{clusterId}` +- `view-scan-results-{clusterId}` +- `infra-scan-results-modal` +- `close-scan-results` +- `remove-cluster-{clusterId}` + +#### 5. Enhanced `api.ts` +**Location:** `ushadow/frontend/src/services/api.ts` + +**New API Methods:** +```typescript +export const kubernetesApi = { + // Existing methods + addCluster, listClusters, getCluster, removeCluster, + + // New methods + getAvailableServices: () => api.get('/api/kubernetes/services/available'), + getInfraServices: () => api.get('/api/kubernetes/services/infra'), + scanInfraServices: (clusterId, namespace = 'ushadow') => + api.post(`/api/kubernetes/${clusterId}/scan-infra`, { namespace }), + createEnvmap: (clusterId, data) => + api.post(`/api/kubernetes/${clusterId}/envmap`, data), + deployService: (clusterId, data) => + api.post(`/api/kubernetes/${clusterId}/deploy`, data), +} +``` + +### Deployment Tools + +#### 6. 
`deploy.sh` Script
+**Location:** `./deploy.sh`
+
+**Features:**
+- Automated kompose conversion of Docker Compose → K8s manifests
+- Handles infrastructure and application services separately
+- Generates ConfigMaps, Secrets, Deployments, Services
+- Creates production-ready examples and tweaking guides
+- Colorized output with progress indicators
+
+**Usage:**
+```bash
+./deploy.sh
+
+# Generates:
+# k8s/
+# ├── namespace.yaml
+# ├── configmap.yaml
+# ├── secret.yaml
+# ├── kustomization.yaml
+# ├── infra/     # MongoDB, Redis, Qdrant, Postgres
+# ├── base/      # Backend, Frontend
+# └── tweaks/    # Production examples and guides
+```
+
+#### 7. `scripts/k8s-helpers.sh`
+**Location:** `./scripts/k8s-helpers.sh`
+
+**Helper Commands:**
+```bash
+./scripts/k8s-helpers.sh deploy-infra                    # Deploy infrastructure only
+./scripts/k8s-helpers.sh deploy-app                      # Deploy application only
+./scripts/k8s-helpers.sh deploy-all                      # Deploy everything
+./scripts/k8s-helpers.sh status                          # Get cluster status
+./scripts/k8s-helpers.sh logs <service>                  # Tail service logs
+./scripts/k8s-helpers.sh restart <service>               # Restart a service
+./scripts/k8s-helpers.sh scale <service> <replicas>      # Scale replicas
+./scripts/k8s-helpers.sh port-forward <service> <ports>  # Local access
+./scripts/k8s-helpers.sh exec <service> <command>        # Run command in pod
+```
+
+## User Workflow
+
+### 1. Add Cluster
+```
+1. Click "Add Cluster"
+2. Upload kubeconfig or paste YAML
+3. Specify cluster name and namespace (default: ushadow)
+4. System validates connectivity
+5. Cluster appears in list with status
+```
+
+### 2. Scan Infrastructure
+```
+1. Click "Scan Infrastructure" on cluster card
+2. System scans namespace for infrastructure services
+3. Results modal shows:
+   - Found services (with connection endpoints)
+   - Not found services
+   - Next steps guidance
+4. Results persist and show badge on cluster card
+```
+
+### 3. 
Use or Deploy Decision +After scanning, you can: +- **Use existing infrastructure:** Configure services to point to discovered endpoints +- **Deploy new infrastructure:** Use unified deployment UI (to be built) + +## Configuration & Setup + +### Prerequisites +```bash +# macOS +brew install kompose + +# Linux +curl -L https://github.com/kubernetes/kompose/releases/download/v1.34.0/kompose-linux-amd64 -o kompose +chmod +x kompose +sudo mv kompose /usr/local/bin/ + +# Windows +choco install kubernetes-kompose +``` + +### Backend Dependencies +Already included in `pyproject.toml`: +```toml +dependencies = [ + "kubernetes>=31.0.0", + ... +] +``` + +Install with: +```bash +cd ushadow/backend +uv sync +``` + +### Generate Manifests +```bash +./deploy.sh +``` + +### Deploy to Kubernetes +```bash +# Using kustomize (recommended) +kubectl apply -k k8s/ + +# Or step-by-step +kubectl apply -f k8s/namespace.yaml +kubectl apply -f k8s/infra/ +kubectl apply -f k8s/base/ +``` + +## API Examples + +### Scan Cluster for Infrastructure +```bash +curl -X POST http://localhost:8000/api/kubernetes/{cluster_id}/scan-infra \ + -H "Authorization: Bearer $TOKEN" \ + -H "Content-Type: application/json" \ + -d '{"namespace": "ushadow"}' +``` + +**Response:** +```json +{ + "cluster_id": "abc123", + "namespace": "ushadow", + "infra_services": { + "mongo": { + "found": true, + "endpoints": ["mongo.ushadow.svc.cluster.local:27017"], + "type": "mongo", + "default_port": 27017 + }, + "redis": { + "found": true, + "endpoints": ["redis.ushadow.svc.cluster.local:6379"], + "type": "redis", + "default_port": 6379 + }, + "postgres": { + "found": false, + "endpoints": [], + "type": "postgres", + "default_port": 5432 + } + } +} +``` + +### Create Environment Map +```bash +curl -X POST http://localhost:8000/api/kubernetes/{cluster_id}/envmap \ + -H "Authorization: Bearer $TOKEN" \ + -H "Content-Type: application/json" \ + -d '{ + "service_name": "my-service", + "namespace": "ushadow", + "env_vars": { + 
"DATABASE_URL": "postgres://...", + "API_KEY": "secret-key-123" + } + }' +``` + +**Response:** +```json +{ + "success": true, + "configmap": "my-service-config", + "secret": "my-service-secrets", + "namespace": "ushadow" +} +``` + +### Deploy Service +```bash +curl -X POST http://localhost:8000/api/kubernetes/{cluster_id}/deploy \ + -H "Authorization: Bearer $TOKEN" \ + -H "Content-Type: application/json" \ + -d '{ + "service_id": "parakeet-compose:parakeet", + "namespace": "ushadow" + }' +``` + +**Response:** +```json +{ + "success": true, + "message": "Deployed to ushadow/parakeet", + "service_id": "parakeet-compose:parakeet", + "namespace": "ushadow" +} +``` + +## Next Steps + +### Unified Deployment UI (To Be Implemented) +The Kubernetes integration is ready for the unified deployment UI, which will: + +1. **Select Deployment Target:** Docker or Kubernetes +2. **Choose Service:** From compose registry +3. **Configure Environment:** Map env vars to settings or K8s discovered services +4. **Deploy:** Execute deployment with progress tracking + +**Separation of Concerns:** +- **Kubernetes Page:** Cluster configuration, infrastructure readiness +- **Deployment UI:** Unified interface for deploying to Docker or K8s + +## Troubleshooting + +### Index Conflict Error +**Error:** `IndexKeySpecsConflict: An existing index has the same name as the requested index` + +**Solution:** Fixed in `deployment_manager.py` - automatically drops and recreates conflicting indexes. + +### Kubernetes Package Not Found +**Error:** `ModuleNotFoundError: No module named 'kubernetes'` + +**Solution:** +```bash +cd ushadow/backend +uv sync +``` + +### Kompose Not Found +**Solution:** +```bash +brew install kompose # macOS +# or see Prerequisites section above +``` + +### Cluster Connection Failed +**Check:** +1. Kubeconfig is valid: `kubectl cluster-info` +2. Network connectivity to cluster +3. Correct namespace exists: `kubectl get namespaces` +4. 
RBAC permissions for service account + +## Files Created/Modified + +### Created +- ✅ `deploy.sh` - Kompose conversion script +- ✅ `scripts/k8s-helpers.sh` - K8s helper commands +- ✅ `KUBERNETES.md` - Kubernetes documentation +- ✅ `KUBERNETES_INTEGRATION.md` - This file + +### Modified Backend +- ✅ `ushadow/backend/src/services/kubernetes_manager.py` - Added scanning and envmap methods +- ✅ `ushadow/backend/src/routers/kubernetes.py` - Added new API endpoints +- ✅ `ushadow/backend/src/services/deployment_manager.py` - Fixed index conflict + +### Modified Frontend +- ✅ `ushadow/frontend/src/pages/KubernetesClustersPage.tsx` - Added scanning UI +- ✅ `ushadow/frontend/src/services/api.ts` - Added new API methods + +## Testing + +### Manual Testing Checklist +- [ ] Add a Kubernetes cluster via UI +- [ ] Scan infrastructure on connected cluster +- [ ] View scan results modal +- [ ] Verify found services show connection endpoints +- [ ] Remove cluster and verify scan results are cleared +- [ ] Generate manifests with `./deploy.sh` +- [ ] Deploy infrastructure to K8s cluster +- [ ] Rescan and verify services are found + +### API Testing +```bash +# Test scan endpoint +curl -X POST http://localhost:8000/api/kubernetes/{cluster_id}/scan-infra \ + -H "Authorization: Bearer $TOKEN" \ + -H "Content-Type: application/json" \ + -d '{"namespace": "ushadow"}' + +# Test envmap endpoint +curl -X POST http://localhost:8000/api/kubernetes/{cluster_id}/envmap \ + -H "Authorization: Bearer $TOKEN" \ + -H "Content-Type: application/json" \ + -d '{ + "service_name": "test", + "namespace": "ushadow", + "env_vars": {"TEST_VAR": "value"} + }' +``` + +## Recent Fixes and Improvements + +### Label Sanitization (Issue #2) +**Problem:** Kubernetes labels must match regex `(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])?` - colons are invalid. +Service IDs like `openmemory-compose:mem0-ui` were causing deployment failures. 
+ +**Solution:** +- Added label sanitization in `compile_service_to_k8s()` +- Converts `service_id` to `safe_service_id` by replacing `:` and `/` with `-` +- Example: `openmemory-compose:mem0-ui` → `openmemory-compose-mem0-ui` + +### Infrastructure Scan Caching (Issue #1) +**Problem:** Infrastructure scan results weren't retained, requiring re-scanning each time. + +**Solution:** +- Added `infra_scans` field to `KubernetesCluster` model +- Results cached per namespace: `infra_scans: {namespace: scan_results}` +- Scan endpoint now persists results automatically via `update_cluster_infra_scan()` +- Frontend can retrieve cached results from cluster document + +### Automatic Namespace Creation +**Enhancement:** Both `get_or_create_envmap()` and `deploy_to_kubernetes()` now automatically create namespaces if they don't exist. + +### Port Parsing Robustness +**Fix:** Added defensive handling for None/invalid port values to prevent deployment errors. + +### Image Variable Resolution (Issue #3) +**Problem:** Docker Compose image names with environment variables like `${VAR:-default}` weren't being resolved, causing K8s deployment failures. + +**Solution:** +- Added `_resolve_image_variables()` method to resolve Docker Compose variable syntax +- Handles `${VAR}`, `${VAR:-default}`, and `${VAR-default}` patterns +- Looks up variables in: environment dict → OS environment → default value +- Example: `ghcr.io/ushadow-io/u-mem0-ui:${OPENMEMORY_IMAGE_TAG:-latest}` → `ghcr.io/ushadow-io/u-mem0-ui:latest` + +### Manifest Storage and Debugging +**Enhancement:** Generated K8s manifests are now saved for debugging. 
+ +**Features:** +- Manifests saved to `/tmp/k8s-manifests/{cluster_id}/{namespace}/` +- Each manifest saved as YAML: `{service-name}-{manifest-type}.yaml` +- Logged at INFO level for tracking +- Logged at DEBUG level with full YAML content +- Helps troubleshooting deployment issues + +**Location:** Inside backend container at `/tmp/k8s-manifests/` + +### Port Parsing Fix (Issue #4) +**Problem:** UI containers were getting default port 8000 instead of their actual container port (e.g., 3000). + +**Root Cause:** Key mismatch - compose parser returns ports as `{"host": "3002", "container": "3000"}` but kubernetes deployment code was looking for `{"published": ..., "target": ...}`. + +**Solution:** +- Updated port extraction in kubernetes router to check both key formats +- Now correctly extracts container port from parsed compose data +- Example: `"${OPENMEMORY_UI_PORT:-3002}:3000"` → correctly parsed as container port 3000 + +### Health Check Configuration (Issue #5) +**Problem:** Liveness/readiness probes were hardcoded to `/health` endpoint, causing failures for frontend apps that don't have health endpoints. 
+ +**Solution:** +- Added `health_check_path` field to `KubernetesDeploymentSpec` +- Health checks are now **disabled by default** (`health_check_path = None`) +- Can be enabled by setting `health_check_path` in deployment spec +- Example: Set `health_check_path = "/health"` for backend services that support it + +**Default Behavior:** +- Frontend apps: No health checks (default) +- Backend apps: Can enable via deployment UI or API + +### Port Variable Resolution (Issue #6) +**Problem:** Port definitions with variables like `${OPENMEMORY_UI_PORT:-3002}:3000` weren't being resolved, causing parsing errors: `invalid literal for int() with base 10: '-3002}:3000'` + +**Solution:** +- Added `resolve_port_var()` helper function in kubernetes router +- Resolves variables in port strings before passing to kubernetes_manager +- Lookup order: service env_config → OS environment → default value +- Example: `${OPENMEMORY_UI_PORT:-3002}:3000` → `3002:3000` → container port 3000 + +### Enhanced Deployment Logging (Issue #7) +**Problem:** Deployments returning 200 OK but unclear what resources were created or if deployment actually succeeded. + +**Solution:** +- Added detailed logging at deployment start showing service name, image, and ports +- Enhanced success message to list all created resources +- Added detailed error logging with response body and stack traces +- Manifests saved to `/tmp/k8s-manifests/{cluster_id}/{namespace}/` for inspection + +**Log Examples:** +``` +Starting deployment of mem0-ui to cluster abc123, namespace ushadow +Service definition: image=ghcr.io/ushadow-io/u-mem0-ui:latest, ports=['3000'] +Successfully deployed mem0-ui to ushadow. Resources: ConfigMap/mem0-ui-config, Deployment/mem0-ui, Service/mem0-ui +``` + +### Catch-All Exception Handler (Issue #8) +**Problem:** 500 errors weren't showing actual error details in logs - only "Internal Server Error" with no stack trace or error message. 
+ +**Root Cause:** Exception handlers only caught specific types (database, network, HTTP exceptions) but not general Python exceptions. + +**Solution:** +- Added catch-all `@app.exception_handler(Exception)` to middleware +- Logs full error message and stack trace for all unhandled exceptions +- Returns structured JSON response with error details +- Example response: + ```json + { + "detail": "invalid literal for int() with base 10: '-3002}:3000'", + "error_type": "ValueError", + "error_category": "internal_error" + } + ``` + +**Log Example:** +``` +ERROR: Unhandled exception in POST /api/kubernetes/abc123/deploy: ValueError: invalid literal for int() with base 10: '-3002}:3000' +ERROR: Stack trace: +Traceback (most recent call last): + ...full stack trace... +``` + +### Multiple Ports Support (Issue #9) +**Problem:** Services with multiple ports caused deployment failure: `Duplicate value: "http"` in port names. + +**Root Cause:** All container ports were being named "http", violating Kubernetes' requirement for unique port names within a container. + +**Solution:** +- Refactored port parsing to support multiple ports +- Generate unique port names: `http`, `http-2`, `http-3`, etc. +- Create Service ports for each container port +- Each port gets a unique name and correct targetPort mapping + +**Example:** +```yaml +# Service with multiple ports +ports: + - name: http # Port 3000 + containerPort: 3000 + - name: http-2 # Port 8080 + containerPort: 8080 +``` + +**Kubernetes Service:** +```yaml +ports: + - port: 3000 + targetPort: http + name: http + - port: 8080 + targetPort: http-2 + name: http-2 +``` + +## Production Recommendations + +Before deploying to production: + +1. **Review generated manifests** in `k8s/tweaks/README.md` +2. **Convert databases to StatefulSets** (see examples in `k8s/tweaks/`) +3. **Add PersistentVolumeClaims** for data persistence +4. **Configure Ingress** for external access +5. **Set resource limits** (CPU/memory) on all pods +6. 
**Add NetworkPolicies** for security +7. **Use external secret management** (Sealed Secrets, External Secrets Operator) +8. **Set up monitoring and alerting** (Prometheus, Grafana) + +## Additional Resources + +- [Kompose Documentation](https://kompose.io/) +- [Kubernetes Documentation](https://kubernetes.io/docs/) +- [Kubernetes Python Client](https://github.com/kubernetes-client/python) +- [Kustomize Documentation](https://kustomize.io/) diff --git a/REFACTORING_PLAN.md b/REFACTORING_PLAN.md new file mode 100644 index 00000000..a325ef70 --- /dev/null +++ b/REFACTORING_PLAN.md @@ -0,0 +1,220 @@ +# Refactoring Plan: Instance → ServiceConfiguration + +## Goal +Rename "Instance" to "ServiceConfiguration" to better reflect that this represents a configured service (either cloud credentials or deployable service with config). + +## New Naming Convention + +| Current | New | Purpose | +|---------|-----|---------| +| `Template` | `Template` | Abstract service or provider definition (keep) | +| `Instance` | `ServiceConfiguration` | Template + Config + DeploymentTarget | +| `InstanceManager` | `ConfigurationManager` | Manages service configurations | +| `InstanceStatus` | `ConfigurationStatus` | Status enum | +| `InstanceConfig` | `ServiceConfig` | Configuration values | +| `InstanceOutputs` | `ConfigurationOutputs` | Runtime outputs | +| `InstanceCreate` | `ConfigurationCreate` | API request model | +| `InstanceUpdate` | `ConfigurationUpdate` | API request model | +| `InstanceSummary` | `ConfigurationSummary` | API response model | +| `instances.yaml` | `configurations.yaml` | YAML storage file | + +## Status Values Semantic + +### Cloud Providers +- `configured` - Has valid credentials, ready to use +- `unconfigured` - Missing required credentials + +### Deployable Services (ComposeService, local providers) +- `pending` - Created but not yet started +- `deploying` - Currently starting +- `running` - Running and accessible +- `stopped` - Stopped gracefully +- `error` - 
Failed to deploy or crashed + +## Files to Update + +### Backend Models (Priority 1) + +1. **`ushadow/backend/src/models/instance.py`** → Rename to `configuration.py` + - `Instance` → `ServiceConfiguration` + - `InstanceStatus` → `ConfigurationStatus` + - `InstanceConfig` → `ServiceConfig` + - `InstanceOutputs` → `ConfigurationOutputs` + - `InstanceCreate` → `ConfigurationCreate` + - `InstanceUpdate` → `ConfigurationUpdate` + - `InstanceSummary` → `ConfigurationSummary` + - `Wiring` stays the same + - Update all docstrings + +2. **`ushadow/backend/src/services/instance_manager.py`** → Rename to `configuration_manager.py` + - `InstanceManager` → `ConfigurationManager` + - `_instances` → `_configurations` + - `instances.yaml` → `configurations.yaml` + - All method names: `create_instance` → `create_configuration`, etc. + - Update all docstrings + +### Backend Services (Priority 1) + +3. **`ushadow/backend/src/services/capability_resolver.py`** + - Update all references to `Instance` → `ServiceConfiguration` + - `consumer_instance_id` → `consumer_config_id` + - `provider_instance` → `provider_config` + +4. **`ushadow/backend/src/services/deployment_manager.py`** + - `instance_id` parameter → `config_id` + - Update docstrings + +5. **`ushadow/backend/src/services/service_orchestrator.py`** + - `instance_id` parameter → `config_id` + +### API Routes (Priority 1) + +6. **`ushadow/backend/src/routers/instances.py`** → Rename to `configurations.py` + - Update all endpoint paths: + - `/api/instances` → `/api/configurations` + - `/api/instances/{id}` → `/api/configurations/{id}` + - Add backwards-compatibility aliases (optional) + - Update all request/response models + - Update docstrings + +7. **`ushadow/backend/src/main.py`** + - Update router import and include + +### Frontend Types (Priority 2) + +8. 
**`ushadow/frontend/src/services/api.ts`** + - `Instance` → `ServiceConfiguration` + - `InstanceSummary` → `ConfigurationSummary` + - `InstanceCreateRequest` → `ConfigurationCreateRequest` + - `instancesApi` → `configurationsApi` (or keep as `configurationsApi` but map endpoints) + - Update endpoint URLs + +### Frontend Pages (Priority 2) + +9. **`ushadow/frontend/src/pages/InstancesPage.tsx`** → Rename to `ConfigurationsPage.tsx` + - Update component name + - Update all variable names + - Update all API calls + - Update test IDs + +10. **`ushadow/frontend/src/App.tsx`** + - Update route import and component + +11. **`ushadow/frontend/src/components/wiring/WiringBoard.tsx`** + - Update all references to instances + +### Configuration Files (Priority 3) + +12. **`config/instances.yaml`** → Rename to `configurations.yaml` + - Update key: `instances:` → `configurations:` + - Migrate existing data + +13. **`config/wiring.yaml`** + - Update references if needed + +### Documentation (Priority 3) + +14. **Update all markdown files** + - `ARCHITECTURE_OVERVIEW.md` + - `README.md` + - Any other docs + +## Migration Strategy + +### Phase 1: Backend Models (Break nothing) +1. Create new `configuration.py` alongside `instance.py` +2. Copy all classes with new names +3. Add type aliases in `instance.py` for backwards compatibility: + ```python + # Backwards compatibility + Instance = ServiceConfiguration + InstanceManager = ConfigurationManager + ``` + +### Phase 2: Backend Services & Routes (Gradual migration) +1. Update internal services to use new names +2. Keep old API endpoints working with aliases +3. Add deprecation warnings to old endpoints + +### Phase 3: Frontend (Coordinated update) +1. Update API client first +2. Update pages and components +3. Test thoroughly + +### Phase 4: Cleanup (After testing) +1. Remove backwards compatibility aliases +2. Remove old files +3. 
Rename YAML files (with data migration) + +## Backwards Compatibility Considerations + +### Option 1: Hard Break (Fast, risky) +- Rename everything at once +- Update all references in one PR +- Requires coordination with frontend + +### Option 2: Soft Transition (Safer, slower) +- Keep old API endpoints working +- Add deprecation warnings +- Gradually migrate frontend +- Remove old code after 1-2 releases + +**Recommendation**: Option 2 for production, Option 1 for development branches + +## Testing Checklist + +- [ ] All backend tests pass +- [ ] All frontend tests pass +- [ ] API endpoints work with new names +- [ ] YAML config loads correctly +- [ ] Create new configuration works +- [ ] Deploy configuration works +- [ ] Stop configuration works +- [ ] Delete configuration works +- [ ] Wiring still works +- [ ] Frontend UI displays correctly +- [ ] No broken imports +- [ ] Documentation updated + +## Rollback Plan + +If issues arise: +1. Keep old model files as `instance.py` +2. Git revert specific commits +3. Use type aliases to minimize changes + +## Estimated Effort + +- Backend models: 1-2 hours +- Backend services: 2-3 hours +- API routes: 1-2 hours +- Frontend types: 1 hour +- Frontend pages: 2-3 hours +- Testing: 2-3 hours +- Documentation: 1 hour + +**Total: ~12-15 hours** + +## Next Steps + +1. Get approval on naming convention +2. Choose migration strategy (hard break vs soft transition) +3. Start with backend models (Phase 1) +4. Test each phase before proceeding +5. Update documentation as you go + +--- + +## Questions to Resolve + +1. **API endpoint naming**: Keep `/api/instances` with alias or change to `/api/configurations`? +2. **YAML filename**: Migrate `instances.yaml` → `configurations.yaml` now or later? +3. **Variable names**: `config_id` or `configuration_id`? +4. **Backwards compatibility**: How long to keep old names? 
+ +## Decision Log + +- [x] Use `ServiceConfiguration` instead of `Instance` +- [x] Keep unified model (not splitting cloud/local) +- [ ] API endpoint strategy: TBD +- [ ] Migration timeline: TBD diff --git a/UNIFIED_CONFIG_ARCHITECTURE.md b/UNIFIED_CONFIG_ARCHITECTURE.md new file mode 100644 index 00000000..7eeec7b4 --- /dev/null +++ b/UNIFIED_CONFIG_ARCHITECTURE.md @@ -0,0 +1,271 @@ +# Unified Configuration Architecture + +## New Model: ServiceConfig + +### What It Represents +A **service configuration** - a specific configured instance of a template (Provider or ComposeService), ready to be used or deployed. + +### Naming Convention +- Model: `ServiceConfig` +- Manager: `ServiceConfigManager` +- API: `/api/svc-configs` (shortened) +- YAML: `service_configs.yaml` (runtime state only) +- Variables: `svc_config_id` or `config_id` + +## Architecture Change + +### Before (Current - Duplicated Config) + +``` +SettingsStore (OmegaConf): + config.defaults.yaml: + llm: + openai_model: gpt-4o-mini + + config.overrides.yaml: + llm: + openai_model: gpt-4o + +Separate instances.yaml: + instances: + openai-prod: + config: # DUPLICATE! + model: gpt-4o + api_key: sk-123 +``` + +### After (Unified in SettingsStore) + +```yaml +# config.defaults.yaml (template defaults) +templates: + openai: + model: gpt-4o-mini + base_url: https://api.openai.com/v1 + + openmemory-compose:mem0: + port: 8765 + openai_model: gpt-4o-mini + +# config.overrides.yaml (instance-specific config) +service_configs: + openai-prod: + api_key: ${api_keys.openai_prod_key} # Interpolation works! + model: gpt-4o # Overrides template default + + openai-dev: + api_key: ${api_keys.openai_dev_key} + model: gpt-4o-mini + + mem0-local: + port: 8766 # Port override for second instance! 
+ openai_api_key: ${api_keys.openai_api_key} + +# service_configs.yaml (runtime state ONLY) +service_configs: + openai-prod: + template_id: openai + config_path: service_configs.openai-prod # Reference to SettingsStore + status: configured + created_at: 2026-01-15T20:19:23Z + + mem0-local: + template_id: openmemory-compose:mem0 + config_path: service_configs.mem0-local + deployment_target: local + status: running + container_id: abc123def456 + container_name: mem0-abc123de + deployment_id: abc123de +``` + +## ServiceConfig Model (New) + +```python +class ServiceConfig(BaseModel): + """ + A configured service ready for use. + + Runtime state only - actual config values live in SettingsStore. + """ + id: str + template_id: str # openai, openmemory-compose:mem0 + name: str + description: Optional[str] + + # Configuration reference (not the config itself!) + config_path: str # Path in SettingsStore: "service_configs.{id}" + + # Deployment + deployment_target: Optional[str] # None=local, hostname=unode, "cloud" + status: ServiceConfigStatus + + # Runtime state (only for deployed services) + container_id: Optional[str] + container_name: Optional[str] + deployment_id: Optional[str] + + # Timestamps + created_at: datetime + deployed_at: Optional[datetime] + updated_at: Optional[datetime] + + # Error tracking + error: Optional[str] +``` + +## Status Values + +```python +class ServiceConfigStatus(str, Enum): + # Cloud services + CONFIGURED = "configured" # Has valid credentials + UNCONFIGURED = "unconfigured" # Missing required config + + # Deployable services + PENDING = "pending" # Created but not deployed + DEPLOYING = "deploying" # Currently deploying + RUNNING = "running" # Running and accessible + STOPPED = "stopped" # Stopped gracefully + ERROR = "error" # Failed or crashed +``` + +## Configuration Resolution Order + +``` +1. Template defaults (config.defaults.yaml → templates.{template_id}) +2. Instance overrides (config.overrides.yaml → service_configs.{id}) +3. 
Wired provider values (for capability resolution) +``` + +All resolved through SettingsStore with OmegaConf interpolation! + +## Benefits + +### 1. Single Source of Truth +All config values in SettingsStore - no duplication + +### 2. OmegaConf Interpolation Everywhere +```yaml +service_configs: + mem0-prod: + openai_api_key: ${api_keys.openai_api_key} # ✅ Works! + neo4j_password: ${secrets.neo4j_password} # ✅ Works! +``` + +### 3. Defaults/Overrides Pattern +Consistent with existing SettingsStore behavior + +### 4. Port Overrides Natural +```yaml +service_configs: + mem0-instance-2: + port: 8766 # Just another config value! +``` + +### 5. Per-Instance Configuration +Multiple configs of same template, each with different values + +### 6. Secret Management +All secrets in `secrets.yaml`, referenced via interpolation + +## API Changes + +### Endpoints +``` +POST /api/svc-configs Create new service config +GET /api/svc-configs List all service configs +GET /api/svc-configs/{id} Get service config details +PATCH /api/svc-configs/{id} Update service config +DELETE /api/svc-configs/{id} Delete service config + +POST /api/svc-configs/{id}/deploy Deploy/start service +POST /api/svc-configs/{id}/stop Stop service +GET /api/svc-configs/{id}/preflight Check for port conflicts +POST /api/svc-configs/{id}/port-override Set port override +``` + +### Request/Response Models +```python +class ServiceConfigCreate(BaseModel): + id: str + template_id: str + name: str + config: Dict[str, Any] # Written to SettingsStore + deployment_target: Optional[str] + +class ServiceConfigUpdate(BaseModel): + name: Optional[str] + config: Optional[Dict[str, Any]] # Merged into SettingsStore + deployment_target: Optional[str] +``` + +## Implementation Strategy + +### Phase 1: Rename (No Config Move) +1. Rename `Instance` → `ServiceConfig` (keep nested config for now) +2. Rename manager, routes, frontend +3. Test everything still works + +### Phase 2: Unify Config (Architecture Change) +1. 
Add `config_path` field to `ServiceConfig` +2. Read config from SettingsStore instead of nested field +3. Write config updates to SettingsStore +4. Remove nested `config` field +5. Migrate existing data + +**Recommendation**: Do Phase 1 first (rename), Phase 2 separately (config unification) + +## Migration Path + +### Step 1: Add config_path Field (Backwards Compatible) +```python +class ServiceConfig(BaseModel): + config_path: Optional[str] # New field + config: Optional[Dict] # Keep for backwards compat +``` + +### Step 2: Dual Write +When creating/updating, write to BOTH locations: +- SettingsStore: `service_configs.{id}.*` +- service_configs.yaml: Keep `config` field + +### Step 3: Migrate Data +Script to copy all `config` values to SettingsStore + +### Step 4: Dual Read +Prefer SettingsStore, fall back to nested config + +### Step 5: Remove Nested Config +Once all data migrated, remove nested field + +## Files Affected (Phase 1 - Rename Only) + +### Backend +- `models/instance.py` → `models/service_config.py` +- `services/instance_manager.py` → `services/service_config_manager.py` +- `routers/instances.py` → `routers/service_configs.py` +- All references in other services + +### Frontend +- `pages/InstancesPage.tsx` → `pages/ServiceConfigsPage.tsx` +- `services/api.ts` - Update types and endpoints +- `components/wiring/*` - Update references + +### Config +- `instances.yaml` → `service_configs.yaml` +- `wiring.yaml` - Update field names + +## Questions to Resolve + +1. **API path**: `/api/svc-configs` or `/api/service-configs`? + - Recommend: `/api/svc-configs` (shorter) + +2. **Config path pattern**: `service_configs.{id}` or `instances.{id}`? + - Recommend: `service_configs.{id}` (matches new naming) + +3. **Do both phases now or separate?** + - Recommend: Phase 1 (rename) now, Phase 2 (config unification) next + +4. **Backwards compatibility**: Keep old endpoints? 
+ - Recommend: Clean break since still in development
diff --git a/compose/chronicle-compose.yaml b/compose/chronicle-compose.yaml
index 924515bc..4934016b 100644
--- a/compose/chronicle-compose.yaml
+++ b/compose/chronicle-compose.yaml
@@ -20,7 +20,7 @@ x-ushadow:
 
 services:
   chronicle-backend:
-    image: ghcr.io/ushadow-io/chronicle-backend:auth-fix
+    image: ghcr.io/ushadow-io/chronicle/backend:no-spacy
     container_name: ${COMPOSE_PROJECT_NAME:-ushadow}-chronicle-backend
     # Sidecar mode: Run both workers and backend in same container
     command:
@@ -94,7 +94,7 @@ services:
         memory: 2G
 
   chronicle-webui:
-    image: ghcr.io/ushadow-io/chronicle-webui:${CHRONICLE_IMAGE_TAG:-latest}
+    image: ghcr.io/ushadow-io/chronicle/webui:no-spacy
     container_name: ${COMPOSE_PROJECT_NAME:-ushadow}-chronicle-webui
     ports:
       - "${CHRONICLE_WEBUI_PORT:-3080}:80"
diff --git a/compose/openmemory-compose.yaml b/compose/openmemory-compose.yaml
index 528fda27..8d22210d 100644
--- a/compose/openmemory-compose.yaml
+++ b/compose/openmemory-compose.yaml
@@ -22,7 +22,7 @@ x-ushadow:
 
 services:
   mem0:
-    image: ghcr.io/ushadow-io/mem0-api:${OPENMEMORY_IMAGE_TAG:-latest}
+    image: ghcr.io/ushadow-io/mem0-api:latest
     container_name: ${COMPOSE_PROJECT_NAME:-ushadow}-mem0
     pull_policy: always
     # Requires qdrant from infra (started via infra_services in x-ushadow)
@@ -62,10 +62,10 @@ services:
     restart: unless-stopped
 
   mem0-ui:
-    image: ghcr.io/ushadow-io/u-mem0-ui:${OPENMEMORY_IMAGE_TAG:-latest}
+    image: ghcr.io/ushadow-io/u-mem0-ui:latest
     container_name: ${COMPOSE_PROJECT_NAME:-ushadow}-mem0-ui
     ports:
-      - "${OPENMEMORY_UI_PORT:-3002}:3000"
+      - "3002:3000"
     environment:
       - VITE_API_URL=http://localhost:${OPENMEMORY_PORT:-8765}
       - API_URL=http://mem0:8765
diff --git a/config/defaults.yml b/config/defaults.yml
new file mode 100644
index 00000000..47e39d8d
--- /dev/null
+++ b/config/defaults.yml
@@ -0,0 +1,96 @@
+# Default model registry configuration
+# These provide fallback defaults when config.yml is missing or incomplete
+# 
Priority: config.yml > environment variables > defaults.yml + +defaults: + llm: openai-llm + embedding: openai-embed + stt: stt-deepgram + vector_store: vs-qdrant + +models: + # OpenAI LLM (default) + - name: openai-llm + description: OpenAI GPT-4o-mini + model_type: llm + model_provider: openai + api_family: openai + model_name: ${OPENAI_MODEL:-gpt-4o-mini} + model_url: ${OPENAI_BASE_URL:-https://api.openai.com/v1} + api_key: ${OPENAI_API_KEY:-} + model_params: + temperature: 0.2 + max_tokens: 2000 + model_output: json + + # OpenAI Embeddings (default) + - name: openai-embed + description: OpenAI text-embedding-3-small + model_type: embedding + model_provider: openai + api_family: openai + model_name: text-embedding-3-small + model_url: ${OPENAI_BASE_URL:-https://api.openai.com/v1} + api_key: ${OPENAI_API_KEY:-} + embedding_dimensions: 1536 + model_output: vector + + # Deepgram STT (default) + - name: stt-deepgram + description: Deepgram Nova 3 (batch) + model_type: stt + model_provider: deepgram + api_family: http + model_url: https://api.deepgram.com/v1 + api_key: ${DEEPGRAM_API_KEY:-} + operations: + stt_transcribe: + method: POST + path: /listen + headers: + Authorization: Token ${DEEPGRAM_API_KEY:-} + Content-Type: audio/raw + query: + model: nova-3 + language: multi + smart_format: 'true' + punctuate: 'true' + diarize: 'true' + utterances: 'true' + encoding: linear16 + sample_rate: '16000' + channels: '1' + response: + type: json + extract: + text: results.channels[0].alternatives[0].transcript + words: results.channels[0].alternatives[0].words + segments: results.utterances + + # Qdrant Vector Store (default) + - name: vs-qdrant + description: Qdrant vector database + model_type: vector_store + model_provider: qdrant + api_family: qdrant + model_url: http://${QDRANT_BASE_URL:-qdrant}:${QDRANT_PORT:-6333} + model_params: + host: ${QDRANT_BASE_URL:-qdrant} + port: ${QDRANT_PORT:-6333} + collection_name: omi_memories + +memory: + provider: chronicle + 
timeout_seconds: 1200 + extraction: + enabled: true + prompt: 'Extract important information from this conversation and return a JSON object with an array named "facts". Include personal preferences, plans, names, dates, locations, numbers, and key details. Keep items concise and useful. + + ' + +speaker_recognition: + enabled: false + service_url: null + timeout: 60 + +chat: {} diff --git a/config/instances.yaml b/config/instances.yaml new file mode 100644 index 00000000..7ff96729 --- /dev/null +++ b/config/instances.yaml @@ -0,0 +1,49 @@ +instances: + openai-1: + template_id: openai + name: openai-1 + deployment_target: cloud + status: n/a + created_at: '2026-01-15T20:19:23.282751+00:00' + openmemory-compose-mem0-unode-1768508366189: + template_id: openmemory-compose:mem0 + name: OpenMemory (ushadow-purple) + description: uNode deployment to ushadow-purple + deployment_target: ushadow-purple + status: running + outputs: + access_url: http://localhost:8765 + created_at: '2026-01-15T20:19:26.194496+00:00' + openmemory-compose-mem0-unode-1768508494524: + template_id: openmemory-compose:mem0 + name: OpenMemory (ushadow-purple) + description: uNode deployment to ushadow-purple + deployment_target: ushadow-purple + status: running + outputs: + access_url: http://localhost:8765 + created_at: '2026-01-15T20:21:34.538624+00:00' + openai-2: + template_id: openai + name: openai-2 + deployment_target: cloud + status: n/a + created_at: '2026-01-15T20:28:40.099964+00:00' + openmemory-compose-mem0-unode-1768508923327: + template_id: openmemory-compose:mem0 + name: OpenMemory (ushadow-purple) + description: uNode deployment to ushadow-purple + deployment_target: ushadow-purple + status: running + created_at: '2026-01-15T20:28:43.330992+00:00' + deployment_id: 63a2c6f9 + container_id: c3da4a4475991d4d31bd096fd223b9df8296a78a707ec51024cb9e11be76a5ca + container_name: mem0-63a2c6f9 + openmemory-compose-mem0-unode-1768508947058: + template_id: openmemory-compose:mem0 + name: 
OpenMemory (ushadow-purple) + description: uNode deployment to ushadow-purple + deployment_target: ushadow-purple + status: error + created_at: '2026-01-15T20:29:07.068665+00:00' + error: Service openmemory-compose:mem0 already deployed to ushadow-purple diff --git a/config/kubeconfigs/003fd5798ebbea9f.enc b/config/kubeconfigs/003fd5798ebbea9f.enc new file mode 100644 index 00000000..427c0ec2 --- /dev/null +++ b/config/kubeconfigs/003fd5798ebbea9f.enc @@ -0,0 +1 @@ +gAAAAABpYuWNtVsZByuCipzHS1h9cDOArpMzXXwuz5_QzmsE6cKIE7VToTuizL-PhK-BKIfNlL2quTS4ATEvqlRoNigsvfZkkMhBlUPmfdtJBVN8XPhx11pYyEn77rJfy4gQIAqe65PXvfLCuRX7BTnLwxHHw__Qg-JipT9fpRsUHKSLVZwJBcXW8V7gNjBA2jRawlE9X8syB9iZGokjxrLsrXLUvQHSiLdS1gTQ0BHl8nzFCGRN9fd2wbLPbf4u31au4Arl_GUaHmFbVM6qvLUaoBw36z3i8vK_QpujsIF2cmvLIPpWZKJF4Nd53d1r9iTSAEX7OEcMsVKXMeCTP6Ya6gMxdJqtDmwe6oNCc6-pC4BuEFzMKfHM-IA7Syj5DnN5P9yJj5Az14Xhvl5wa2-uha24arkyeA8m9Zt5u6hEmsJ1rixkW5YxJTzDAGy6WR7JUHQuB-r8pgdbmqgGelKQeiaTWYz5otvaH37-PBiUwiXXYE-cLaOfQBez4gSkNZwN0yg1-CwQgtMXKKV0O9V_g8hibvLl4207wSowEzMrXPOAqaS-h4WoG8-yowTpZmbqYc946d6_WnN9O4Ea8lPXIZ_wjbcyt91kP3FGxNv_1mC9W73m2Z9Pm8aC7Ms8pSx6bcXfoWwA2ivPdwsTvbM8Y-5YDdYYs4q_DKhD41TbirINu_aWazGcsKNRGw7RXXryDm5wPd2mS4yhTEQzNJiKZXHYhxrgWAINWTDhWR4lptiiBJZX8bFu61NBF_S94dt26dJGCEUfY8eZU_xkXdxKnEphood-a5k7fVMSheWGOLGpsF_lkqHi7ibh0PLFs3jMwmbt1hlvTgD4nOY6RMXFe0KBPwH0liLrywR0HFEYT44dw5MsT808PcY-tvopigr1cI5pvhjN65QfZ41vIDc7trgwHuEouFK4cfezGOkpIyGf4YZmCMUfaL5lrDF7DA8HEKyUwaHmeWpvE2l1XSfpeG7f3EdBoq5VdJ8k2Ef8sKI3sKz8369bDkuKf_XcNxTSnVJhluubGqIOMPOneWk0cmDt8CTcp820kRLNrfCkIUEpbMRew8Ruvko8vD9Ztn8XhOdJnwgSy6APtCyvcFwkdiUWpgSUjM3zBls6RjxddDR9fVCpjCzyVlwOHXKv5nCZGhL99hlxk-coV-0aTMtGjtEKQrlzEcUJYN9aq_hhGhCtYzlYLguEJ9CJL16u2gswxerCifncEVP0_Lpqqzt9Cev1QZi8nb9JYMgPawIERw3L-syBsY2lB6IZ8dcC8JDXL-63Y6PYDmkZ2SEIFfMG-upjBBu-ID3502QXkyNmaVqLuMRx7UEx-v2VW_bvhSl6my5oum_qLTyPbnEBz1ZwfCooA7OAou6Dh7mWXBU2uc9N2hTNJJpIGOTTcP3EUdC6IyDR-4yeCRigicjqTGcLiDVH3k5bMetaYG0HJHdU75LDRv0inK7EBRJS9lDzFlTFNCAYnMyZIgEs3h9QKNpYwqbQmgwbLp3Y86PrPXmIeWYs5Awym2D9N
L-1gHee-OD6v34OUrfQ9sBcfdVkIotRvaNzIUJoTVrbBLr2YjsED40tuMspbuywbo_G0Xhsmd2UR9ECEm0bQobos8Nnprmy2MMrkYCA1z1aLipb6RQsR-7pVO-DvzpxOKNiLf_M4mbJooZaeiD4b9KImshVwVjoy4xidrVNWBwER9OEyivkcoES9Mq3mMAW7G89Q07xJ-6e6B4msXCtYqFQvLfFcJF4Bxfy7LyaCRRFqsdakY2njRr67BerbifCaEGTKtl8A2m62XBVOuZV38YHyaWwRpAN2uPEAkXM7yURbGmI-qbWviGLaKud4FVtNy309qghhQDlYek62zWyG9inB94sYwbGr82uOMT5sV2m3MwrNTmDIwK_SMPBGi0jgmglkFOelMNo96EIv4ho_JUsgNXYwbP23D-2NH9-qP_Z7tTUNaXinhoVJl8Dc20Lm1YJgTjp3QdkYDzGZv-JoJBKVO0bmysYeGw4mmXUNdupc9kq1el1WHfBgiIgIUBw3xZLkcQ9vvS9RLgbO8M5xrYfYXEUIHCD23hQlY1xmlYq5Z_5GSXxaUNhE3VWHSIB0Q8rveqvx6CgyLN2X6SwpGdMFSupdJiPGG7UvNcwLNGPitDNX0ZhmVU73sFxUakcjxtb60UhpAKh4kC9ehx4XzBzSn9mNSvyrdkYIYNOYoVQVuQNo4NdyaHdqSskdV1lilUICMB4wSGCnfLyQKC1ijCgd-nMHcoIT3fB5wbTfBadDafu-j0EWxeDkMNh8dlb05QwKtrDtlQABUnne17b_6qrVQDDATGT5plnz-YFsV4y-wAlNZ66avR80-bY5RufGtLvcPiRlerJrG_mwfaJRLzPDbcbZXsfBpuurx4KzFqKfR42vHvdlTrGqwTSLOSo5v9jQa6XEkiQq6myuCO_4TFSPmqVDkjUIE2hJHiURt-Pla6MTw-t-wpjoAEI2y2uEkpOu8cdEfC44xc9Mv_yTwnUIbaoQB71zErbxIRnrtSA3WPrJiIvSII2QvtCMucPWOEleSQf23QP9FE4EEQ7iIt9bcCtrMqa_7vxQb63DPiVz2aeGyAspM4FSea_bPqd0nkzcUzMhOgYMeBsQNJsTOSa0Xe5mPad2qmyZPjC-Y1OBYNXf0H9BZuuyaBfENKOncqBI4g96VcWqK2iV_4mY1Ww-5xNIhjLCkPIBBOJ9En_p0MRNV8ZKNkHdvR-HMwF5g-UT4DSWuKh61uEQNdj0qZ726OmdTK8JMyhKqGu-8SNi5CRFOwtgKK6KFj92cMfkc2J-jXrtqBBZ8JqwqcS4kiw_ekVq0CmrnZs3QCFsqN4lzWUiE6L1DKmTj0S-uL7CrGkjeZjzQFM6sgRQi7uwRM2yCOeWZz_oAUJnTUGMln5iEHq1Wa1T8505rpB5yxCv8IPzMYfwGpMnfBFcSUqRF1cTpetQrJ3TQeXMNGUWT2Nik4ld5Z9YvsOAIxo__FYefB6vVn-khkLQYLZOmqF-LISoGCv08tZGkIttU6ojVQqreBafcUfxHncagx7FvLrhmxRhFdwBgHqJsJM42FHRPZMNU5ikHZrrgE3FElm08O2r_XgeonvzpbOnxfzd4zKxouR3U7iag_6A-pK9MesOKbs3CG9vK6uH0YX_2y9nz2nf7OHciVo3uX1vT6HV6uJNU-LbIpJi1dh12ZKcp-VkEHjg4IbPwAuLpbMNEoG1v5B5iN6oY5vjEOoAoiK35Dh9XHjSngyS1ku9BkA9TcXU5BcVviZM04iURa484yhoXi-JjYleGe3VboekKJJcvF_1KCM4Npwl05KWFfsGkiE84oBaPKJqIDy7UBbtxcRPJ5gHHjeNvBgS6hiSSFT-ZDKBpS-Qhb110KK1ytsm5GYOQ-8gos2QUwaojj6VEW60HxrNrtQrHmB2SdFRywwBCclZsNNn6bsVpRCH5rDM7KxBSDFVHVun9Cy8MIUzXb6RoPrzTOJn3zhos65zaaM9gotYqYk6h6_nMmvNYUIo
LgRFqojRPCKApDmZTLhpBpGOD1AF5FJeP8ksLK_vjD3dy_6UpOEzLvAk37eUUSbUDZ690ww3e3-lK6raKPP7So96ax39NuGt1Kz6RUC2FWHCGj1cLh4XmGTAqe8vpkbb6kbEj3-F478S3WTx_-8ZZlxiRKkcUML7p_GiQseDTACyOSPUThhMeWBkRumfdFwQHqDFWL_wU4r3wo96lgdN4VnLuCZFd2yySdc6DRQJL4_VuvxmjXvfR8JtUS27lgN0vQaBCg61lvPXlb5Q0_Z_ZHHgDf7dnKt_T-FQDw8RBlzYA0ag6S4cMesZWWloiHZ1uGwQhQM8Jin2aSuiPgYH7XiLSmQdCKNYtqLZBXFSIgPVuoSwlovvUiIPCa6oMgXRoBo4NOqZiNXoC2FYQOJlyj-QzkrDK3pbGteWUjgsmR578zA_lCgd-Bp9o981xsQItfktfhgNWXMPUWI44ZmciyEDUWtidDxsV2-i_DEBYZxIm0MjBJh-rWsAx2qyLf3rUNvR3tJstAMlUyh8gG4vIQQhuDwDRivrcNdtkCmYrv7VWa5pLILh22CnSBOOUK5tJ9frMHdwXDgXSwIN2Y4OERPMe78j2ARBCz-a_qRs16Hnn1CSF3Yj7RsXPK3NlgL8CJ1ZR7fUo71Wx0h0WVKtrWj3xF7U0cCknNOs6j5UgKjgntyfTKnpLJlEPtFgrZeHPNsf6GFLYL2TfclXAks-W0zRkbHiNKWgFUAJLghpzsP__z9du5MjbYRqp7FDe2WVR-0D1_qyxa4NbQBckQxzP2aBW4P-rZJtyzxl9rEg5dCSPK54aC5hBHrYvyUqZKq4C3YoX-z0i1Qd2LNN_O9QNE1OJl8gtP3zRi6yXpECgYhvxgNVKVsw7BWtN2CRedIZv7_bLouWDWqpOoyynGlhdjVhQMeQhYndDHdvWasiREoTzMObRi0yA-ZOK0rFUIbHbxKoM5w_MSY2j9GoI8f7ilw2lbp-UpsOcH4kqABC1H_221a3PcTNBR2AjCi7LMb6e2zEVplL0deJrXrfUk_jzcmoWJ9n8vrxIIJGeUpJBiSLmbmO0PZokPBAYetZVtgIBCt1jKMWzntQXZqudnnssaxl4SJUd_e_RWkMLYJk3cUse4wEej6Kem19ZT9PA4dUS4m1hKNUdhF3r05K75ypskHDWcPzRnQFs6NCoiDjupwyWNeee_Jq0l0SFc9m_b_nWrLu0mq6iUWNNYD-PnNtHUW6S51C7J-lvNHcfbn0-pXtNEmQ9_fWzHXj4mOJyHXEzaykHHo2ioDDmAeeZ33eOz-j9tBzQyPu4Y5dkl8x7qHFSSdOPOgLJ37EZzHAnPe1e-rT_eGnGDtHbvyScgjqYs149tjbKNjdZb35y4s4w-Pt8-aeIo4FZ9nbN2UO0a-Vea7fIw3RL1nRwM1RJqyz5dhm43suJPvFGBXqGMf7RcaoaOaTcDNZoyGL7os9xHyzDarTA8Z60N45m-_FD7hWttkpkvwYYGnoEhoHsCZc3GP4tHYLQPgQnDSWm270_BAjmrr2MFr80tbX-M-_wCAnMbcMMndcmpikdZuo9ZpaIkPZVKXGI8J-RrYLSEDobxqh3rgUMFp-LGTJkJifFJ6W05cq_dYdfhIEvjiuA35_h-Pb-lXWtiGrfQNW_kC4bhgIEwml-U_5wOCmldE-GcVtAJwm-hKekE8OCzbYhVHmUQodxqG0pJmkOUeQBYJeMyuEnb9KNMAEF3ZQdj0l15en3pjrN5JCeN543y7s4ocJWhgymBHmJOvfXzLJNqHmE2CtHXeHYlFS_VjrDeoi1JEWVQEkgTbMpgjDF_X_Kbu0atYz7W-4AKE7-ko74lNF_L_uH6vIDqhleyk-uNooFwZ0u5Mkl-zRtrBFua5zBJR-bJKtNuNqfO6pZoxGV3NPtZ_RCvBq78yvYseVrM-jMF_9q_Y1jdR4cRfOk7nhHE-KA53ZzNDKaKf1xxFIN_ejGbaqTD0xRXPummCHCW59
Y0rDcHVtTOqSQy4TGsfqvVnApLf5Z29erKYZ9dLuJsGMpO1ebDxN1cuIQhC7H1l0tHkG_1c65tQQr49JxC8CpaVfMVanXlDDLxWvhccpK85FR7HV3biunED3KAclO4KlNsCOP62aabzUn9Sr5alZVsLL5ERmJvSXtvn2d1V4mqUY7nmoSZGaG33f-4FyNbevz_RueKT2dyOn5xJm-JGrF2Z38id3gdxplXsaGAFeM3zPaVv6SPuvXuivw_zcX5ffwYvhEmF3K_-oPeKogLXsiJjBAwhLFFzqBd-DAN8y2d6RoWyXTM3bIZ3sClo93uX138fu4E34dcC9yyYCqzQyGS2CUHRMc4lViMO5CvGkKyd8_3ue3qM8gQBsMHjzLvsw9EWKZoseYWznCOYwDujmQaz-97DH13M-SxcYVZiKf6L8HauAkcSjhSK2Db8IlLXAWNf7LhB0mq9rjTCqSQibC-MF4JOmzqidgayGDe03bTtE12rM7KLVDV4fYLFE1Vlc7PJUU7vL-hdNT1iOujdNNuhND94j1ypdoiaN_rR1bVXo7wzlkjsgEHh3RAk9p-e1iYqNReVB7K8RKfCTphUA2NfGRv7ybQ3dISWNPqd1ebeOMdD2edHsTErVrRj-nRz_DXhYeb0gIpWeADIyhHkJpMYPPKq3pEq4aRX4c8dDUn3eaeF0vpFTguMYxUpMYxtK52feo4ppg4nvRd6vD2RYYOey793sQt4gs7vyesdnqcfBrnMVLd4b2TvIg98bLq36Ck2UpEoK8K4nuboRLWLeJn3Yq374U6Hzs0CnzmMmFD5hL6OQFvcWteiSe9Zi88kUoB7v8uvY6c8VceAHIpwy6ohqVnucsG1I_LjMLeXiRlJZ3HVUnJ2B9X5VHIbGhITotvV9NylRJoQbRSiwIe_kz-_I44umtaWTCtBbAKG49oBj-p1zcsUvRZ8UZe2AXFQMHqvFR6lH6SwKouzbTOruF8X2EUflNOqBrDBeEstGR5OixUuMr9o5fP5Ws1iV2zIWIFRurPL7rG5bg2wZ08a_QwOMBmzVmg_17PeEt26A8n4Fj3wOMi8qUoytmM0SA-SjdLPeRtQ-TS3dm7UzWKDY5VEY1Vwu7h5QXp0GTL0gOxIWvCWyqIW-5UQv2MLwoi-a3WP3_GfwfLlELhwJOO5T3xha4GEmeZ0AJea0lP-q7_Q661-zYWj7V6FlNAEqMjilY6rsKL1JMw3Gwm5iDQMNvTdQowGx5EaBsBiA0El6tfjHyhYGEfQeeVc1o7pRNvPvrx2s5mmyTTBlWF80qRs9orE2llgCeJP1Lg-0bazHft2F62FGbC1ekRqVkSJ8nlawMe9gx7esw1ysnflzVbMoxHsOY-9ewLf6_gJZZ4oRv_gUxCBYyKdaKo3aTaUTS5-vPV4u1HAXqpJsvGPtoxfcR6G6Ul6znPS9hnAll9lGg4JjAgsyebTvw4qWvnEbzPf44AM80SiGDxlwQuLDSwkUuw2BTC8_j6QAk2le2Oq3jwtnCH8bt_3P6vdM6ovqTnU3F81YN0dAQV2HTr5gKY2tvj5TfZ3yrm46q0c0VACqSnml8EBQptaju_Q9Ja17tVb4Jt7jONeAOxBU-ElWYomHn8uqVrDMvcSOvwTPOxdNme2lFQ66DlFtFT-hkJBssud3eM6FZvYg4c025Nt6B3oi0fP6nCPd5KEMpdQZgdGAksQJnJDTrOI2hdOlxryYsCJG2SfUsJYS2pk4rlu5rFHrE9SLgk60-EviPLvLgWBqKOedLsHBTBYSnqiwuy2EWQ3TtxkjlevM-YQaquGfRxm92asrfCFat17jssYMcCG6kW-IToSJ41V8wpAYnim5W4Xc6sLe7w3RA2q2Zmt2K--N3fDTVQIRyXPZFz6jPP4QaFvSgBUHv7Rl2VakKPWEb5lPOoaHByNoVX7sTSUfRTZ4EInWtOucCiVldzagly-0wv5Yp6R2gZr48sQPhAbZjZGBAYm0O5zmXjxydmwP
P2NqXxpAy8VPtgZDaBa2aiQReHuTAglSS3JNSHGTI2-CTkZUsp7v4xaUnA4zmCftOXWlJIi9IoxZp0= \ No newline at end of file diff --git a/config/tailscale-serve.json b/config/tailscale-serve.json new file mode 100644 index 00000000..a3f359d2 --- /dev/null +++ b/config/tailscale-serve.json @@ -0,0 +1,23 @@ +{ + "version": "alpha0", + "TCP": { + "443": { + "HTTPS": true + } + }, + "Web": { + "purple.spangled-kettle.ts.net:443": { + "Handlers": { + "/auth": { + "Proxy": "http://ushadow-purple-backend:8000/auth" + }, + "/api": { + "Proxy": "http://ushadow-purple-backend:8000/api" + }, + "/": { + "Proxy": "http://ushadow-purple-webui:5173" + } + } + } + } +} \ No newline at end of file diff --git a/config/wiring.yaml b/config/wiring.yaml new file mode 100644 index 00000000..4eadbd87 --- /dev/null +++ b/config/wiring.yaml @@ -0,0 +1,17 @@ +defaults: {} +wiring: +- id: 31e04a7c + source_instance_id: openai-1 + source_capability: llm + target_instance_id: chronicle-compose:chronicle-backend + target_capability: llm +- id: '56324379' + source_instance_id: deepgram-1 + source_capability: transcription + target_instance_id: chronicle-compose:chronicle-backend + target_capability: transcription +- id: 8ea7cb2c + source_instance_id: openai-1 + source_capability: llm + target_instance_id: openmemory-compose:mem0 + target_capability: llm diff --git a/deploy.sh b/deploy.sh new file mode 100755 index 00000000..597d37f1 --- /dev/null +++ b/deploy.sh @@ -0,0 +1,508 @@ +#!/bin/bash +# Ushadow Kubernetes Deployment Script +# Converts Docker Compose to Kubernetes manifests using kompose +# and applies production-ready adjustments + +set -e + +# Colors for output +GREEN='\033[0;32m' +BLUE='\033[0;34m' +YELLOW='\033[1;33m' +RED='\033[0;31m' +NC='\033[0m' # No Color + +# Configuration +K8S_DIR="k8s" +NAMESPACE="ushadow" +ENV_NAME="${ENV_NAME:-purple}" + +# Function to print colored output +print_info() { + echo -e "${BLUE}ℹ${NC} $1" +} + +print_success() { + echo -e "${GREEN}✓${NC} $1" +} + 
+print_warning() { + echo -e "${YELLOW}⚠${NC} $1" +} + +print_error() { + echo -e "${RED}✗${NC} $1" +} + +# Function to check if kompose is installed +check_kompose() { + if ! command -v kompose &> /dev/null; then + print_error "kompose is not installed!" + echo "" + echo "Install kompose:" + echo " macOS: brew install kompose" + echo " Linux: curl -L https://github.com/kubernetes/kompose/releases/download/v1.34.0/kompose-linux-amd64 -o kompose && chmod +x kompose && sudo mv kompose /usr/local/bin/" + echo " Windows: choco install kubernetes-kompose" + echo "" + echo "Or download from: https://github.com/kubernetes/kompose/releases" + exit 1 + fi + print_success "kompose is installed ($(kompose version))" +} + +# Function to check if kubectl is installed +check_kubectl() { + if ! command -v kubectl &> /dev/null; then + print_error "kubectl is not installed!" + echo "" + echo "Install kubectl: https://kubernetes.io/docs/tasks/tools/" + exit 1 + fi + print_success "kubectl is installed ($(kubectl version --client -o yaml | grep gitVersion | head -1 | awk '{print $2}'))" +} + +# Function to clean up old k8s manifests +clean_k8s_dir() { + print_info "Cleaning up old Kubernetes manifests..." + rm -rf "${K8S_DIR}" + mkdir -p "${K8S_DIR}"/{infra,base,tweaks} + print_success "Created fresh k8s directory structure" +} + +# Function to convert infrastructure services +convert_infra() { + print_info "Converting infrastructure services (MongoDB, Redis, Qdrant, etc.)..." 
+ + # Create a temporary infrastructure compose file without profiles + # Kompose doesn't handle profiles well, so we create a flat version + cat > /tmp/ushadow-infra.yml < /tmp/ushadow-prod-compose.yml < "${K8S_DIR}/namespace.yaml" < "${K8S_DIR}/configmap.yaml" < "${K8S_DIR}/secret.yaml" < "${K8S_DIR}/tweaks/README.md" < "${K8S_DIR}/tweaks/ingress-example.yaml" < "${K8S_DIR}/tweaks/mongo-statefulset-example.yaml" < "${K8S_DIR}/kustomization.yaml" <> /host-etc/gai.conf + echo "gai.conf configured successfully" + fi + volumeMounts: + - name: host-etc + mountPath: /host-etc + securityContext: + privileged: true + containers: + - name: pause + image: gcr.io/google_containers/pause:3.1 + volumes: + - name: host-etc + hostPath: + path: /etc + type: Directory diff --git a/k8s/infra/mongo-deployment.yaml b/k8s/infra/mongo-deployment.yaml new file mode 100644 index 00000000..6d231eb1 --- /dev/null +++ b/k8s/infra/mongo-deployment.yaml @@ -0,0 +1,23 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + labels: + io.kompose.service: mongo + name: mongo +spec: + replicas: 1 + selector: + matchLabels: + io.kompose.service: mongo + template: + metadata: + labels: + io.kompose.service: mongo + spec: + containers: + - image: mongo:8.0 + name: mongo + ports: + - containerPort: 27017 + protocol: TCP + restartPolicy: Always diff --git a/k8s/infra/mongo-service.yaml b/k8s/infra/mongo-service.yaml new file mode 100644 index 00000000..8dc5c514 --- /dev/null +++ b/k8s/infra/mongo-service.yaml @@ -0,0 +1,13 @@ +apiVersion: v1 +kind: Service +metadata: + labels: + io.kompose.service: mongo + name: mongo +spec: + ports: + - name: "27017" + port: 27017 + targetPort: 27017 + selector: + io.kompose.service: mongo diff --git a/k8s/infra/postgres-deployment.yaml b/k8s/infra/postgres-deployment.yaml new file mode 100644 index 00000000..58b9c0b2 --- /dev/null +++ b/k8s/infra/postgres-deployment.yaml @@ -0,0 +1,30 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + labels: + 
io.kompose.service: postgres + name: postgres +spec: + replicas: 1 + selector: + matchLabels: + io.kompose.service: postgres + template: + metadata: + labels: + io.kompose.service: postgres + spec: + containers: + - env: + - name: POSTGRES_DB + value: ushadow + - name: POSTGRES_PASSWORD + value: ushadow + - name: POSTGRES_USER + value: ushadow + image: postgres:16-alpine + name: postgres + ports: + - containerPort: 5432 + protocol: TCP + restartPolicy: Always diff --git a/k8s/infra/postgres-service.yaml b/k8s/infra/postgres-service.yaml new file mode 100644 index 00000000..9964d515 --- /dev/null +++ b/k8s/infra/postgres-service.yaml @@ -0,0 +1,13 @@ +apiVersion: v1 +kind: Service +metadata: + labels: + io.kompose.service: postgres + name: postgres +spec: + ports: + - name: "5432" + port: 5432 + targetPort: 5432 + selector: + io.kompose.service: postgres diff --git a/k8s/infra/qdrant-deployment.yaml b/k8s/infra/qdrant-deployment.yaml new file mode 100644 index 00000000..36a0e773 --- /dev/null +++ b/k8s/infra/qdrant-deployment.yaml @@ -0,0 +1,25 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + labels: + io.kompose.service: qdrant + name: qdrant +spec: + replicas: 1 + selector: + matchLabels: + io.kompose.service: qdrant + template: + metadata: + labels: + io.kompose.service: qdrant + spec: + containers: + - image: qdrant/qdrant:latest + name: qdrant + ports: + - containerPort: 6333 + protocol: TCP + - containerPort: 6334 + protocol: TCP + restartPolicy: Always diff --git a/k8s/infra/qdrant-service.yaml b/k8s/infra/qdrant-service.yaml new file mode 100644 index 00000000..b249eb93 --- /dev/null +++ b/k8s/infra/qdrant-service.yaml @@ -0,0 +1,16 @@ +apiVersion: v1 +kind: Service +metadata: + labels: + io.kompose.service: qdrant + name: qdrant +spec: + ports: + - name: "6333" + port: 6333 + targetPort: 6333 + - name: "6334" + port: 6334 + targetPort: 6334 + selector: + io.kompose.service: qdrant diff --git a/k8s/infra/redis-deployment.yaml 
b/k8s/infra/redis-deployment.yaml new file mode 100644 index 00000000..7cde7ea7 --- /dev/null +++ b/k8s/infra/redis-deployment.yaml @@ -0,0 +1,27 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + labels: + io.kompose.service: redis + name: redis +spec: + replicas: 1 + selector: + matchLabels: + io.kompose.service: redis + template: + metadata: + labels: + io.kompose.service: redis + spec: + containers: + - args: + - redis-server + - --appendonly + - "yes" + image: redis:7-alpine + name: redis + ports: + - containerPort: 6379 + protocol: TCP + restartPolicy: Always diff --git a/k8s/infra/redis-service.yaml b/k8s/infra/redis-service.yaml new file mode 100644 index 00000000..db0491a1 --- /dev/null +++ b/k8s/infra/redis-service.yaml @@ -0,0 +1,13 @@ +apiVersion: v1 +kind: Service +metadata: + labels: + io.kompose.service: redis + name: redis +spec: + ports: + - name: "6379" + port: 6379 + targetPort: 6379 + selector: + io.kompose.service: redis diff --git a/k8s/kustomization.yaml b/k8s/kustomization.yaml new file mode 100644 index 00000000..996c6f5d --- /dev/null +++ b/k8s/kustomization.yaml @@ -0,0 +1,22 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +namespace: ushadow + +resources: + - namespace.yaml + - configmap.yaml + - secret.yaml + - infra/ + - base/ + # - tweaks/ingress-example.yaml # Uncomment when ready + +# Add common labels to all resources +commonLabels: + app: ushadow + env: purple + +# Add annotations +commonAnnotations: + managed-by: kustomize + deployed-from: docker-compose diff --git a/k8s/namespace.yaml b/k8s/namespace.yaml new file mode 100644 index 00000000..1fb469b1 --- /dev/null +++ b/k8s/namespace.yaml @@ -0,0 +1,7 @@ +apiVersion: v1 +kind: Namespace +metadata: + name: ushadow + labels: + name: ushadow + env: purple diff --git a/k8s/secret.yaml b/k8s/secret.yaml new file mode 100644 index 00000000..ea35e5b7 --- /dev/null +++ b/k8s/secret.yaml @@ -0,0 +1,12 @@ +apiVersion: v1 +kind: Secret +metadata: + name: 
ushadow-secret + namespace: ushadow +type: Opaque +stringData: + # TODO: Add your secrets here or use kubectl create secret + # Example: + # MONGODB_URI: "mongodb://mongo:27017/ushadow" + # REDIS_PASSWORD: "your-redis-password" + # API_KEYS: "your-api-keys" diff --git a/k8s/tweaks/README.md b/k8s/tweaks/README.md new file mode 100644 index 00000000..dce1a0d3 --- /dev/null +++ b/k8s/tweaks/README.md @@ -0,0 +1,63 @@ +# Manual Adjustments Needed + +Kompose does a good job, but these adjustments are recommended for production: + +## 1. Infrastructure Services - Use StatefulSets + +For MongoDB, Redis, Qdrant, PostgreSQL, Neo4j: +- Convert Deployment → StatefulSet +- Add proper volumeClaimTemplates +- Configure podManagementPolicy: Parallel or OrderedReady +- Add headless service for stable network identities + +## 2. Persistent Storage + +Add StorageClass and PersistentVolumeClaims: +```yaml +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: mongo-data +spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 10Gi + storageClassName: standard # or your storage class +``` + +## 3. Resource Limits + +Add resource requests and limits to all deployments: +```yaml +resources: + requests: + memory: "256Mi" + cpu: "250m" + limits: + memory: "512Mi" + cpu: "500m" +``` + +## 4. Ingress for External Access + +Create an Ingress resource (see ingress-example.yaml) + +## 5. Network Policies + +Add NetworkPolicy for security (optional but recommended) + +## 6. 
ConfigMaps and Secrets + +- Move environment variables to ConfigMaps +- Move sensitive data to Secrets +- Use external secret management (e.g., Sealed Secrets, External Secrets Operator) + +## Files Generated + +- `infra/` - Infrastructure services (databases, cache) +- `base/` - Application services (backend, frontend) +- `namespace.yaml` - Namespace definition +- `configmap.yaml` - Configuration data +- `secret.yaml` - Secrets (template only) diff --git a/k8s/tweaks/ingress-example.yaml b/k8s/tweaks/ingress-example.yaml new file mode 100644 index 00000000..22fae944 --- /dev/null +++ b/k8s/tweaks/ingress-example.yaml @@ -0,0 +1,32 @@ +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: ushadow-ingress + namespace: ushadow + annotations: + # nginx.ingress.kubernetes.io/rewrite-target: / + # cert-manager.io/cluster-issuer: letsencrypt-prod +spec: + ingressClassName: nginx # or your ingress class + rules: + - host: ushadow.example.com # CHANGE THIS + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: webui + port: + number: 80 + - path: /api + pathType: Prefix + backend: + service: + name: backend + port: + number: 8000 + # tls: + # - hosts: + # - ushadow.example.com + # secretName: ushadow-tls diff --git a/k8s/tweaks/mongo-statefulset-example.yaml b/k8s/tweaks/mongo-statefulset-example.yaml new file mode 100644 index 00000000..a96dee5c --- /dev/null +++ b/k8s/tweaks/mongo-statefulset-example.yaml @@ -0,0 +1,54 @@ +apiVersion: v1 +kind: Service +metadata: + name: mongo-headless + namespace: ushadow +spec: + clusterIP: None + selector: + app: mongo + ports: + - port: 27017 + name: mongodb +--- +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: mongo + namespace: ushadow +spec: + serviceName: mongo-headless + replicas: 1 + selector: + matchLabels: + app: mongo + template: + metadata: + labels: + app: mongo + spec: + containers: + - name: mongo + image: mongo:8.0 + ports: + - containerPort: 27017 + name: mongodb + 
volumeMounts: + - name: mongo-data + mountPath: /data/db + resources: + requests: + memory: "512Mi" + cpu: "500m" + limits: + memory: "1Gi" + cpu: "1000m" + volumeClaimTemplates: + - metadata: + name: mongo-data + spec: + accessModes: [ "ReadWriteOnce" ] + resources: + requests: + storage: 10Gi + # storageClassName: "standard" # Specify your storage class diff --git a/scripts/rename_to_service_config.py b/scripts/rename_to_service_config.py new file mode 100755 index 00000000..7921c48c --- /dev/null +++ b/scripts/rename_to_service_config.py @@ -0,0 +1,322 @@ +#!/usr/bin/env python3 +""" +Rename Instance → ServiceConfig across the entire codebase. + +This script performs a systematic renaming: +- Instance → ServiceConfig +- InstanceManager → ServiceConfigManager +- instances.yaml → service_configs.yaml +- /api/instances → /api/svc-configs +- All related variable names and field names + +Usage: + python scripts/rename_to_service_config.py [--dry-run] [--backup] + +Options: + --dry-run Show what would be changed without making changes + --backup Create .bak files before modifying +""" + +import os +import re +import argparse +import shutil +from pathlib import Path +from typing import List, Tuple, Dict + +# Root directory +ROOT = Path(__file__).parent.parent + +# Renaming rules (order matters - more specific first!) 
+RENAMES = [ + # Models and classes + ("InstanceManager", "ServiceConfigManager"), + ("InstanceStatus", "ServiceConfigStatus"), + ("InstanceConfig", "ConfigValues"), + ("InstanceOutputs", "ServiceOutputs"), + ("InstanceCreate", "ServiceConfigCreate"), + ("InstanceUpdate", "ServiceConfigUpdate"), + ("InstanceSummary", "ServiceConfigSummary"), + ("Instance", "ServiceConfig"), # Must be last (most generic) + + # Wiring field names + ("source_instance_id", "source_config_id"), + ("target_instance_id", "target_config_id"), + + # Variables and parameters + ("instance_id", "config_id"), + ("instances_api", "svc_configs_api"), + ("instancesApi", "svcConfigsApi"), + ("consumer_instance_id", "consumer_config_id"), + ("provider_instance", "provider_config"), + ("instance_configs", "service_configs"), + ("_instances", "_service_configs"), + + # Files and paths + ("instances.yaml", "service_configs.yaml"), + ("instance.py", "service_config.py"), + ("instance_manager.py", "service_config_manager.py"), + + # API paths (be careful with these) + ("/api/instances", "/api/svc-configs"), + ("InstancesPage", "ServiceConfigsPage"), + + # Comments and docstrings + ("instance of a template", "service configuration of a template"), + ("An instance", "A service config"), + ("the instance", "the service config"), +] + +# Files to exclude from renaming +EXCLUDE_PATTERNS = [ + "*.pyc", + "__pycache__", + "node_modules", + ".git", + "*.bak", + ".env", + ".venv", + "venv", + "REFACTORING_PLAN.md", + "UNIFIED_CONFIG_ARCHITECTURE.md", + "ARCHITECTURE_OVERVIEW.md", + "rename_to_service_config.py", # This script itself! 
+] + +# File extensions to process +INCLUDE_EXTENSIONS = [ + ".py", + ".ts", + ".tsx", + ".yaml", + ".yml", + ".md", + ".json", +] + + +def should_process_file(file_path: Path) -> bool: + """Check if file should be processed.""" + # Check extension + if file_path.suffix not in INCLUDE_EXTENSIONS: + return False + + # Check exclude patterns + for pattern in EXCLUDE_PATTERNS: + if pattern in str(file_path): + return False + + return True + + +def find_files_to_process() -> List[Path]: + """Find all files that should be processed.""" + files = [] + + # Backend Python files + backend_dir = ROOT / "ushadow" / "backend" / "src" + if backend_dir.exists(): + for file_path in backend_dir.rglob("*"): + if file_path.is_file() and should_process_file(file_path): + files.append(file_path) + + # Frontend TypeScript files + frontend_dir = ROOT / "ushadow" / "frontend" / "src" + if frontend_dir.exists(): + for file_path in frontend_dir.rglob("*"): + if file_path.is_file() and should_process_file(file_path): + files.append(file_path) + + # Config files + config_dir = ROOT / "config" + if config_dir.exists(): + for file_path in config_dir.rglob("*.yaml"): + if file_path.is_file() and should_process_file(file_path): + files.append(file_path) + for file_path in config_dir.rglob("*.yml"): + if file_path.is_file() and should_process_file(file_path): + files.append(file_path) + + return files + + +def apply_renames_to_content(content: str) -> Tuple[str, int]: + """Apply all rename rules to content. Returns (new_content, num_changes).""" + new_content = content + total_changes = 0 + + for old, new in RENAMES: + # Count occurrences + count = new_content.count(old) + if count > 0: + new_content = new_content.replace(old, new) + total_changes += count + + return new_content, total_changes + + +def process_file(file_path: Path, dry_run: bool = False, backup: bool = False) -> Dict: + """Process a single file. 
Returns dict with stats.""" + result = { + "path": str(file_path), + "changes": 0, + "error": None, + } + + try: + # Read file + with open(file_path, 'r', encoding='utf-8') as f: + original_content = f.read() + + # Apply renames + new_content, num_changes = apply_renames_to_content(original_content) + + result["changes"] = num_changes + + # If no changes, skip + if num_changes == 0: + return result + + # If dry run, just report + if dry_run: + return result + + # Create backup if requested + if backup: + backup_path = file_path.with_suffix(file_path.suffix + '.bak') + shutil.copy2(file_path, backup_path) + + # Write changes + with open(file_path, 'w', encoding='utf-8') as f: + f.write(new_content) + + except Exception as e: + result["error"] = str(e) + + return result + + +def rename_files(dry_run: bool = False) -> List[Tuple[Path, Path]]: + """Rename files themselves. Returns list of (old_path, new_path) tuples.""" + renames = [] + + # Find files to rename + for old_pattern, new_pattern in RENAMES: + if not old_pattern.endswith('.py') and not old_pattern.endswith('.yaml'): + continue + + # Find all files matching old pattern + for root, dirs, files in os.walk(ROOT): + # Skip excluded directories + if any(excl in root for excl in EXCLUDE_PATTERNS): + continue + + for filename in files: + if old_pattern in filename: + old_path = Path(root) / filename + new_filename = filename.replace(old_pattern, new_pattern) + new_path = Path(root) / new_filename + + if old_path.exists(): + renames.append((old_path, new_path)) + + # Perform renames + if not dry_run: + for old_path, new_path in renames: + print(f" Renaming: {old_path.name} → {new_path.name}") + old_path.rename(new_path) + + return renames + + +def main(): + parser = argparse.ArgumentParser(description="Rename Instance → ServiceConfig") + parser.add_argument("--dry-run", action="store_true", help="Show changes without applying") + parser.add_argument("--backup", action="store_true", help="Create .bak files") + args = 
parser.parse_args() + + print("=" * 80) + print("Instance → ServiceConfig Renaming Script") + print("=" * 80) + + if args.dry_run: + print("\n[DRY RUN MODE - No changes will be made]\n") + + # Find files to process + print("\n1. Finding files to process...") + files = find_files_to_process() + print(f" Found {len(files)} files to process") + + # Process content + print("\n2. Processing file contents...") + results = [] + total_changes = 0 + + for file_path in files: + result = process_file(file_path, dry_run=args.dry_run, backup=args.backup) + if result["changes"] > 0 or result["error"]: + results.append(result) + total_changes += result["changes"] + + # Report changes + print(f"\n Files with changes: {len([r for r in results if r['changes'] > 0])}") + print(f" Total changes: {total_changes}") + + # Show files with most changes + if results: + print("\n Top files by number of changes:") + sorted_results = sorted(results, key=lambda r: r["changes"], reverse=True) + for result in sorted_results[:10]: + if result["changes"] > 0: + path = Path(result["path"]).relative_to(ROOT) + print(f" {result['changes']:4d} changes - {path}") + + # Report errors + errors = [r for r in results if r["error"]] + if errors: + print(f"\n ⚠️ Errors: {len(errors)}") + for result in errors: + path = Path(result["path"]).relative_to(ROOT) + print(f" {path}: {result['error']}") + + # Rename files themselves + print("\n3. 
Renaming files...") + file_renames = rename_files(dry_run=args.dry_run) + print(f" Files renamed: {len(file_renames)}") + + if file_renames: + print("\n Renamed files:") + for old_path, new_path in file_renames: + old_rel = old_path.relative_to(ROOT) + new_rel = new_path.relative_to(ROOT) + print(f" {old_rel}") + print(f" → {new_rel}") + + # Summary + print("\n" + "=" * 80) + print("Summary") + print("=" * 80) + print(f"Files processed: {len(files)}") + print(f"Files changed: {len([r for r in results if r['changes'] > 0])}") + print(f"Total changes: {total_changes}") + print(f"Files renamed: {len(file_renames)}") + print(f"Errors: {len(errors)}") + + if args.dry_run: + print("\n[DRY RUN - No actual changes made]") + print("Run without --dry-run to apply changes") + else: + print("\n✅ Renaming complete!") + if args.backup: + print(" Backup files (.bak) created") + print("\nNext steps:") + print("1. Test the backend: docker restart ushadow-purple-backend") + print("2. Test the frontend: npm run dev") + print("3. Check for any import errors") + print("4. Run tests") + + print("=" * 80) + + +if __name__ == "__main__": + main() diff --git a/test_k8s_deploy.py b/test_k8s_deploy.py new file mode 100644 index 00000000..c537e449 --- /dev/null +++ b/test_k8s_deploy.py @@ -0,0 +1,210 @@ +#!/usr/bin/env python3 +""" +Test script for Kubernetes deployment API. + +Tests the deployment flow end-to-end and helps debug issues. 
+""" + +import json +import requests +import sys +from pprint import pprint + +# Configuration +BASE_URL = "http://localhost:8400" +CLUSTER_ID = "003fd5798ebbea9f" +SERVICE_ID = "openmemory-compose:mem0-ui" +NAMESPACE = "ushadow" + +# Get auth token (adjust as needed) +def get_auth_token(): + """Get authentication token.""" + # For testing, you might need to adjust this based on your auth setup + # Option 1: Login + # response = requests.post(f"{BASE_URL}/api/auth/login", json={"username": "...", "password": "..."}) + # return response.json()["access_token"] + + # Option 2: Use existing token + # return "your-token-here" + + # Option 3: No auth (if auth is disabled for testing) + return None + +def test_get_available_services(): + """Test: Get list of available services.""" + print("\n" + "="*80) + print("TEST: Get Available Services") + print("="*80) + + response = requests.get(f"{BASE_URL}/api/kubernetes/services/available") + print(f"Status: {response.status_code}") + + if response.status_code == 200: + data = response.json() + services = data.get("services", []) + print(f"Found {len(services)} services") + + # Find mem0-ui + mem0_ui = next((s for s in services if s["service_name"] == "mem0-ui"), None) + if mem0_ui: + print("\nmem0-ui service:") + pprint(mem0_ui) + return mem0_ui + else: + print("ERROR: mem0-ui not found in services") + return None + else: + print(f"ERROR: {response.text}") + return None + +def test_get_env_config(service_name): + """Test: Get environment configuration for service.""" + print("\n" + "="*80) + print(f"TEST: Get Env Config for {service_name}") + print("="*80) + + response = requests.get(f"{BASE_URL}/api/services/{service_name}/env") + print(f"Status: {response.status_code}") + + if response.status_code == 200: + data = response.json() + print(f"Required env vars: {len(data.get('required_env_vars', []))}") + print(f"Optional env vars: {len(data.get('optional_env_vars', []))}") + return data + else: + print(f"ERROR: 
{response.text}") + return None + +def test_create_envmap(): + """Test: Create ConfigMap and Secret.""" + print("\n" + "="*80) + print("TEST: Create Envmap") + print("="*80) + + payload = { + "service_name": "mem0-ui", + "namespace": NAMESPACE, + "env_vars": { + "VITE_API_URL": "8765", + "API_URL": "http://mem0:8765" + } + } + + print("Request payload:") + pprint(payload) + + response = requests.post( + f"{BASE_URL}/api/kubernetes/{CLUSTER_ID}/envmap", + json=payload + ) + print(f"\nStatus: {response.status_code}") + + if response.status_code == 200: + data = response.json() + print("Response:") + pprint(data) + return data + else: + print(f"ERROR: {response.text}") + return None + +def test_deploy_service(): + """Test: Deploy service to Kubernetes.""" + print("\n" + "="*80) + print("TEST: Deploy Service") + print("="*80) + + payload = { + "service_id": SERVICE_ID, + "namespace": NAMESPACE + } + + print("Request payload:") + pprint(payload) + + response = requests.post( + f"{BASE_URL}/api/kubernetes/{CLUSTER_ID}/deploy", + json=payload + ) + print(f"\nStatus: {response.status_code}") + print("Response:") + + try: + data = response.json() + pprint(data) + + if response.status_code == 200: + print("\n✅ DEPLOYMENT SUCCESSFUL!") + return True + else: + print(f"\n❌ DEPLOYMENT FAILED") + print(f"Error: {data.get('detail', 'Unknown error')}") + return False + except Exception as e: + print(f"Failed to parse response: {e}") + print(response.text) + return False + +def check_backend_version(): + """Check if backend has latest code.""" + print("\n" + "="*80) + print("TEST: Check Backend Code Version") + print("="*80) + + # Try to deploy and check logs + print("Checking if image variables are resolved...") + print("Looking for: image=ghcr.io/ushadow-io/u-mem0-ui:latest (resolved)") + print("NOT: image=ghcr.io/ushadow-io/u-mem0-ui:${OPENMEMORY_IMAGE_TAG:-latest} (unresolved)") + print("\nCheck backend logs for 'Service definition:' line") + +def main(): + """Run all tests.""" 
+ print("="*80) + print("Kubernetes Deployment API Test Suite") + print("="*80) + print(f"Base URL: {BASE_URL}") + print(f"Cluster ID: {CLUSTER_ID}") + print(f"Service ID: {SERVICE_ID}") + print(f"Namespace: {NAMESPACE}") + + # Test 1: Check backend version + check_backend_version() + + # Test 2: Get available services + service = test_get_available_services() + if not service: + print("\n❌ Failed to get services") + sys.exit(1) + + # Test 3: Get env config + env_config = test_get_env_config("mem0-ui") + if not env_config: + print("\n❌ Failed to get env config") + sys.exit(1) + + # Test 4: Create envmap + envmap = test_create_envmap() + if not envmap: + print("\n❌ Failed to create envmap") + sys.exit(1) + + # Test 5: Deploy service + success = test_deploy_service() + + # Summary + print("\n" + "="*80) + print("TEST SUMMARY") + print("="*80) + if success: + print("✅ All tests passed - deployment successful!") + sys.exit(0) + else: + print("❌ Deployment failed - check logs above") + print("\nNext steps:") + print("1. Check backend logs: docker logs ushadow-purple-backend") + print("2. Check generated manifest: docker exec ushadow-purple-backend cat /tmp/k8s-manifests/{cluster-id}/ushadow/mem0-ui-deployment.yaml") + print("3. 
Check K8s deployment: kubectl get deployments,pods -n ushadow") + sys.exit(1) + +if __name__ == "__main__": + main() diff --git a/ushadow/backend/src/config/instances.yaml b/ushadow/backend/src/config/instances.yaml index db6b14b4..898b57e1 100644 --- a/ushadow/backend/src/config/instances.yaml +++ b/ushadow/backend/src/config/instances.yaml @@ -1,34 +1,52 @@ instances: openai-1: template_id: openai - name: oss - config: - model: gpt-oss + name: openai-1 deployment_target: cloud status: n/a - created_at: '2026-01-04T14:18:43.443875+00:00' - openai-2: - template_id: openai - name: openai-2 - config: - model: xdsdf - api_key: '123' - deployment_target: cloud - status: n/a - created_at: '2026-01-04T14:19:23.753674+00:00' - openai-3: - template_id: openai - name: red - deployment_target: cloud - status: n/a - created_at: '2026-01-08T16:10:24.972709+00:00' - whisper-local-1: - template_id: whisper-local - name: whisper-local-1 - deployment_target: local - created_at: '2026-01-17T17:06:53.998791+00:00' - ollama-2: - template_id: ollama - name: ollama-2 - deployment_target: local - created_at: '2026-01-17T17:35:48.517963+00:00' + created_at: '2026-01-15T20:36:55.092787+00:00' + openmemory-compose-mem0-unode-1768509418855: + template_id: openmemory-compose:mem0 + name: OpenMemory (ushadow-purple) + description: uNode deployment to ushadow-purple + deployment_target: ushadow-purple + status: stopped + created_at: '2026-01-15T20:36:58.858635+00:00' + deployment_id: 91f60bb2 + container_id: cf9ec7d53922a9d40babf69d5845c109b30d1931d74ca8bb1ebcf761232f6221 + container_name: mem0-91f60bb2 + openmemory-compose-mem0-unode-1768511009309: + template_id: openmemory-compose:mem0 + name: OpenMemory (ushadow-purple) + description: uNode deployment to ushadow-purple + deployment_target: ushadow-purple + status: error + created_at: '2026-01-15T21:03:29.314118+00:00' + error: 'Docker deployment failed: 500 Server Error for 
http+docker://localhost/v1.51/containers/c9ff4abaf1c15a28c7e6972004e1036b0f0a7f0559a4c8c81b4e6411b8941e10/start: + Internal Server Error ("failed to set up container networking: driver failed + programming external connectivity on endpoint mem0-1dce9e9c (66e4f0d502125429290ec142e2f2ac486d432ccf6d6ffa52ce8c671e1fdc2716): + Bind for 0.0.0.0:8765 failed: port is already allocated")' + openmemory-compose-mem0-unode-1768555072128: + template_id: openmemory-compose:mem0 + name: OpenMemory (ushadow-purple) + description: uNode deployment to ushadow-purple + deployment_target: ushadow-purple + status: stopped + outputs: + access_url: https://purple.spangled-kettle.ts.net/openmemory-compose:mem0 + created_at: '2026-01-16T09:17:52.180807+00:00' + deployment_id: 4716fbe0 + container_id: 1ccf6ebcf9fe3b7c45d3c702262742090239a1fa99a90945d86ecfc27cc2c281 + container_name: mem0-4716fbe0 + openmemory-compose-mem0-unode-1768555097629: + template_id: openmemory-compose:mem0 + name: OpenMemory (ushadow-purple) + description: uNode deployment to ushadow-purple + deployment_target: ushadow-purple + status: stopped + outputs: + access_url: https://purple.spangled-kettle.ts.net/openmemory-compose:mem0 + created_at: '2026-01-16T09:18:17.648798+00:00' + deployment_id: 13e2b5e8 + container_id: 749fdc055e33127a8befb7ee26b941a433e35a937ede432acecc1bd94cf97a25 + container_name: mem0-13e2b5e8 diff --git a/ushadow/backend/src/config/wiring.yaml b/ushadow/backend/src/config/wiring.yaml index b0c43821..bab4e340 100644 --- a/ushadow/backend/src/config/wiring.yaml +++ b/ushadow/backend/src/config/wiring.yaml @@ -1,17 +1,7 @@ defaults: {} wiring: -- id: 3cacc293 - source_instance_id: openai-2 +- id: 891f06d2 + source_instance_id: openai-1 source_capability: llm target_instance_id: openmemory-compose:mem0 target_capability: llm -- id: 19679d70 - source_instance_id: whisper-local-1 - source_capability: transcription - target_instance_id: chronicle-compose:chronicle-backend - target_capability: 
transcription -- id: 0bd39f1d - source_instance_id: ollama-2 - source_capability: llm - target_instance_id: chronicle-compose:chronicle-backend - target_capability: llm diff --git a/ushadow/backend/src/middleware/app_middleware.py b/ushadow/backend/src/middleware/app_middleware.py index fb93d08a..5b2be756 100644 --- a/ushadow/backend/src/middleware/app_middleware.py +++ b/ushadow/backend/src/middleware/app_middleware.py @@ -288,6 +288,33 @@ async def http_exception_handler(request: Request, exc: HTTPException): content={"detail": exc.detail} ) + @app.exception_handler(Exception) + async def general_exception_handler(request: Request, exc: Exception): + """ + Catch-all handler for any unhandled exceptions. + + Logs full stack trace and returns structured error response. + This ensures all errors are visible in logs for debugging. + """ + import traceback + + # Log full error with stack trace + logger.error( + f"Unhandled exception in {request.method} {request.url.path}: " + f"{type(exc).__name__}: {exc}" + ) + logger.error(f"Stack trace:\n{traceback.format_exc()}") + + # Return structured error response + return JSONResponse( + status_code=500, + content={ + "detail": str(exc), + "error_type": type(exc).__name__, + "error_category": "internal_error" + } + ) + def setup_middleware(app: FastAPI) -> None: """Set up all middleware for the FastAPI application.""" From 24bde82dd0bf06fad034cb66b792ddd01a2d54f5 Mon Sep 17 00:00:00 2001 From: Stu Alexander Date: Sat, 17 Jan 2026 17:42:12 +0000 Subject: [PATCH 04/45] refectored instances and k8s deployds --- FIX_UV_DNS.md | 177 ++++++ ushadow/backend/main.py | 4 +- ushadow/backend/src/models/deployment.py | 4 +- ushadow/backend/src/models/integration.py | 4 +- ushadow/backend/src/models/kubernetes.py | 6 + .../models/{instance.py => service_config.py} | 64 ++- ushadow/backend/src/routers/kubernetes.py | 48 +- .../{instances.py => service_configs.py} | 160 +++--- ushadow/backend/src/routers/services.py | 4 +- 
.../src/services/capability_resolver.py | 252 ++++++-- .../backend/src/services/compose_registry.py | 2 +- .../src/services/deployment_backends.py | 2 +- .../src/services/deployment_manager.py | 33 +- .../backend/src/services/docker_manager.py | 69 ++- .../src/services/integration_operations.py | 90 +-- .../src/services/kubernetes_manager.py | 57 +- ...e_manager.py => service_config_manager.py} | 435 ++++++++------ .../src/services/service_orchestrator.py | 4 +- .../backend/tests/test_instance_manager.py | 538 ----------------- .../backend/tests/test_instances_router.py | 544 ------------------ ushadow/frontend/src/App.tsx | 4 +- .../src/components/DeployToK8sModal.tsx | 21 +- .../frontend/src/components/EnvVarEditor.tsx | 6 +- .../src/components/TailscaleOriginBanner.tsx | 105 ++++ .../frontend/src/components/layout/Layout.tsx | 6 +- .../src/components/services/ServiceCard.tsx | 6 +- .../services/ServiceCategoryList.tsx | 6 +- .../components/services/ServiceConfigForm.tsx | 4 +- .../components/services/ServiceStatsCards.tsx | 2 +- .../services/ServiceStatusBadge.tsx | 4 +- .../src/components/wiring/WiringBoard.tsx | 32 +- .../frontend/src/contexts/ServicesContext.tsx | 26 +- .../frontend/src/hooks/useServiceStatus.ts | 6 +- ushadow/frontend/src/pages/N8NPage.tsx | 2 +- ...stancesPage.tsx => ServiceConfigsPage.tsx} | 370 ++++++------ ushadow/frontend/src/services/api.ts | 104 ++-- 36 files changed, 1397 insertions(+), 1804 deletions(-) create mode 100644 FIX_UV_DNS.md rename ushadow/backend/src/models/{instance.py => service_config.py} (77%) rename ushadow/backend/src/routers/{instances.py => service_configs.py} (84%) rename ushadow/backend/src/services/{instance_manager.py => service_config_manager.py} (63%) delete mode 100644 ushadow/backend/tests/test_instance_manager.py delete mode 100644 ushadow/backend/tests/test_instances_router.py create mode 100644 ushadow/frontend/src/components/TailscaleOriginBanner.tsx rename 
ushadow/frontend/src/pages/{InstancesPage.tsx => ServiceConfigsPage.tsx} (91%) diff --git a/FIX_UV_DNS.md b/FIX_UV_DNS.md new file mode 100644 index 00000000..d07806da --- /dev/null +++ b/FIX_UV_DNS.md @@ -0,0 +1,177 @@ +# Fix for uv DNS Issues in Kubernetes + +## Problem + +Chronicle backend uses `uv 0.6.10` (Rust-based package manager) which fails to download packages from PyPI with error: + +``` +error: Failed to fetch: `https://pypi.org/simple/setuptools/` + Caused by: dns error: failed to lookup address information: Name has no usable address +``` + +## Root Cause + +Kubernetes adds search domains and `ndots:5` to `/etc/resolv.conf`: + +``` +search default.svc.cluster.local svc.cluster.local cluster.local communityfibre.co.uk +nameserver 10.152.183.10 +options ndots:5 +``` + +**Rust's DNS resolver (in uv 0.6.10) does NOT handle search domain expansion properly in dual-stack IPv6 environments.** + +## Proof + +When we remove search domains: +```bash +# Broken (with search domains) +uv pip install setuptools +# error: dns error: failed to lookup address information + +# Works (without search domains) +echo "nameserver 10.152.183.10" > /etc/resolv.conf +uv pip install setuptools +# ✅ Successfully downloaded setuptools +``` + +## Solution: Configure dnsPolicy for Chronicle + +Add `dnsPolicy: Default` to Chronicle deployment to bypass Kubernetes DNS and use node's `/etc/resolv.conf`: + +```yaml +apiVersion: v1 +kind: Pod +metadata: + name: chronicle-backend +spec: + dnsPolicy: Default # Use node DNS, not cluster DNS + containers: + - name: backend + image: ghcr.io/ushadow-io/chronicle/backend:nodeps1 + # ... 
rest of config +``` + +### Alternative: Custom DNS Config + +If you need cluster DNS for service discovery but want simpler config for external domains: + +```yaml +apiVersion: v1 +kind: Pod +metadata: + name: chronicle-backend +spec: + dnsPolicy: None + dnsConfig: + nameservers: + - 10.152.183.10 # CoreDNS + - 8.8.8.8 # Google DNS as fallback + options: + - name: ndots + value: "1" # Reduce from 5 to 1 +``` + +## Testing the Fix + +### Test with dnsPolicy: Default + +```bash +cat < bool: @@ -197,23 +197,23 @@ async def get_template( # ============================================================================= -# Instance Endpoints +# ServiceConfig Endpoints # ============================================================================= -@router.get("", response_model=List[InstanceSummary]) -async def list_instances( +@router.get("", response_model=List[ServiceConfigSummary]) +async def list_service_configs( current_user: dict = Depends(get_current_user), -) -> List[InstanceSummary]: +) -> List[ServiceConfigSummary]: """List all instances.""" - manager = get_instance_manager() - return manager.list_instances() + manager = get_service_config_manager() + return manager.list_service_configs() -@router.get("/{instance_id}", response_model=Instance) +@router.get("/{config_id}", response_model=ServiceConfig) async def get_instance( - instance_id: str, + config_id: str, current_user: dict = Depends(get_current_user), -) -> Instance: +) -> ServiceConfig: """Get an instance by ID. The config.values will only contain override values - values that differ @@ -221,13 +221,13 @@ async def get_instance( 1. OmegaConf.is_interpolation to identify inherited values (interpolations) 2. 
Comparison with template defaults for direct values """ - manager = get_instance_manager() - instance = manager.get_instance(instance_id) + manager = get_service_config_manager() + instance = manager.get_service_config(config_id) if not instance: - raise HTTPException(status_code=404, detail=f"Instance not found: {instance_id}") + raise HTTPException(status_code=404, detail=f"ServiceConfig not found: {config_id}") # Get raw overrides (non-interpolation values) - overrides = manager.get_config_overrides(instance_id) + overrides = manager.get_config_overrides(config_id) # For existing instances with direct values, also compare with template defaults # to filter out values that match the template @@ -267,11 +267,11 @@ async def get_instance( return instance -@router.post("", response_model=Instance) +@router.post("", response_model=ServiceConfig) async def create_instance( - data: InstanceCreate, + data: ServiceConfigCreate, current_user: dict = Depends(get_current_user), -) -> Instance: +) -> ServiceConfig: """Create a new instance from a template. 
Config values that match template defaults are filtered out, @@ -314,10 +314,10 @@ async def create_instance( # Fall back to using all provided config # Create instance with filtered config - manager = get_instance_manager() + manager = get_service_config_manager() try: # Create a modified data object with filtered config - from src.models.instance import InstanceCreate as IC + from src.models.service_config import ServiceConfigCreate as IC filtered_data = IC( id=data.id, template_id=data.template_id, @@ -331,18 +331,18 @@ async def create_instance( raise HTTPException(status_code=400, detail=str(e)) -@router.put("/{instance_id}", response_model=Instance) +@router.put("/{config_id}", response_model=ServiceConfig) async def update_instance( - instance_id: str, - data: InstanceUpdate, + config_id: str, + data: ServiceConfigUpdate, current_user: dict = Depends(get_current_user), -) -> Instance: +) -> ServiceConfig: """Update an instance. Config values that match template defaults are filtered out, so only actual overrides are stored. 
""" - manager = get_instance_manager() + manager = get_service_config_manager() # If config is being updated, filter to only include overrides if data.config is not None: @@ -350,8 +350,8 @@ async def update_instance( if filtered_config: try: - # Get the instance to find its template_id - instance = manager.get_instance(instance_id) + # Get the service config to find its template_id + instance = manager.get_service_config(config_id) if instance: from src.services.capability_resolver import get_capability_resolver settings = get_settings_store() @@ -382,7 +382,7 @@ async def update_instance( logger.debug(f"Could not filter against template defaults: {e}") # Create a modified data object with filtered config - from src.models.instance import InstanceUpdate as IU + from src.models.service_config import ServiceConfigUpdate as IU data = IU( name=data.name, description=data.description, @@ -391,52 +391,52 @@ async def update_instance( ) try: - return manager.update_instance(instance_id, data) + return manager.update_instance(config_id, data) except ValueError as e: raise HTTPException(status_code=404, detail=str(e)) -@router.delete("/{instance_id}") +@router.delete("/{config_id}") async def delete_instance( - instance_id: str, + config_id: str, current_user: dict = Depends(get_current_user), ) -> Dict[str, Any]: """Delete an instance.""" - manager = get_instance_manager() - if not manager.delete_instance(instance_id): - raise HTTPException(status_code=404, detail=f"Instance not found: {instance_id}") - return {"success": True, "message": f"Instance {instance_id} deleted"} + manager = get_service_config_manager() + if not manager.delete_instance(config_id): + raise HTTPException(status_code=404, detail=f"ServiceConfig not found: {config_id}") + return {"success": True, "message": f"ServiceConfig {config_id} deleted"} -@router.post("/{instance_id}/deploy") +@router.post("/{config_id}/deploy") async def deploy_instance( - instance_id: str, + config_id: str, current_user: 
dict = Depends(get_current_user), ) -> Dict[str, Any]: """Deploy/start an instance. For compose services, this starts the docker container. - For cloud providers, this marks the instance as active. + For cloud providers, this marks the service config as active. """ - manager = get_instance_manager() - success, message = await manager.deploy_instance(instance_id) + manager = get_service_config_manager() + success, message = await manager.deploy_instance(config_id) if not success: raise HTTPException(status_code=400, detail=message) return {"success": True, "message": message} -@router.post("/{instance_id}/undeploy") +@router.post("/{config_id}/undeploy") async def undeploy_instance( - instance_id: str, + config_id: str, current_user: dict = Depends(get_current_user), ) -> Dict[str, Any]: """Stop/undeploy an instance. For compose services, this stops the docker container. - For cloud providers, this marks the instance as inactive. + For cloud providers, this marks the service config as inactive. 
""" - manager = get_instance_manager() - success, message = await manager.undeploy_instance(instance_id) + manager = get_service_config_manager() + success, message = await manager.undeploy_instance(config_id) if not success: raise HTTPException(status_code=400, detail=message) return {"success": True, "message": message} @@ -451,7 +451,7 @@ async def list_wiring( current_user: dict = Depends(get_current_user), ) -> List[Wiring]: """List all wiring connections.""" - manager = get_instance_manager() + manager = get_service_config_manager() return manager.list_wiring() @@ -460,21 +460,21 @@ async def get_defaults( current_user: dict = Depends(get_current_user), ) -> Dict[str, str]: """Get default capability -> instance mappings.""" - manager = get_instance_manager() + manager = get_service_config_manager() return manager.get_defaults() @router.put("/wiring/defaults/{capability}") async def set_default( capability: str, - instance_id: str, + config_id: str, current_user: dict = Depends(get_current_user), ) -> Dict[str, Any]: """Set default instance for a capability.""" - manager = get_instance_manager() + manager = get_service_config_manager() try: - manager.set_default(capability, instance_id) - return {"success": True, "capability": capability, "instance_id": instance_id} + manager.set_default(capability, config_id) + return {"success": True, "capability": capability, "config_id": config_id} except ValueError as e: raise HTTPException(status_code=400, detail=str(e)) @@ -485,7 +485,7 @@ async def create_wiring( current_user: dict = Depends(get_current_user), ) -> Wiring: """Create a wiring connection.""" - manager = get_instance_manager() + manager = get_service_config_manager() try: return manager.create_wiring(data) except ValueError as e: @@ -498,32 +498,32 @@ async def delete_wiring( current_user: dict = Depends(get_current_user), ) -> Dict[str, Any]: """Delete a wiring connection.""" - manager = get_instance_manager() + manager = get_service_config_manager() if 
not manager.delete_wiring(wiring_id): raise HTTPException(status_code=404, detail=f"Wiring not found: {wiring_id}") return {"success": True, "message": f"Wiring {wiring_id} deleted"} -@router.get("/{instance_id}/wiring", response_model=List[Wiring]) +@router.get("/{config_id}/wiring", response_model=List[Wiring]) async def get_instance_wiring( - instance_id: str, + config_id: str, current_user: dict = Depends(get_current_user), ) -> List[Wiring]: """Get wiring connections for an instance.""" - manager = get_instance_manager() - instance = manager.get_instance(instance_id) + manager = get_service_config_manager() + instance = manager.get_service_config(config_id) if not instance: - raise HTTPException(status_code=404, detail=f"Instance not found: {instance_id}") - return manager.get_wiring_for_instance(instance_id) + raise HTTPException(status_code=404, detail=f"ServiceConfig not found: {config_id}") + return manager.get_wiring_for_instance(config_id) # ============================================================================= # Integration-Specific Endpoints # ============================================================================= -@router.post("/{instance_id}/test-connection") +@router.post("/{config_id}/test-connection") async def test_integration_connection( - instance_id: str, + config_id: str, current_user: dict = Depends(get_current_user), ) -> Dict[str, Any]: """ @@ -534,14 +534,14 @@ async def test_integration_connection( from src.services.integration_operations import get_integration_operations ops = get_integration_operations() - success, message = await ops.test_connection(instance_id) + success, message = await ops.test_connection(config_id) return {"success": success, "message": message} -@router.post("/{instance_id}/sync") +@router.post("/{config_id}/sync") async def trigger_integration_sync( - instance_id: str, + config_id: str, current_user: dict = Depends(get_current_user), ) -> Dict[str, Any]: """ @@ -552,7 +552,7 @@ async def 
trigger_integration_sync( from src.services.integration_operations import get_integration_operations ops = get_integration_operations() - result = await ops.sync_now(instance_id) + result = await ops.sync_now(config_id) if not result.get("success"): raise HTTPException(status_code=400, detail=result.get("error", "Sync failed")) @@ -560,9 +560,9 @@ async def trigger_integration_sync( return result -@router.get("/{instance_id}/sync-status") +@router.get("/{config_id}/sync-status") async def get_integration_sync_status( - instance_id: str, + config_id: str, current_user: dict = Depends(get_current_user), ) -> Dict[str, Any]: """ @@ -573,7 +573,7 @@ async def get_integration_sync_status( from src.services.integration_operations import get_integration_operations ops = get_integration_operations() - result = ops.get_sync_status(instance_id) + result = ops.get_sync_status(config_id) if "error" in result: raise HTTPException(status_code=404, detail=result["error"]) @@ -581,21 +581,21 @@ async def get_integration_sync_status( return result -@router.post("/{instance_id}/sync/enable") +@router.post("/{config_id}/sync/enable") async def enable_integration_auto_sync( - instance_id: str, + config_id: str, current_user: dict = Depends(get_current_user), ) -> Dict[str, Any]: """ Enable automatic syncing for an integration. Only works for integration instances (instances with integration_type set). - Requires sync_interval to be configured on the instance. + Requires sync_interval to be configured on the service config. 
""" from src.services.integration_operations import get_integration_operations ops = get_integration_operations() - success, message = await ops.enable_auto_sync(instance_id) + success, message = await ops.enable_auto_sync(config_id) if not success: raise HTTPException(status_code=400, detail=message) @@ -603,9 +603,9 @@ async def enable_integration_auto_sync( return {"success": True, "message": message} -@router.post("/{instance_id}/sync/disable") +@router.post("/{config_id}/sync/disable") async def disable_integration_auto_sync( - instance_id: str, + config_id: str, current_user: dict = Depends(get_current_user), ) -> Dict[str, Any]: """ @@ -616,7 +616,7 @@ async def disable_integration_auto_sync( from src.services.integration_operations import get_integration_operations ops = get_integration_operations() - success, message = await ops.disable_auto_sync(instance_id) + success, message = await ops.disable_auto_sync(config_id) if not success: raise HTTPException(status_code=400, detail=message) diff --git a/ushadow/backend/src/routers/services.py b/ushadow/backend/src/routers/services.py index 4144f3d0..611380ee 100644 --- a/ushadow/backend/src/routers/services.py +++ b/ushadow/backend/src/routers/services.py @@ -634,8 +634,8 @@ async def start_service( discovered = registry.get_service_by_name(name) service_id = discovered.service_id if discovered else name - # Pass full service_id as instance_id to enable wiring-aware env resolution - result = await orchestrator.start_service(name, instance_id=service_id) + # Pass full service_id as config_id to enable wiring-aware env resolution + result = await orchestrator.start_service(name, config_id=service_id) if not result.success and result.message in ["Service not found", "Operation not permitted"]: raise HTTPException(status_code=403, detail=result.message) diff --git a/ushadow/backend/src/services/capability_resolver.py b/ushadow/backend/src/services/capability_resolver.py index 64cbab51..76132364 100644 --- 
a/ushadow/backend/src/services/capability_resolver.py +++ b/ushadow/backend/src/services/capability_resolver.py @@ -83,7 +83,7 @@ async def resolve_for_service(self, service_id: str) -> Dict[str, str]: return env - async def resolve_for_instance(self, instance_id: str) -> Dict[str, str]: + async def resolve_for_instance(self, config_id: str) -> Dict[str, str]: """ Resolve all env vars for an instance or compose service, using its wiring configuration. @@ -91,7 +91,7 @@ async def resolve_for_instance(self, instance_id: str) -> Dict[str, str]: global selected_providers. Args: - instance_id: Instance identifier or compose service ID (e.g., "chronicle-compose:chronicle-backend") + config_id: ServiceConfig identifier or compose service ID (e.g., "chronicle-compose:chronicle-backend") Returns: Dict of ENV_VAR_NAME -> value @@ -99,36 +99,36 @@ async def resolve_for_instance(self, instance_id: str) -> Dict[str, str]: Raises: ValueError: If instance not found or required capability missing """ - from src.services.instance_manager import get_instance_manager + from src.services.service_config_manager import get_service_config_manager from src.services.compose_registry import get_compose_registry - instance_manager = get_instance_manager() - instance = instance_manager.get_instance(instance_id) + service_config_manager = get_service_config_manager() + instance = service_config_manager.get_service_config(config_id) # If not an instance, check if it's a compose service if not instance: # Try to get compose service registry = get_compose_registry() - compose_service = registry.get_service(instance_id) + compose_service = registry.get_service(config_id) if compose_service: # Resolve capabilities for compose service using wiring - return await self._resolve_for_compose_service(compose_service, instance_id) - raise ValueError(f"Instance or service '{instance_id}' not found") + return await self._resolve_for_compose_service(compose_service, config_id) + raise 
ValueError(f"ServiceConfig or service '{config_id}' not found") - # Get the service config from the instance's template + # Get the service config from the service config's template service_config = self._load_service_config(instance.template_id) if not service_config: raise ValueError( - f"Service template '{instance.template_id}' not found for instance '{instance_id}'" + f"Service template '{instance.template_id}' not found for instance '{config_id}'" ) env: Dict[str, str] = {} errors: List[str] = [] - # Resolve each capability, passing the instance ID for wiring lookup + # Resolve each capability, passing the service config ID for wiring lookup for use in service_config.get('uses', []): try: - capability_env = await self._resolve_capability(use, instance_id) + capability_env = await self._resolve_capability(use, config_id) env.update(capability_env) except ValueError as e: if use.get('required', True): @@ -147,11 +147,137 @@ async def resolve_for_instance(self, instance_id: str) -> Dict[str, str]: if errors: raise ValueError( - f"Instance '{instance_id}' has unresolved capabilities:\n" + f"ServiceConfig '{config_id}' has unresolved capabilities:\n" + "\n".join(f" - {e}" for e in errors) ) - logger.info(f"Resolved {len(env)} env vars for instance '{instance_id}'") + logger.info(f"Resolved {len(env)} env vars for instance '{config_id}'") + return env + + async def resolve_for_instance_with_sources(self, config_id: str) -> Dict[str, 'EnvVarValue']: + """ + Resolve all env vars for an instance WITH source tracking. 
+ + Args: + config_id: ServiceConfig identifier or compose service ID + + Returns: + Dict of ENV_VAR_NAME -> EnvVarValue (with value, source, source_path) + + Raises: + ValueError: If instance not found or required capability missing + """ + from src.models.service_config import EnvVarValue, EnvVarSource + from src.services.service_config_manager import get_service_config_manager + + service_config_manager = get_service_config_manager() + instance = service_config_manager.get_service_config(config_id) + + # If not an instance, check if it's a compose service + if not instance: + from src.services.compose_registry import get_compose_registry + registry = get_compose_registry() + compose_service = registry.get_service(config_id) + if compose_service: + # For compose services, use the non-source-tracked method for now + # TODO: Add source tracking for compose services + simple_env = await self._resolve_for_compose_service(compose_service, config_id) + return { + k: EnvVarValue(value=v, source=EnvVarSource.PROVIDER, source_path=None) + for k, v in simple_env.items() + } + raise ValueError(f"ServiceConfig or service '{config_id}' not found") + + # Get the service config from the service config's template + service_config = self._load_service_config(instance.template_id) + if not service_config: + raise ValueError( + f"Service template '{instance.template_id}' not found for instance '{config_id}'" + ) + + env: Dict[str, EnvVarValue] = {} + errors: List[str] = [] + + # Resolve each capability with source tracking + for use in service_config.get('uses', []): + try: + capability_env = await self._resolve_capability_with_sources(use, config_id) + env.update(capability_env) + except ValueError as e: + if use.get('required', True): + errors.append(str(e)) + else: + logger.warning(f"Optional capability failed: {e}") + + if errors: + raise ValueError( + f"ServiceConfig '{config_id}' has unresolved capabilities:\n" + + "\n".join(f" - {e}" for e in errors) + ) + + 
logger.info(f"Resolved {len(env)} env vars with sources for instance '{config_id}'") + return env + + async def _resolve_capability_with_sources( + self, + use: dict, + consumer_config_id: Optional[str] = None + ) -> Dict[str, 'EnvVarValue']: + """ + Resolve a single capability usage WITH source tracking. + + Args: + use: Dict with 'capability', 'required', 'env_mapping' + consumer_config_id: Optional instance ID if resolving for an instance + + Returns: + Dict of env vars with EnvVarValue objects + """ + from src.models.service_config import EnvVarValue, EnvVarSource + + capability = use['capability'] + env_mapping = use.get('env_mapping', {}) + + # Get the selected provider for this capability + provider, provider_config = await self._get_selected_provider(capability, consumer_config_id) + if not provider: + raise ValueError( + f"No provider selected for capability '{capability}'. " + f"Run the wizard or set selected_providers.{capability} in settings." + ) + + # Resolve each env mapping the provider offers + env: Dict[str, EnvVarValue] = {} + + for env_map in provider.env_maps: + # Resolve with source tracking + result = await self._resolve_env_map_with_source(env_map, provider_config) + + if result is None: + if env_map.required: + raise ValueError( + f"Provider '{provider.id}' requires {env_map.key} but it's not configured. " + f"Set {env_map.settings_path or env_map.key} in settings." 
+ ) + continue + + value, source, source_path = result + + # Use provider's env_var directly, apply service env_mapping only for overrides + provider_env = env_map.env_var or env_map.key.upper() + service_env = env_mapping.get(provider_env, provider_env) + + env[service_env] = EnvVarValue( + value=value, + source=source, + source_path=source_path + ) + + logger.debug( + f"Resolved {capability}.{env_map.key}: " + f"{provider_env} -> {service_env} = *** (source={source})" + ) + return env async def _resolve_for_compose_service(self, compose_service, service_id: str) -> Dict[str, str]: @@ -194,14 +320,14 @@ async def _resolve_for_compose_service(self, compose_service, service_id: str) - async def _resolve_capability( self, use: dict, - consumer_instance_id: Optional[str] = None + consumer_config_id: Optional[str] = None ) -> Dict[str, str]: """ Resolve a single capability usage. Args: use: Dict with 'capability', 'required', 'env_mapping' - consumer_instance_id: Optional instance ID if resolving for an instance + consumer_config_id: Optional instance ID if resolving for an instance Returns: Dict of env vars for this capability @@ -210,7 +336,7 @@ async def _resolve_capability( env_mapping = use.get('env_mapping', {}) # Get the selected provider for this capability (and instance if applicable) - provider, provider_instance = await self._get_selected_provider(capability, consumer_instance_id) + provider, provider_config = await self._get_selected_provider(capability, consumer_config_id) if not provider: raise ValueError( f"No provider selected for capability '{capability}'. 
" @@ -221,8 +347,8 @@ async def _resolve_capability( env: Dict[str, str] = {} for env_map in provider.env_maps: - # Pass provider_instance so we can check instance-specific config overrides - value = await self._resolve_env_map(env_map, provider_instance) + # Pass provider_config so we can check instance-specific config overrides + value = await self._resolve_env_map(env_map, provider_config) if value is None: if env_map.required: @@ -247,39 +373,39 @@ async def _resolve_capability( async def _get_selected_provider( self, capability: str, - consumer_instance_id: Optional[str] = None + consumer_config_id: Optional[str] = None ) -> tuple[Optional[Provider], Optional[any]]: """ Get the provider selected for a capability. Resolution order: - 1. Instance wiring (if consumer_instance_id provided) - 2. Instance defaults (if consumer_instance_id provided) + 1. ServiceConfig wiring (if consumer_config_id provided) + 2. ServiceConfig defaults (if consumer_config_id provided) 3. settings.selected_providers 4. Default based on wizard_mode Returns: - Tuple of (Provider, provider_instance) + Tuple of (Provider, provider_config) - Provider: The provider template - - provider_instance: The specific instance if wired, None if using global config + - provider_config: The specific instance if wired, None if using global config """ # 1. 
Check instance wiring and defaults (if resolving for an instance) - if consumer_instance_id: - from src.services.instance_manager import get_instance_manager - instance_manager = get_instance_manager() - provider_instance = instance_manager.get_provider_for_capability( - consumer_instance_id, capability + if consumer_config_id: + from src.services.service_config_manager import get_service_config_manager + service_config_manager = get_service_config_manager() + provider_config = service_config_manager.get_provider_for_capability( + consumer_config_id, capability ) - if provider_instance: + if provider_config: # The instance's template_id is the provider ID - provider = self._provider_registry.get_provider(provider_instance.template_id) + provider = self._provider_registry.get_provider(provider_config.template_id) if provider: logger.info( - f"Using wired provider instance '{provider_instance.id}' " - f"for {capability} (consumer={consumer_instance_id})" + f"Using wired provider instance '{provider_config.id}' " + f"for {capability} (consumer={consumer_config_id})" ) # Return both provider template AND instance for config override - return provider, provider_instance + return provider, provider_config # 2. Try to get explicit selection from settings selected = await self._settings.get(f"selected_providers.{capability}") @@ -303,30 +429,30 @@ async def _get_selected_provider( return None, None - async def _resolve_env_map(self, env_map, provider_instance=None) -> Optional[str]: + async def _resolve_env_map(self, env_map, provider_config=None) -> Optional[str]: """ Resolve an env mapping to its actual value. Priority: - 1. Instance-specific config override (if provider_instance provided) + 1. ServiceConfig-specific config override (if provider_config provided) 2. Settings path lookup (global config) 3. 
Default value (provider's default) Args: env_map: The environment map to resolve - provider_instance: Optional instance with config overrides + provider_config: Optional instance with config overrides """ # 1. Check instance-specific config override first - if provider_instance and hasattr(provider_instance, 'config'): - # instance.config is a Pydantic InstanceConfig model with values dict - config_values = provider_instance.config.values if provider_instance.config else {} + if provider_config and hasattr(provider_config, 'config'): + # instance.config is a Pydantic ConfigValues model with values dict + config_values = provider_config.config.values if provider_config.config else {} # The key in instance config matches the env_map.key (e.g., 'api_key') if env_map.key in config_values: value = config_values[env_map.key] if value: logger.info( f"[Capability Resolver] {env_map.key} -> {mask_if_secret(env_map.key, str(value))} " - f"(from instance '{provider_instance.id}' config override)" + f"(from instance '{provider_config.id}' config override)" ) return str(value) @@ -350,6 +476,50 @@ async def _resolve_env_map(self, env_map, provider_instance=None) -> Optional[st return None + async def _resolve_env_map_with_source(self, env_map, provider_config=None) -> Optional[tuple[str, str, Optional[str]]]: + """ + Resolve an env mapping to its actual value WITH source tracking. + + Returns: + Tuple of (value, source, source_path) or None if not resolved + - value: The resolved string value + - source: One of: "override", "settings", "default" + - source_path: Settings path or provider ID for the source + """ + from src.models.service_config import EnvVarSource + + # 1. 
Check instance-specific config override first + if provider_config and hasattr(provider_config, 'config'): + config_values = provider_config.config.values if provider_config.config else {} + if env_map.key in config_values: + value = config_values[env_map.key] + if value: + logger.info( + f"[Capability Resolver] {env_map.key} -> {mask_if_secret(env_map.key, str(value))} " + f"(from instance '{provider_config.id}' config override)" + ) + return (str(value), EnvVarSource.OVERRIDE.value, provider_config.id) + + # 2. Try settings path (global config) + if env_map.settings_path: + value = await self._settings.get(env_map.settings_path) + if value: + logger.info( + f"[Capability Resolver] {env_map.key} -> {mask_if_secret(env_map.key, str(value))} " + f"(from global settings: {env_map.settings_path})" + ) + return (str(value), EnvVarSource.SETTINGS.value, env_map.settings_path) + + # 3. Fall back to provider's default + if env_map.default is not None: + logger.info( + f"[Capability Resolver] {env_map.key} -> {mask_if_secret(env_map.key, env_map.default)} " + f"(using provider default)" + ) + return (env_map.default, EnvVarSource.DEFAULT.value, None) + + return None + async def _resolve_config_item(self, config: dict) -> Optional[str]: """Resolve a service-specific config item.""" import secrets diff --git a/ushadow/backend/src/services/compose_registry.py b/ushadow/backend/src/services/compose_registry.py index 3287f161..aa13b1f8 100644 --- a/ushadow/backend/src/services/compose_registry.py +++ b/ushadow/backend/src/services/compose_registry.py @@ -462,7 +462,7 @@ def resolve_env_vars( # ============================================================================ -# Global Instance +# Global ServiceConfig # ============================================================================ _registry: Optional[ComposeServiceRegistry] = None diff --git a/ushadow/backend/src/services/deployment_backends.py b/ushadow/backend/src/services/deployment_backends.py index 
4643a4ad..f1161b0e 100644 --- a/ushadow/backend/src/services/deployment_backends.py +++ b/ushadow/backend/src/services/deployment_backends.py @@ -393,7 +393,7 @@ async def deploy( "cluster_id": cluster_id, "namespace": namespace, "deployment_name": result["deployment_name"], - "instance_id": result["instance_id"], + "config_id": result["config_id"], } ) diff --git a/ushadow/backend/src/services/deployment_manager.py b/ushadow/backend/src/services/deployment_manager.py index 7ffe79f8..568fbd5a 100644 --- a/ushadow/backend/src/services/deployment_manager.py +++ b/ushadow/backend/src/services/deployment_manager.py @@ -129,7 +129,8 @@ async def close(self): async def resolve_service_for_deployment( self, - service_id: str + service_id: str, + config_id: Optional[str] = None ) -> "ResolvedServiceDefinition": """ Resolve all variables for a service using docker-compose config. @@ -139,13 +140,14 @@ async def resolve_service_for_deployment( Steps: 1. Get service from compose registry - 2. Get user's saved env configuration + 2. Get user's saved env configuration (from ServiceConfig if config_id provided) 3. Run `docker-compose -f config ` with resolved env vars 4. Parse the resolved YAML output (all ${VAR:-default} substituted) 5. 
Return ResolvedServiceDefinition with clean values Args: service_id: Service identifier (e.g., "openmemory-compose:mem0-ui") + config_id: Optional ServiceConfig ID to load env var overrides from Returns: ResolvedServiceDefinition with all variables resolved @@ -169,9 +171,10 @@ async def resolve_service_for_deployment( from src.services.docker_manager import get_docker_manager docker_manager = get_docker_manager() - # Build environment variables with user configuration + # Build environment variables with user configuration (including ServiceConfig overrides) subprocess_env, container_env = await docker_manager._build_env_vars_for_service( - service.service_name + service.service_name, + config_id=config_id ) # Get compose file path (DiscoveredService has compose_file as direct attribute) @@ -326,7 +329,7 @@ async def resolve_service_for_deployment( logger.info( f"Resolved service {service_id}: image={image}, " - f"ports={ports}, env_vars={len(environment)}" + f"ports={ports}, env_vars={len(environment)}, volumes={len(volumes)}" ) return resolved @@ -441,7 +444,7 @@ async def deploy_service( service_id: str, unode_hostname: str, namespace: Optional[str] = None, - instance_id: Optional[str] = None + config_id: Optional[str] = None ) -> Deployment: """ Deploy a service to any deployment target (Docker unode or K8s cluster). 
@@ -453,7 +456,7 @@ async def deploy_service( service_id: Service to deploy unode_hostname: Target unode hostname (Docker host or K8s cluster ID) namespace: Optional K8s namespace (only used for K8s deployments) - instance_id: Optional instance ID (for instance-based deployments) + config_id: Optional instance ID (for instance-based deployments) """ # Resolve service with all variables substituted try: @@ -474,23 +477,23 @@ async def deploy_service( unode = UNode(**unode_dict) # Check if already deployed - # If instance_id is provided, check for that specific instance + # If config_id is provided, check for that specific instance # Otherwise, check for any deployment of this service (legacy behavior) query = { "service_id": service_id, "unode_hostname": unode_hostname } - if instance_id: - query["instance_id"] = instance_id + if config_id: + query["config_id"] = config_id existing = await self.deployments_collection.find_one(query) if existing and existing.get("status") in [ DeploymentStatus.RUNNING, DeploymentStatus.DEPLOYING ]: - if instance_id: + if config_id: raise ValueError( - f"Instance {instance_id} already deployed to {unode_hostname}" + f"ServiceConfig {config_id} already deployed to {unode_hostname}" ) else: raise ValueError( @@ -555,8 +558,8 @@ async def deploy_service( namespace=namespace ) - # Set instance_id on the deployment - deployment.instance_id = instance_id + # Set config_id on the deployment + deployment.config_id = config_id # For Docker deployments, update tailscale serve routes if deployment.backend_type == "docker": @@ -593,7 +596,7 @@ async def deploy_service( id=deployment_id, service_id=service_id, unode_hostname=unode_hostname, - instance_id=instance_id, + config_id=config_id, status=DeploymentStatus.FAILED, created_at=datetime.now(timezone.utc), deployed_config=resolved_service.model_dump(), diff --git a/ushadow/backend/src/services/docker_manager.py b/ushadow/backend/src/services/docker_manager.py index bfe91c04..f3d35085 100644 --- 
a/ushadow/backend/src/services/docker_manager.py +++ b/ushadow/backend/src/services/docker_manager.py @@ -848,18 +848,18 @@ def check_port_conflicts(self, service_name: str) -> List[PortConflict]: return conflicts - async def start_service(self, service_name: str, instance_id: Optional[str] = None) -> tuple[bool, str]: + async def start_service(self, service_name: str, config_id: Optional[str] = None) -> tuple[bool, str]: """ Start a Docker service. Args: service_name: Name of the service to start - instance_id: Optional instance ID for wiring-aware env resolution + config_id: Optional instance ID for wiring-aware env resolution Returns: Tuple of (success: bool, message: str) """ - logger.info(f"start_service called with: {repr(service_name)}, instance_id={instance_id}") + logger.info(f"start_service called with: {repr(service_name)}, config_id={config_id}") # Validate service name first valid, error_msg = self.validate_service_name(service_name) @@ -887,7 +887,7 @@ async def start_service(self, service_name: str, instance_id: Optional[str] = No # Container doesn't exist - try to start via compose if compose_file is specified compose_file = self.MANAGEABLE_SERVICES[service_name].get("compose_file") if compose_file: - return await self._start_service_via_compose(service_name, compose_file, instance_id) + return await self._start_service_via_compose(service_name, compose_file, config_id) logger.error(f"Container not found for service: {service_name}") return False, "Service not found" @@ -963,7 +963,7 @@ async def _build_env_vars_from_compose_config( return resolved async def _build_env_vars_for_service( - self, service_name: str, instance_id: Optional[str] = None + self, service_name: str, config_id: Optional[str] = None ) -> tuple[Dict[str, str], Dict[str, str]]: """ Build environment variables for a service. 
@@ -973,7 +973,7 @@ async def _build_env_vars_for_service( Args: service_name: Name of the service - instance_id: Optional instance ID for wiring-aware resolution + config_id: Optional instance ID for wiring-aware resolution Returns: Tuple of (subprocess_env, container_env): @@ -1006,11 +1006,11 @@ async def _build_env_vars_for_service( # Get env vars from capability resolver # Capability resolver takes priority over compose config because: # - Wired provider instances may have custom config overrides - # - Instance-specific config should override global defaults + # - ServiceConfig-specific config should override global defaults try: - # Use instance-aware resolution if instance_id provided - if instance_id: - cap_env = await resolver.resolve_for_instance(instance_id) + # Use instance-aware resolution if config_id provided + if config_id: + cap_env = await resolver.resolve_for_instance(config_id) else: cap_env = await resolver.resolve_for_service(service_name) @@ -1029,6 +1029,41 @@ async def _build_env_vars_for_service( except Exception as e: logger.debug(f"CapabilityResolver fallback for {service_name}: {e}") + # Apply ServiceConfig-specific env var overrides (highest priority) + if config_id: + from src.services.service_config_manager import get_service_config_manager + sc_manager = get_service_config_manager() + service_config = sc_manager.get_service_config(config_id) + + if service_config and service_config.config.values: + for key, value in service_config.config.values.items(): + # Skip internal metadata fields (prefixed with _) + if key.startswith('_'): + continue + + # Handle _from_setting references + if isinstance(value, dict) and '_from_setting' in value: + # Resolve the setting path + from src.config.omegaconf_settings import get_settings_store + settings = get_settings_store() + setting_path = value['_from_setting'] + resolved_value = await settings.get(setting_path) + if resolved_value: + value = str(resolved_value) + else: + continue + + # Apply 
the override + if key in container_env and str(container_env[key]) != str(value): + old_val = mask_if_secret(key, container_env[key]) + new_val = mask_if_secret(key, value) + logger.info( + f"[ServiceConfig Override] {key}: {old_val} -> {new_val} " + f"(config_id={config_id})" + ) + container_env[key] = str(value) + subprocess_env[key] = str(value) + # Apply port overrides from services.{name}.ports from src.config.omegaconf_settings import get_settings_store settings = get_settings_store() @@ -1084,9 +1119,9 @@ async def _build_env_vars_for_service( logger.warning(f"Service {service_name}: {warning}") # Resolve all env vars for the container - # Use instance-aware resolution if instance_id provided - if instance_id: - container_env = await resolver.resolve_for_instance(instance_id) + # Use instance-aware resolution if config_id provided + if config_id: + container_env = await resolver.resolve_for_instance(config_id) else: container_env = await resolver.resolve_for_service(service_name) @@ -1095,7 +1130,7 @@ async def _build_env_vars_for_service( logger.info( f"Resolved {len(container_env)} env vars for {service_name} " - f"via capability resolver" + (f" (instance={instance_id})" if instance_id else "") + f"via capability resolver" + (f" (instance={config_id})" if config_id else "") ) except ValueError: @@ -1164,14 +1199,14 @@ async def _start_infra_services(self, infra_services: list[str]) -> tuple[bool, logger.error(f"Error starting infra services: {e}") return False, f"Failed to start infrastructure: {str(e)}" - async def _start_service_via_compose(self, service_name: str, compose_file: str, instance_id: Optional[str] = None) -> tuple[bool, str]: + async def _start_service_via_compose(self, service_name: str, compose_file: str, config_id: Optional[str] = None) -> tuple[bool, str]: """ Start a service using docker-compose. 
Args: service_name: Name of the service to start compose_file: Relative path to the compose file (from project root) - instance_id: Optional instance ID for wiring-aware env resolution + config_id: Optional instance ID for wiring-aware env resolution Returns: Tuple of (success: bool, message: str) @@ -1232,7 +1267,7 @@ async def _start_service_via_compose(self, service_name: str, compose_file: str, # Build environment variables from service configuration # All env vars are passed via subprocess_env for compose ${VAR} substitution - subprocess_env, container_env = await self._build_env_vars_for_service(service_name, instance_id) + subprocess_env, container_env = await self._build_env_vars_for_service(service_name, config_id) # Suppress orphan warnings when running services from different compose files # in the same project namespace (e.g., chronicle + main backend share auth) diff --git a/ushadow/backend/src/services/integration_operations.py b/ushadow/backend/src/services/integration_operations.py index 7a22db0f..b7c0b08b 100644 --- a/ushadow/backend/src/services/integration_operations.py +++ b/ushadow/backend/src/services/integration_operations.py @@ -2,18 +2,18 @@ Integration Operations Service Handles integration-specific operations like connection testing, syncing, and state management. -This service works with the InstanceManager for CRUD operations and adds integration-specific logic. +This service works with the ServiceConfigManager for CRUD operations and adds integration-specific logic. 
""" import logging from datetime import datetime, timedelta, timezone from typing import Dict, Any, Tuple, Optional -from src.models.instance import Instance +from src.models.service_config import ServiceConfig from src.models.integration import IntegrationConfig, ConnectionConfig, AuthConfig, AuthMethod from src.memory.adapters.base import MemoryAdapter from src.memory.adapters.factory import AdapterFactory -from .instance_manager import InstanceManager, get_instance_manager +from .service_config_manager import ServiceConfigManager, get_service_config_manager logger = logging.getLogger(__name__) @@ -28,31 +28,31 @@ class IntegrationOperations: - State tracking """ - def __init__(self, instance_manager: Optional[InstanceManager] = None): + def __init__(self, instance_manager: Optional[ServiceConfigManager] = None): """ Initialize integration operations. Args: - instance_manager: Instance manager (defaults to singleton) + instance_manager: ServiceConfig manager (defaults to singleton) """ - self.instance_manager = instance_manager or get_instance_manager() + self.instance_manager = instance_manager or get_service_config_manager() - async def test_connection(self, instance_id: str) -> Tuple[bool, str]: + async def test_connection(self, config_id: str) -> Tuple[bool, str]: """ Test connection to an integration. 
Args: - instance_id: ID of the integration instance + config_id: ID of the integration instance Returns: Tuple of (success: bool, message: str) """ - instance = self.instance_manager.get_instance(instance_id) + instance = self.instance_manager.get_service_config(config_id) if not instance: - return False, f"Instance '{instance_id}' not found" + return False, f"ServiceConfig '{config_id}' not found" if not instance.integration_type: - return False, f"Instance '{instance_id}' is not an integration" + return False, f"ServiceConfig '{config_id}' is not an integration" try: # Create adapter for this integration @@ -62,22 +62,22 @@ async def test_connection(self, instance_id: str) -> Tuple[bool, str]: success = await adapter.test_connection() if success: - logger.info(f"Connection test successful for integration '{instance_id}'") + logger.info(f"Connection test successful for integration '{config_id}'") return True, "Connection successful" else: - logger.warning(f"Connection test failed for integration '{instance_id}'") + logger.warning(f"Connection test failed for integration '{config_id}'") return False, "Connection failed" except Exception as e: - logger.error(f"Connection test error for '{instance_id}': {e}") + logger.error(f"Connection test error for '{config_id}': {e}") return False, f"Connection error: {str(e)}" - async def sync_now(self, instance_id: str) -> Dict[str, Any]: + async def sync_now(self, config_id: str) -> Dict[str, Any]: """ Trigger immediate sync for an integration. 
Args: - instance_id: ID of the integration instance + config_id: ID of the integration instance Returns: Dict with sync results: { @@ -87,19 +87,19 @@ async def sync_now(self, instance_id: str) -> Dict[str, Any]: "error": str (if failed) } """ - instance = self.instance_manager.get_instance(instance_id) + instance = self.instance_manager.get_service_config(config_id) if not instance: - return {"success": False, "error": f"Instance '{instance_id}' not found"} + return {"success": False, "error": f"ServiceConfig '{config_id}' not found"} if not instance.integration_type: - return {"success": False, "error": f"Instance '{instance_id}' is not an integration"} + return {"success": False, "error": f"ServiceConfig '{config_id}' is not an integration"} # Update status to syncing instance.last_sync_status = "in_progress" - self.instance_manager._save_instances() + self.instance_manager._save_service_configs() try: - logger.info(f"Starting sync for integration '{instance_id}'") + logger.info(f"Starting sync for integration '{config_id}'") # Create adapter and fetch items adapter = await self._create_adapter(instance) @@ -125,9 +125,9 @@ async def sync_now(self, instance_id: str) -> Dict[str, Any]: if instance.sync_enabled and instance.sync_interval: instance.next_sync_at = now + timedelta(seconds=instance.sync_interval) - self.instance_manager._save_instances() + self.instance_manager._save_service_configs() - logger.info(f"Sync completed for '{instance_id}': {saved_count} items") + logger.info(f"Sync completed for '{config_id}': {saved_count} items") return { "success": True, @@ -136,34 +136,34 @@ async def sync_now(self, instance_id: str) -> Dict[str, Any]: } except Exception as e: - logger.error(f"Sync failed for '{instance_id}': {e}") + logger.error(f"Sync failed for '{config_id}': {e}") # Update error status instance.last_sync_status = "error" instance.last_sync_error = str(e) - self.instance_manager._save_instances() + self.instance_manager._save_service_configs() 
return { "success": False, "error": str(e) } - async def enable_auto_sync(self, instance_id: str) -> Tuple[bool, str]: + async def enable_auto_sync(self, config_id: str) -> Tuple[bool, str]: """ Enable automatic syncing for an integration. Args: - instance_id: ID of the integration instance + config_id: ID of the integration instance Returns: Tuple of (success: bool, message: str) """ - instance = self.instance_manager.get_instance(instance_id) + instance = self.instance_manager.get_service_config(config_id) if not instance: - return False, f"Instance '{instance_id}' not found" + return False, f"ServiceConfig '{config_id}' not found" if not instance.integration_type: - return False, f"Instance '{instance_id}' is not an integration" + return False, f"ServiceConfig '{config_id}' is not an integration" if not instance.sync_interval: return False, "Sync interval not configured. Set sync_interval first." @@ -177,51 +177,51 @@ async def enable_auto_sync(self, instance_id: str) -> Tuple[bool, str]: # Never synced, schedule for now + interval instance.next_sync_at = datetime.now(timezone.utc) + timedelta(seconds=instance.sync_interval) - self.instance_manager._save_instances() + self.instance_manager._save_service_configs() - logger.info(f"Auto-sync enabled for '{instance_id}', interval: {instance.sync_interval}s") + logger.info(f"Auto-sync enabled for '{config_id}', interval: {instance.sync_interval}s") return True, "Auto-sync enabled" - async def disable_auto_sync(self, instance_id: str) -> Tuple[bool, str]: + async def disable_auto_sync(self, config_id: str) -> Tuple[bool, str]: """ Disable automatic syncing for an integration. 
Args: - instance_id: ID of the integration instance + config_id: ID of the integration instance Returns: Tuple of (success: bool, message: str) """ - instance = self.instance_manager.get_instance(instance_id) + instance = self.instance_manager.get_service_config(config_id) if not instance: - return False, f"Instance '{instance_id}' not found" + return False, f"ServiceConfig '{config_id}' not found" if not instance.integration_type: - return False, f"Instance '{instance_id}' is not an integration" + return False, f"ServiceConfig '{config_id}' is not an integration" instance.sync_enabled = False instance.next_sync_at = None - self.instance_manager._save_instances() + self.instance_manager._save_service_configs() - logger.info(f"Auto-sync disabled for '{instance_id}'") + logger.info(f"Auto-sync disabled for '{config_id}'") return True, "Auto-sync disabled" - def get_sync_status(self, instance_id: str) -> Dict[str, Any]: + def get_sync_status(self, config_id: str) -> Dict[str, Any]: """ Get current sync status for an integration. 
Args: - instance_id: ID of the integration instance + config_id: ID of the integration instance Returns: Dict with sync status information """ - instance = self.instance_manager.get_instance(instance_id) + instance = self.instance_manager.get_service_config(config_id) if not instance: - return {"error": f"Instance '{instance_id}' not found"} + return {"error": f"ServiceConfig '{config_id}' not found"} if not instance.integration_type: - return {"error": f"Instance '{instance_id}' is not an integration"} + return {"error": f"ServiceConfig '{config_id}' is not an integration"} return { "integration_id": instance.id, @@ -235,7 +235,7 @@ def get_sync_status(self, instance_id: str) -> Dict[str, Any]: "next_sync_at": instance.next_sync_at.isoformat() if instance.next_sync_at else None, } - async def _create_adapter(self, instance: Instance) -> MemoryAdapter: + async def _create_adapter(self, instance: ServiceConfig) -> MemoryAdapter: """ Create appropriate memory adapter for an integration instance. diff --git a/ushadow/backend/src/services/kubernetes_manager.py b/ushadow/backend/src/services/kubernetes_manager.py index 903b4938..94e20596 100644 --- a/ushadow/backend/src/services/kubernetes_manager.py +++ b/ushadow/backend/src/services/kubernetes_manager.py @@ -623,6 +623,15 @@ async def compile_service_to_k8s( "annotations": spec.annotations }, "spec": { + # Use ClusterFirst for K8s service DNS resolution + "dnsPolicy": spec.dns_policy or "ClusterFirst", + # Fix ndots:5 breaking uv/Rust DNS while keeping ClusterFirst + # See: docs/IPV6_DNS_FIX.md for why ndots:1 is needed for uv + "dnsConfig": { + "options": [ + {"name": "ndots", "value": "1"} + ] + }, "containers": [{ "name": name, "image": image, @@ -759,17 +768,28 @@ async def ensure_namespace_exists( Returns True if namespace exists or was created successfully. 
""" + import asyncio + try: + logger.info(f"Getting K8s client for cluster {cluster_id}...") core_api, _ = self._get_kube_client(cluster_id) + logger.info(f"K8s client obtained successfully") - # Check if namespace exists + # Check if namespace exists (run in executor to avoid blocking) try: - core_api.read_namespace(name=namespace) + logger.info(f"Checking if namespace {namespace} exists...") + await asyncio.get_event_loop().run_in_executor( + None, + core_api.read_namespace, + namespace + ) logger.info(f"Namespace {namespace} already exists") return True except ApiException as e: + logger.info(f"Namespace check failed with status {e.status}: {e.reason}") if e.status == 404: # Namespace doesn't exist, create it + logger.info(f"Namespace {namespace} not found, creating...") namespace_manifest = { "apiVersion": "v1", "kind": "Namespace", @@ -780,15 +800,28 @@ async def ensure_namespace_exists( } } } - core_api.create_namespace(body=namespace_manifest) + await asyncio.get_event_loop().run_in_executor( + None, + core_api.create_namespace, + namespace_manifest + ) logger.info(f"Created namespace {namespace}") return True else: # Some other error occurred + logger.error(f"API error checking namespace: status={e.status}, reason={e.reason}") raise + except ApiException as e: + logger.error(f"K8s API exception in ensure_namespace_exists: {e}") + logger.error(f"Status: {e.status}, Reason: {e.reason}") + if hasattr(e, 'body'): + logger.error(f"Body: {e.body}") + raise except Exception as e: - logger.error(f"Error ensuring namespace exists: {e}") + logger.error(f"Error ensuring namespace exists: {type(e).__name__}: {e}") + import traceback + logger.error(f"Traceback: {traceback.format_exc()}") raise async def scan_cluster_for_infra_services( @@ -1166,10 +1199,24 @@ async def deploy_to_kubernetes( logger.info(f"Service definition: image={service_def.get('image')}, ports={service_def.get('ports')}") # Ensure namespace exists first - await 
self.ensure_namespace_exists(cluster_id, namespace) + logger.info(f"Ensuring namespace {namespace} exists...") + import asyncio + try: + await asyncio.wait_for( + self.ensure_namespace_exists(cluster_id, namespace), + timeout=15.0 # 15 second timeout for namespace check + ) + logger.info(f"Namespace {namespace} ready") + except asyncio.TimeoutError: + raise Exception( + f"Timeout connecting to Kubernetes cluster. " + f"The cluster may be unreachable. Check network connectivity and kubeconfig." + ) # Compile manifests + logger.info(f"Compiling K8s manifests for {service_name}...") manifests = await self.compile_service_to_k8s(service_def, namespace, k8s_spec) + logger.info(f"Manifests compiled successfully") # Log generated manifests for debugging logger.info(f"Generated manifests for {service_name}:") diff --git a/ushadow/backend/src/services/instance_manager.py b/ushadow/backend/src/services/service_config_manager.py similarity index 63% rename from ushadow/backend/src/services/instance_manager.py rename to ushadow/backend/src/services/service_config_manager.py index 36909e87..13040abc 100644 --- a/ushadow/backend/src/services/instance_manager.py +++ b/ushadow/backend/src/services/service_config_manager.py @@ -1,5 +1,5 @@ """ -Instance Manager - Manages service/provider instances and wiring. +ServiceConfig Manager - Manages service/provider instances and wiring. 
Handles: - Loading instances and wiring from config files @@ -17,14 +17,14 @@ import yaml from omegaconf import OmegaConf, DictConfig -from src.models.instance import ( - Instance, - InstanceConfig, - InstanceCreate, - InstanceOutputs, - InstanceStatus, - InstanceSummary, - InstanceUpdate, +from src.models.service_config import ( + ServiceConfig, + ConfigValues, + ServiceConfigCreate, + ServiceOutputs, + ServiceConfigStatus, + ServiceConfigSummary, + ServiceConfigUpdate, Template, TemplateSource, Wiring, @@ -46,7 +46,7 @@ def _get_config_dir() -> Path: current = Path(__file__).resolve() for parent in current.parents: candidate = parent / "config" - if candidate.exists() and (candidate / "instances.yaml").exists(): + if candidate.exists() and (candidate / "service_configs.yaml").exists(): return candidate # Also check parent (for repo root) candidate = parent.parent / "config" @@ -57,25 +57,69 @@ def _get_config_dir() -> Path: return Path(__file__).resolve().parents[4] / "config" -class InstanceManager: +class ServiceConfigManager: """ Manages instances and wiring. - Instances are stored in config/instances.yaml. + ServiceConfigs are stored in config/service_configs.yaml. Wiring is stored in config/wiring.yaml. 
""" def __init__(self, config_dir: Optional[Path] = None): self.config_dir = config_dir or _get_config_dir() - self.instances_path = self.config_dir / "instances.yaml" + self.instances_path = self.config_dir / "service_configs.yaml" self.wiring_path = self.config_dir / "wiring.yaml" - self._instances: Dict[str, Instance] = {} - self._instance_configs: Dict[str, DictConfig] = {} # Raw OmegaConf configs (preserves interpolations) - self._wiring: List[Wiring] = {} + # Dual storage: ServiceConfig objects for runtime, DictConfig for persistence + self._service_configs: Dict[str, ServiceConfig] = {} # Resolved configs (for runtime use) + self._omegaconf_configs: Dict[str, DictConfig] = {} # Raw configs with interpolations (for saving) + self._wiring: List[Wiring] = [] self._defaults: Dict[str, str] = {} # capability -> default instance self._loaded = False + def _parse_service_outputs(self, outputs_data: Dict[str, Any]) -> ServiceOutputs: + """ + Parse ServiceOutputs from YAML data, handling both old and new formats. 
+ + Old format: env_vars is Dict[str, str] + New format: env_vars is Dict[str, EnvVarValue] with value/source/source_path + + Args: + outputs_data: Raw outputs dict from YAML + + Returns: + ServiceOutputs with properly typed env_vars + """ + from src.models.service_config import EnvVarValue, EnvVarSource + + env_vars_raw = outputs_data.get('env_vars', {}) + env_vars_parsed = {} + + for key, value in env_vars_raw.items(): + if isinstance(value, dict) and 'value' in value: + # New format: already has EnvVarValue structure + env_vars_parsed[key] = EnvVarValue( + value=value['value'], + source=value.get('source', EnvVarSource.DEFAULT), + source_path=value.get('source_path') + ) + elif isinstance(value, str): + # Old format: just a string value + # Assume it came from default/provider (we don't know the exact source) + env_vars_parsed[key] = EnvVarValue( + value=value, + source=EnvVarSource.DEFAULT, + source_path=None + ) + else: + logger.warning(f"Unexpected env_vars format for {key}: {type(value)}") + + return ServiceOutputs( + access_url=outputs_data.get('access_url'), + env_vars=env_vars_parsed, + capability_values=outputs_data.get('capability_values', {}) + ) + def _ensure_loaded(self) -> None: """Ensure config is loaded.""" if not self._loaded: @@ -83,49 +127,47 @@ def _ensure_loaded(self) -> None: def _load(self) -> None: """Load instances and wiring from config files.""" - self._load_instances() + self._load_service_configs() self._load_wiring() self._loaded = True - def _load_instances(self) -> None: - """Load instances from instances.yaml using OmegaConf to preserve interpolations.""" - self._instances = {} - self._instance_configs = {} + def _load_service_configs(self) -> None: + """Load instances from service_configs.yaml using OmegaConf to preserve interpolations.""" + self._service_configs = {} + self._omegaconf_configs = {} if not self.instances_path.exists(): logger.debug(f"No instances file at {self.instances_path}") return try: - # Load with OmegaConf 
to preserve interpolations + # Load with OmegaConf to preserve interpolations in raw format raw_cfg = OmegaConf.load(self.instances_path) instances_data = raw_cfg.get('instances', {}) or {} - for instance_id, instance_data in instances_data.items(): + for config_id, instance_data in instances_data.items(): if instance_data is None: continue - # Store raw OmegaConf config to check for interpolations later + # Get config data - store both raw (with interpolations) and resolved config_data = instance_data.get('config', {}) + + # Store raw OmegaConf config (preserves interpolations like ${api_keys.openai}) if config_data: - self._instance_configs[instance_id] = OmegaConf.create(config_data) + self._omegaconf_configs[config_id] = OmegaConf.create(config_data) - # Resolve values for the Instance object (resolves interpolations) + # Resolve values for the ServiceConfig object (for runtime use) resolved_config = OmegaConf.to_container(config_data, resolve=True) if config_data else {} - instance = Instance( - id=instance_id, + instance = ServiceConfig( + id=config_id, template_id=instance_data.get('template_id', ''), - name=instance_data.get('name', instance_id), + name=instance_data.get('name', config_id), description=instance_data.get('description'), - config=InstanceConfig(values=resolved_config), + config=ConfigValues(values=resolved_config), deployment_target=instance_data.get('deployment_target'), - status=InstanceStatus(instance_data.get('status', 'pending')), - outputs=InstanceOutputs( - access_url=instance_data.get('outputs', {}).get('access_url') if instance_data.get('outputs') else None, - env_vars=instance_data.get('outputs', {}).get('env_vars', {}) if instance_data.get('outputs') else {}, - capability_values=instance_data.get('outputs', {}).get('capability_values', {}) if instance_data.get('outputs') else {}, - ), + status=ServiceConfigStatus(instance_data.get('status', 'pending')), + outputs=self._parse_service_outputs(instance_data.get('outputs', {})), 
created_at=instance_data.get('created_at'), deployed_at=instance_data.get('deployed_at'), updated_at=instance_data.get('updated_at'), @@ -144,9 +186,9 @@ def _load_instances(self) -> None: last_sync_error=instance_data.get('last_sync_error'), next_sync_at=instance_data.get('next_sync_at'), ) - self._instances[instance_id] = instance + self._service_configs[config_id] = instance - logger.info(f"Loaded {len(self._instances)} instances") + logger.info(f"Loaded {len(self._service_configs)} instances") except Exception as e: logger.error(f"Failed to load instances: {e}") @@ -171,9 +213,9 @@ def _load_wiring(self) -> None: for wire_data in data.get('wiring', []) or []: wire = Wiring( id=wire_data.get('id', str(uuid.uuid4())[:8]), - source_instance_id=wire_data['source_instance_id'], + source_config_id=wire_data['source_config_id'], source_capability=wire_data['source_capability'], - target_instance_id=wire_data['target_instance_id'], + target_config_id=wire_data['target_config_id'], target_capability=wire_data['target_capability'], created_at=wire_data.get('created_at'), ) @@ -184,31 +226,46 @@ def _load_wiring(self) -> None: except Exception as e: logger.error(f"Failed to load wiring: {e}") - def _save_instances(self) -> None: - """Save instances to instances.yaml.""" + def _save_service_configs(self) -> None: + """Save instances to service_configs.yaml.""" data = {'instances': {}} - for instance_id, instance in self._instances.items(): + for config_id, instance in self._service_configs.items(): instance_data = { 'template_id': instance.template_id, 'name': instance.name, } if instance.description: instance_data['description'] = instance.description + + # Save config with interpolations preserved (if available) if instance.config.values: - instance_data['config'] = instance.config.values + # Use raw OmegaConf config to preserve interpolations like ${api_keys.openai} + if config_id in self._omegaconf_configs: + # Get unresolved config (preserves ${...} interpolations) + 
instance_data['config'] = OmegaConf.to_container( + self._omegaconf_configs[config_id], + resolve=False + ) + else: + # Fallback: no raw config available, save resolved values + instance_data['config'] = instance.config.values if instance.deployment_target: instance_data['deployment_target'] = instance.deployment_target - if instance.status != InstanceStatus.PENDING: + if instance.status != ServiceConfigStatus.PENDING: # Handle both enum and string status values - status_value = instance.status.value if isinstance(instance.status, InstanceStatus) else instance.status + status_value = instance.status.value if isinstance(instance.status, ServiceConfigStatus) else instance.status instance_data['status'] = status_value if instance.outputs.access_url or instance.outputs.env_vars: instance_data['outputs'] = {} if instance.outputs.access_url: instance_data['outputs']['access_url'] = instance.outputs.access_url if instance.outputs.env_vars: - instance_data['outputs']['env_vars'] = instance.outputs.env_vars + # Serialize EnvVarValue objects to dicts + instance_data['outputs']['env_vars'] = { + k: v.model_dump() if hasattr(v, 'model_dump') else v + for k, v in instance.outputs.env_vars.items() + } if instance.outputs.capability_values: instance_data['outputs']['capability_values'] = instance.outputs.capability_values if instance.created_at: @@ -242,12 +299,12 @@ def _save_instances(self) -> None: if instance.next_sync_at: instance_data['next_sync_at'] = instance.next_sync_at.isoformat() if isinstance(instance.next_sync_at, datetime) else instance.next_sync_at - data['instances'][instance_id] = instance_data + data['instances'][config_id] = instance_data try: with open(self.instances_path, 'w') as f: yaml.dump(data, f, default_flow_style=False, sort_keys=False) - logger.debug(f"Saved {len(self._instances)} instances") + logger.debug(f"Saved {len(self._service_configs)} instances") except Exception as e: logger.error(f"Failed to save instances: {e}") raise @@ -262,9 +319,9 @@ 
def _save_wiring(self) -> None: for wire in self._wiring: wire_data = { 'id': wire.id, - 'source_instance_id': wire.source_instance_id, + 'source_config_id': wire.source_config_id, 'source_capability': wire.source_capability, - 'target_instance_id': wire.target_instance_id, + 'target_config_id': wire.target_config_id, 'target_capability': wire.target_capability, } data['wiring'].append(wire_data) @@ -283,10 +340,10 @@ def reload(self) -> None: self._load() # ========================================================================= - # Instance Operations + # ServiceConfig Operations # ========================================================================= - def list_instances(self) -> List[InstanceSummary]: + def list_service_configs(self) -> List[ServiceConfigSummary]: """List all instances.""" self._ensure_loaded() @@ -295,14 +352,14 @@ def list_instances(self) -> List[InstanceSummary]: provider_registry = get_provider_registry() result = [] - for inst in self._instances.values(): + for inst in self._service_configs.values(): # Look up what capability this instance provides provides = None provider = provider_registry.get_provider(inst.template_id) if provider: provides = provider.capability - result.append(InstanceSummary( + result.append(ServiceConfigSummary( id=inst.id, template_id=inst.template_id, name=inst.name, @@ -314,140 +371,146 @@ def list_instances(self) -> List[InstanceSummary]: return result - def get_instance(self, instance_id: str) -> Optional[Instance]: - """Get an instance by ID.""" + def get_service_config(self, config_id: str) -> Optional[ServiceConfig]: + """Get a service config by ID.""" self._ensure_loaded() - return self._instances.get(instance_id) + return self._service_configs.get(config_id) - def get_config_overrides(self, instance_id: str) -> Dict[str, Any]: - """Get only the config values that are overrides (not interpolations). 
+ def get_config_overrides(self, config_id: str) -> Dict[str, Any]: + """Get the config values for this instance, excluding interpolations. - Uses OmegaConf.is_interpolation to distinguish between: - - Overrides: Direct values like "gpt-4o" - - Interpolations: Values like "${api_keys.openai_api_key}" that inherit from settings + Returns only explicitly set config values (direct overrides), filtering out + interpolations like ${api_keys.openai} which come from SettingsStore. - Returns only the override values for display in the UI. + This is useful for the UI to show only user-overridden values. """ self._ensure_loaded() - raw_config = self._instance_configs.get(instance_id) + # Check if we have raw OmegaConf config (with interpolations) + raw_config = self._omegaconf_configs.get(config_id) if not raw_config: - return {} + # No raw config, return all values + instance = self._service_configs.get(config_id) + if not instance: + return {} + return instance.config.values if instance.config else {} + # Filter out interpolations - return only direct values overrides = {} for key in raw_config: - # Check if this key is an interpolation + # OmegaConf.is_interpolation() detects ${...} references if not OmegaConf.is_interpolation(raw_config, key): - # It's a direct value (override), include it - try: - overrides[key] = OmegaConf.select(raw_config, key) - except Exception: - # If we can't resolve it, skip - pass + # This is a direct value (user override), not an interpolation + value = OmegaConf.select(raw_config, key) + # Convert OmegaConf containers to regular Python types for Pydantic serialization + if isinstance(value, (DictConfig, type(OmegaConf.create([])))): + value = OmegaConf.to_container(value, resolve=True) + overrides[key] = value return overrides - def create_instance(self, data: InstanceCreate) -> Instance: + def create_instance(self, data: ServiceConfigCreate) -> ServiceConfig: """Create a new instance.""" self._ensure_loaded() - if data.id in self._instances: 
- raise ValueError(f"Instance already exists: {data.id}") + if data.id in self._service_configs: + raise ValueError(f"ServiceConfig already exists: {data.id}") now = datetime.now(timezone.utc) # Determine initial status - status = InstanceStatus.PENDING + status = ServiceConfigStatus.PENDING if data.deployment_target == "cloud": - status = InstanceStatus.NOT_APPLICABLE + status = ServiceConfigStatus.NOT_APPLICABLE - instance = Instance( + instance = ServiceConfig( id=data.id, template_id=data.template_id, name=data.name, description=data.description, - config=InstanceConfig(values=data.config), + config=ConfigValues(values=data.config), deployment_target=data.deployment_target, status=status, created_at=now, updated_at=now, ) - self._instances[data.id] = instance - - # Also add to _instance_configs for OmegaConf-based override detection + # Store both resolved ServiceConfig and raw OmegaConf config + self._service_configs[data.id] = instance if data.config: - self._instance_configs[data.id] = OmegaConf.create(data.config) + # Store raw config to preserve interpolations for saving + self._omegaconf_configs[data.id] = OmegaConf.create(data.config) - self._save_instances() + self._save_service_configs() logger.info(f"Created instance: {data.id} (template: {data.template_id})") return instance - def update_instance(self, instance_id: str, data: InstanceUpdate) -> Instance: + def update_instance(self, config_id: str, data: ServiceConfigUpdate) -> ServiceConfig: """Update an instance.""" self._ensure_loaded() - instance = self._instances.get(instance_id) + instance = self._service_configs.get(config_id) if not instance: - raise ValueError(f"Instance not found: {instance_id}") + raise ValueError(f"ServiceConfig not found: {config_id}") if data.name is not None: instance.name = data.name if data.description is not None: instance.description = data.description if data.config is not None: - instance.config = InstanceConfig(values=data.config) - # Also update 
_instance_configs for OmegaConf-based override detection + instance.config = ConfigValues(values=data.config) + # Update raw OmegaConf config to preserve interpolations if data.config: - self._instance_configs[instance_id] = OmegaConf.create(data.config) - elif instance_id in self._instance_configs: - del self._instance_configs[instance_id] + self._omegaconf_configs[config_id] = OmegaConf.create(data.config) + elif config_id in self._omegaconf_configs: + # Config cleared, remove raw config too + del self._omegaconf_configs[config_id] if data.deployment_target is not None: instance.deployment_target = data.deployment_target instance.updated_at = datetime.now(timezone.utc) - self._save_instances() - logger.info(f"Updated instance: {instance_id}") + self._save_service_configs() + logger.info(f"Updated instance: {config_id}") return instance - def delete_instance(self, instance_id: str) -> bool: + def delete_instance(self, config_id: str) -> bool: """Delete an instance.""" self._ensure_loaded() - if instance_id not in self._instances: + if config_id not in self._service_configs: return False # Remove any wiring referencing this instance self._wiring = [ w for w in self._wiring - if w.source_instance_id != instance_id and w.target_instance_id != instance_id + if w.source_config_id != config_id and w.target_config_id != config_id ] - del self._instances[instance_id] + del self._service_configs[config_id] - # Also clean up _instance_configs - if instance_id in self._instance_configs: - del self._instance_configs[instance_id] + # Also clean up raw OmegaConf config + if config_id in self._omegaconf_configs: + del self._omegaconf_configs[config_id] - self._save_instances() + self._save_service_configs() self._save_wiring() - logger.info(f"Deleted instance: {instance_id}") + logger.info(f"Deleted instance: {config_id}") return True def update_instance_status( self, - instance_id: str, - status: InstanceStatus, + config_id: str, + status: ServiceConfigStatus, access_url: 
Optional[str] = None, error: Optional[str] = None, - ) -> Optional[Instance]: + ) -> Optional[ServiceConfig]: """Update instance status after deployment.""" self._ensure_loaded() - instance = self._instances.get(instance_id) + instance = self._service_configs.get(config_id) if not instance: return None @@ -457,13 +520,13 @@ def update_instance_status( if access_url: instance.outputs.access_url = access_url - if status == InstanceStatus.RUNNING: + if status == ServiceConfigStatus.RUNNING: instance.deployed_at = datetime.now(timezone.utc) - self._save_instances() + self._save_service_configs() return instance - async def deploy_instance(self, instance_id: str) -> tuple[bool, str]: + async def deploy_instance(self, config_id: str) -> tuple[bool, str]: """Deploy/start an instance. Routes deployment based on deployment_target: @@ -473,9 +536,9 @@ async def deploy_instance(self, instance_id: str) -> tuple[bool, str]: """ self._ensure_loaded() - instance = self._instances.get(instance_id) + instance = self._service_configs.get(config_id) if not instance: - return False, f"Instance not found: {instance_id}" + return False, f"ServiceConfig not found: {config_id}" # Get template to determine deployment type from src.services.compose_registry import get_compose_registry @@ -492,15 +555,15 @@ async def deploy_instance(self, instance_id: str) -> tuple[bool, str]: deployment_manager = get_deployment_manager() # Update status to deploying - instance.status = InstanceStatus.DEPLOYING - self._save_instances() + instance.status = ServiceConfigStatus.DEPLOYING + self._save_service_configs() try: # Deploy via deployment manager (creates Deployment record) deployment = await deployment_manager.deploy_service( service_id=compose_service.service_id, unode_hostname=instance.deployment_target, - instance_id=instance_id + config_id=config_id ) # Store deployment_id in instance @@ -511,23 +574,23 @@ async def deploy_instance(self, instance_id: str) -> tuple[bool, str]: # Update instance 
status based on deployment if deployment.status == "running": self.update_instance_status( - instance_id, - InstanceStatus.RUNNING, + config_id, + ServiceConfigStatus.RUNNING, access_url=deployment.access_url, ) return True, f"Service deployed to {instance.deployment_target}" else: self.update_instance_status( - instance_id, - InstanceStatus.DEPLOYING, + config_id, + ServiceConfigStatus.DEPLOYING, ) return True, f"Service deploying to {instance.deployment_target}" except Exception as e: - logger.exception(f"Failed to deploy instance {instance_id} to unode") + logger.exception(f"Failed to deploy instance {config_id} to unode") self.update_instance_status( - instance_id, - InstanceStatus.ERROR, + config_id, + ServiceConfigStatus.ERROR, error=str(e), ) return False, str(e) @@ -542,8 +605,8 @@ async def deploy_instance(self, instance_id: str) -> tuple[bool, str]: settings_store = get_settings_store() # Update status to deploying - instance.status = InstanceStatus.DEPLOYING - self._save_instances() + instance.status = ServiceConfigStatus.DEPLOYING + self._save_service_configs() # Use service_name (not template_id) for orchestrator calls service_name = compose_service.service_name @@ -563,7 +626,7 @@ async def deploy_instance(self, instance_id: str) -> tuple[bool, str]: logger.info(f"Remapped {conflict.env_var}: {conflict.port} -> {conflict.suggested_port}") try: - result = await orchestrator.start_service(service_name, instance_id=instance_id) + result = await orchestrator.start_service(service_name, config_id=config_id) if result.success: # Get the service status to find access URL status_info = await orchestrator.get_service_status(service_name) @@ -580,32 +643,32 @@ async def deploy_instance(self, instance_id: str) -> tuple[bool, str]: break self.update_instance_status( - instance_id, - InstanceStatus.RUNNING, + config_id, + ServiceConfigStatus.RUNNING, access_url=access_url, ) return True, f"Service {service_name} started" else: self.update_instance_status( - 
instance_id, - InstanceStatus.ERROR, + config_id, + ServiceConfigStatus.ERROR, error=result.message, ) return False, result.message except Exception as e: - logger.exception(f"Failed to deploy instance {instance_id}") + logger.exception(f"Failed to deploy instance {config_id}") self.update_instance_status( - instance_id, - InstanceStatus.ERROR, + config_id, + ServiceConfigStatus.ERROR, error=str(e), ) return False, str(e) else: # Cloud provider - mark as N/A (always available) - self.update_instance_status(instance_id, InstanceStatus.NOT_APPLICABLE) + self.update_instance_status(config_id, ServiceConfigStatus.NOT_APPLICABLE) return True, "Cloud provider instance activated" - async def undeploy_instance(self, instance_id: str) -> tuple[bool, str]: + async def undeploy_instance(self, config_id: str) -> tuple[bool, str]: """Stop/undeploy an instance. For compose services: stops the docker container @@ -613,9 +676,9 @@ async def undeploy_instance(self, instance_id: str) -> tuple[bool, str]: """ self._ensure_loaded() - instance = self._instances.get(instance_id) + instance = self._service_configs.get(config_id) if not instance: - return False, f"Instance not found: {instance_id}" + return False, f"ServiceConfig not found: {config_id}" # Get template to determine deployment type from src.services.compose_registry import get_compose_registry @@ -635,16 +698,16 @@ async def undeploy_instance(self, instance_id: str) -> tuple[bool, str]: try: result = orchestrator.stop_service(service_name) if result.success: - self.update_instance_status(instance_id, InstanceStatus.STOPPED) + self.update_instance_status(config_id, ServiceConfigStatus.STOPPED) return True, f"Service {service_name} stopped" else: return False, result.message except Exception as e: - logger.exception(f"Failed to undeploy instance {instance_id}") + logger.exception(f"Failed to undeploy instance {config_id}") return False, str(e) else: # Cloud provider - just mark as stopped - 
self.update_instance_status(instance_id, InstanceStatus.STOPPED) + self.update_instance_status(config_id, ServiceConfigStatus.STOPPED) return True, "Cloud provider instance deactivated" # ========================================================================= @@ -656,16 +719,16 @@ def list_wiring(self) -> List[Wiring]: self._ensure_loaded() return list(self._wiring) - def get_wiring_for_instance(self, instance_id: str) -> List[Wiring]: + def get_wiring_for_instance(self, config_id: str) -> List[Wiring]: """Get wiring connections where this instance is the target.""" self._ensure_loaded() - return [w for w in self._wiring if w.target_instance_id == instance_id] + return [w for w in self._wiring if w.target_config_id == config_id] def get_provider_for_capability( self, - consumer_instance_id: str, + consumer_config_id: str, capability: str - ) -> Optional[Instance]: + ) -> Optional[ServiceConfig]: """ Get the provider instance to use for a capability. @@ -678,26 +741,26 @@ def get_provider_for_capability( # 1. Check explicit wiring for this consumer for wiring in self._wiring: - if (wiring.target_instance_id == consumer_instance_id and + if (wiring.target_config_id == consumer_config_id and wiring.target_capability == capability): - provider_instance = self.get_instance(wiring.source_instance_id) - if provider_instance: + provider_config = self.get_service_config(wiring.source_config_id) + if provider_config: logger.info( - f"Resolved {capability} for {consumer_instance_id} " - f"via wiring -> {wiring.source_instance_id}" + f"Resolved {capability} for {consumer_config_id} " + f"via wiring -> {wiring.source_config_id}" ) - return provider_instance + return provider_config # 2. 
Check defaults - default_instance_id = self._defaults.get(capability) - if default_instance_id: - provider_instance = self.get_instance(default_instance_id) - if provider_instance: + default_config_id = self._defaults.get(capability) + if default_config_id: + provider_config = self.get_service_config(default_config_id) + if provider_config: logger.info( - f"Resolved {capability} for {consumer_instance_id} " - f"via default -> {default_instance_id}" + f"Resolved {capability} for {consumer_config_id} " + f"via default -> {default_config_id}" ) - return provider_instance + return provider_config # 3. No instance-level resolution found return None @@ -706,30 +769,30 @@ def create_wiring(self, data: WiringCreate) -> Wiring: """Create a wiring connection. For the singleton model, instance IDs can be either: - - Actual instance IDs from instances.yaml + - Actual instance IDs from service_configs.yaml - Template/provider IDs (for configured providers/services) """ self._ensure_loaded() # Check for duplicate - only one provider per consumer+capability for wire in self._wiring: - if (wire.target_instance_id == data.target_instance_id and + if (wire.target_config_id == data.target_config_id and wire.target_capability == data.target_capability): # Update existing wiring instead of error - wire.source_instance_id = data.source_instance_id + wire.source_config_id = data.source_config_id wire.source_capability = data.source_capability self._save_wiring() logger.info( - f"Updated wiring: {data.source_instance_id}.{data.source_capability} -> " - f"{data.target_instance_id}.{data.target_capability}" + f"Updated wiring: {data.source_config_id}.{data.source_capability} -> " + f"{data.target_config_id}.{data.target_capability}" ) return wire wire = Wiring( id=str(uuid.uuid4())[:8], - source_instance_id=data.source_instance_id, + source_config_id=data.source_config_id, source_capability=data.source_capability, - target_instance_id=data.target_instance_id, + 
target_config_id=data.target_config_id, target_capability=data.target_capability, created_at=datetime.now(timezone.utc), ) @@ -738,8 +801,8 @@ def create_wiring(self, data: WiringCreate) -> Wiring: self._save_wiring() logger.info( - f"Created wiring: {data.source_instance_id}.{data.source_capability} -> " - f"{data.target_instance_id}.{data.target_capability}" + f"Created wiring: {data.source_config_id}.{data.source_capability} -> " + f"{data.target_config_id}.{data.target_capability}" ) return wire @@ -761,24 +824,24 @@ def get_defaults(self) -> Dict[str, str]: self._ensure_loaded() return dict(self._defaults) - def set_default(self, capability: str, instance_id: str) -> None: + def set_default(self, capability: str, config_id: str) -> None: """Set default instance/provider for a capability. - For the singleton model, instance_id can be either: - - An actual instance ID from instances.yaml + For the singleton model, config_id can be either: + - An actual instance ID from service_configs.yaml - A template/provider ID (for configured providers acting as singletons) """ self._ensure_loaded() # Store the mapping - we accept both instance IDs and template/provider IDs # The resolution happens at runtime when the capability is needed - if instance_id: - self._defaults[capability] = instance_id + if config_id: + self._defaults[capability] = config_id elif capability in self._defaults: del self._defaults[capability] self._save_wiring() - logger.info(f"Set default for {capability}: {instance_id}") + logger.info(f"Set default for {capability}: {config_id}") # ========================================================================= # Resolution @@ -786,9 +849,9 @@ def set_default(self, capability: str, instance_id: str) -> None: def resolve_capability_for_instance( self, - instance_id: str, + config_id: str, capability: str, - ) -> Optional[Instance]: + ) -> Optional[ServiceConfig]: """ Resolve which instance provides a capability for the given instance. 
@@ -800,13 +863,13 @@ def resolve_capability_for_instance( # Check explicit wiring for wire in self._wiring: - if wire.target_instance_id == instance_id and wire.target_capability == capability: - return self._instances.get(wire.source_instance_id) + if wire.target_config_id == config_id and wire.target_capability == capability: + return self._service_configs.get(wire.source_config_id) # Check defaults - default_instance_id = self._defaults.get(capability) - if default_instance_id: - return self._instances.get(default_instance_id) + default_config_id = self._defaults.get(capability) + if default_config_id: + return self._service_configs.get(default_config_id) return None @@ -815,12 +878,12 @@ def resolve_capability_for_instance( # Singleton # ============================================================================= -_instance_manager: Optional[InstanceManager] = None +_service_config_manager: Optional[ServiceConfigManager] = None -def get_instance_manager() -> InstanceManager: - """Get the singleton InstanceManager.""" - global _instance_manager - if _instance_manager is None: - _instance_manager = InstanceManager() - return _instance_manager +def get_service_config_manager() -> ServiceConfigManager: + """Get the singleton ServiceConfigManager.""" + global _service_config_manager + if _service_config_manager is None: + _service_config_manager = ServiceConfigManager() + return _service_config_manager diff --git a/ushadow/backend/src/services/service_orchestrator.py b/ushadow/backend/src/services/service_orchestrator.py index 9d9f12d7..b0ceb591 100644 --- a/ushadow/backend/src/services/service_orchestrator.py +++ b/ushadow/backend/src/services/service_orchestrator.py @@ -366,7 +366,7 @@ async def get_docker_details(self, name: str) -> Optional[DockerDetails]: # Lifecycle Methods # ========================================================================= - async def start_service(self, name: str, instance_id: Optional[str] = None) -> ActionResult: + async def 
start_service(self, name: str, config_id: Optional[str] = None) -> ActionResult: """Start a service container.""" success, message = await self.docker_manager.start_service(name, instance_id) return ActionResult(success=success, message=message) @@ -869,7 +869,7 @@ def _mask_sensitive(self, name: str, value: str) -> str: # ============================================================================= -# Singleton Instance +# Singleton ServiceConfig # ============================================================================= _orchestrator: Optional[ServiceOrchestrator] = None diff --git a/ushadow/backend/tests/test_instance_manager.py b/ushadow/backend/tests/test_instance_manager.py deleted file mode 100644 index 6a0bcefb..00000000 --- a/ushadow/backend/tests/test_instance_manager.py +++ /dev/null @@ -1,538 +0,0 @@ -""" -Tests for the Instance Manager. - -Tests instance CRUD, wiring management, and config override detection. -""" - -import pytest -from pathlib import Path -import tempfile -import shutil -from datetime import datetime, timezone - -# Add src to path for imports -import sys -sys.path.insert(0, str(Path(__file__).parent.parent / "src")) - -from services.instance_manager import InstanceManager -from models.instance import ( - Instance, - InstanceCreate, - InstanceUpdate, - InstanceConfig, - InstanceStatus, - WiringCreate, -) - - -@pytest.fixture -def temp_config_dir(): - """Create a temporary config directory for testing.""" - temp_dir = tempfile.mkdtemp() - yield Path(temp_dir) - shutil.rmtree(temp_dir) - - -@pytest.fixture -def instance_manager(temp_config_dir): - """Create an instance manager with temp directory.""" - # Create empty config files - instances_file = temp_config_dir / "instances.yaml" - instances_file.write_text("instances: {}\n") - - wiring_file = temp_config_dir / "wiring.yaml" - wiring_file.write_text("defaults: {}\nwiring: []\n") - - return InstanceManager(config_dir=temp_config_dir) - - -class TestInstanceCRUD: - """Tests for 
instance create, read, update, delete operations.""" - - def test_create_instance_basic(self, instance_manager): - """Test creating a basic instance.""" - data = InstanceCreate( - id="openai-1", - template_id="openai", - name="OpenAI Instance", - deployment_target="cloud", - ) - - instance = instance_manager.create_instance(data) - - assert instance.id == "openai-1" - assert instance.template_id == "openai" - assert instance.name == "OpenAI Instance" - assert instance.deployment_target == "cloud" - assert instance.status == InstanceStatus.NOT_APPLICABLE # cloud = n/a - - def test_create_instance_with_config(self, instance_manager): - """Test creating an instance with config overrides.""" - data = InstanceCreate( - id="openai-custom", - template_id="openai", - name="Custom OpenAI", - deployment_target="cloud", - config={"model": "gpt-4", "temperature": "0.7"}, - ) - - instance = instance_manager.create_instance(data) - - assert instance.config.values == {"model": "gpt-4", "temperature": "0.7"} - - def test_create_instance_local_status(self, instance_manager): - """Test that local instances get pending status.""" - data = InstanceCreate( - id="ollama-1", - template_id="ollama", - name="Local Ollama", - deployment_target="local", - ) - - instance = instance_manager.create_instance(data) - - assert instance.status == InstanceStatus.PENDING - - def test_create_instance_duplicate_raises(self, instance_manager): - """Test that creating duplicate instance raises error.""" - data = InstanceCreate( - id="test-instance", - template_id="openai", - name="Test", - deployment_target="cloud", - ) - - instance_manager.create_instance(data) - - with pytest.raises(ValueError, match="already exists"): - instance_manager.create_instance(data) - - def test_get_instance(self, instance_manager): - """Test retrieving an instance by ID.""" - data = InstanceCreate( - id="my-instance", - template_id="openai", - name="My Instance", - deployment_target="cloud", - ) - 
instance_manager.create_instance(data) - - instance = instance_manager.get_instance("my-instance") - - assert instance is not None - assert instance.id == "my-instance" - - def test_get_instance_not_found(self, instance_manager): - """Test retrieving non-existent instance returns None.""" - instance = instance_manager.get_instance("nonexistent") - assert instance is None - - def test_list_instances_empty(self, instance_manager): - """Test listing when no instances exist.""" - instances = instance_manager.list_instances() - assert instances == [] - - def test_list_instances(self, instance_manager): - """Test listing multiple instances.""" - for i in range(3): - data = InstanceCreate( - id=f"instance-{i}", - template_id="openai", - name=f"Instance {i}", - deployment_target="cloud", - ) - instance_manager.create_instance(data) - - instances = instance_manager.list_instances() - - assert len(instances) == 3 - ids = [i.id for i in instances] - assert "instance-0" in ids - assert "instance-1" in ids - assert "instance-2" in ids - - def test_update_instance_name(self, instance_manager): - """Test updating instance name.""" - data = InstanceCreate( - id="test-update", - template_id="openai", - name="Original Name", - deployment_target="cloud", - ) - instance_manager.create_instance(data) - - update = InstanceUpdate(name="Updated Name") - updated = instance_manager.update_instance("test-update", update) - - assert updated.name == "Updated Name" - - def test_update_instance_config(self, instance_manager): - """Test updating instance config.""" - data = InstanceCreate( - id="test-config", - template_id="openai", - name="Config Test", - deployment_target="cloud", - config={"model": "gpt-3.5"}, - ) - instance_manager.create_instance(data) - - update = InstanceUpdate(config={"model": "gpt-4"}) - updated = instance_manager.update_instance("test-config", update) - - assert updated.config.values == {"model": "gpt-4"} - - def test_update_instance_not_found(self, instance_manager): - 
"""Test updating non-existent instance raises error.""" - update = InstanceUpdate(name="New Name") - - with pytest.raises(ValueError, match="not found"): - instance_manager.update_instance("nonexistent", update) - - def test_delete_instance(self, instance_manager): - """Test deleting an instance.""" - data = InstanceCreate( - id="to-delete", - template_id="openai", - name="Delete Me", - deployment_target="cloud", - ) - instance_manager.create_instance(data) - - result = instance_manager.delete_instance("to-delete") - - assert result is True - assert instance_manager.get_instance("to-delete") is None - - def test_delete_instance_not_found(self, instance_manager): - """Test deleting non-existent instance returns False.""" - result = instance_manager.delete_instance("nonexistent") - assert result is False - - -class TestConfigOverrides: - """Tests for config override detection using OmegaConf.""" - - def test_get_config_overrides_empty(self, instance_manager): - """Test getting overrides when config is empty.""" - data = InstanceCreate( - id="no-config", - template_id="openai", - name="No Config", - deployment_target="cloud", - ) - instance_manager.create_instance(data) - - overrides = instance_manager.get_config_overrides("no-config") - - assert overrides == {} - - def test_get_config_overrides_with_values(self, instance_manager): - """Test getting overrides returns direct values.""" - data = InstanceCreate( - id="with-config", - template_id="openai", - name="With Config", - deployment_target="cloud", - config={"model": "gpt-4", "temperature": "0.5"}, - ) - instance_manager.create_instance(data) - - overrides = instance_manager.get_config_overrides("with-config") - - assert overrides == {"model": "gpt-4", "temperature": "0.5"} - - def test_get_config_overrides_not_found(self, instance_manager): - """Test getting overrides for non-existent instance.""" - overrides = instance_manager.get_config_overrides("nonexistent") - assert overrides == {} - - -class 
TestWiringCRUD: - """Tests for wiring create, read, delete operations.""" - - def test_create_wiring(self, instance_manager): - """Test creating a wiring connection.""" - # Create source and target instances first - instance_manager.create_instance(InstanceCreate( - id="openai-1", - template_id="openai", - name="OpenAI", - deployment_target="cloud", - )) - - data = WiringCreate( - source_instance_id="openai-1", - source_capability="llm", - target_instance_id="chronicle", - target_capability="llm", - ) - - wiring = instance_manager.create_wiring(data) - - assert wiring.source_instance_id == "openai-1" - assert wiring.source_capability == "llm" - assert wiring.target_instance_id == "chronicle" - assert wiring.id is not None - - def test_create_wiring_generates_id(self, instance_manager): - """Test that wiring ID is auto-generated.""" - instance_manager.create_instance(InstanceCreate( - id="source", - template_id="openai", - name="Source", - deployment_target="cloud", - )) - - data = WiringCreate( - source_instance_id="source", - source_capability="llm", - target_instance_id="target", - target_capability="llm", - ) - - wiring = instance_manager.create_wiring(data) - - assert len(wiring.id) == 8 # Short UUID - - def test_list_wiring_empty(self, instance_manager): - """Test listing when no wiring exists.""" - wiring_list = instance_manager.list_wiring() - assert wiring_list == [] - - def test_list_wiring(self, instance_manager): - """Test listing multiple wiring connections.""" - instance_manager.create_instance(InstanceCreate( - id="provider", - template_id="openai", - name="Provider", - deployment_target="cloud", - )) - - for i in range(2): - instance_manager.create_wiring(WiringCreate( - source_instance_id="provider", - source_capability="llm", - target_instance_id=f"consumer-{i}", - target_capability="llm", - )) - - wiring_list = instance_manager.list_wiring() - - assert len(wiring_list) == 2 - - def test_get_wiring_for_instance(self, instance_manager): - """Test 
getting wiring for a specific instance (as target).""" - instance_manager.create_instance(InstanceCreate( - id="provider", - template_id="openai", - name="Provider", - deployment_target="cloud", - )) - - # Create wiring where consumer-1 is the target - instance_manager.create_wiring(WiringCreate( - source_instance_id="provider", - source_capability="llm", - target_instance_id="consumer-1", - target_capability="llm", - )) - instance_manager.create_wiring(WiringCreate( - source_instance_id="other-provider", - source_capability="llm", - target_instance_id="consumer-2", - target_capability="llm", - )) - - # get_wiring_for_instance returns wiring where instance is TARGET - wiring = instance_manager.get_wiring_for_instance("consumer-1") - - assert len(wiring) == 1 - assert wiring[0].target_instance_id == "consumer-1" - - def test_delete_wiring(self, instance_manager): - """Test deleting a wiring connection.""" - instance_manager.create_instance(InstanceCreate( - id="provider", - template_id="openai", - name="Provider", - deployment_target="cloud", - )) - - wiring = instance_manager.create_wiring(WiringCreate( - source_instance_id="provider", - source_capability="llm", - target_instance_id="consumer", - target_capability="llm", - )) - - result = instance_manager.delete_wiring(wiring.id) - - assert result is True - assert len(instance_manager.list_wiring()) == 0 - - def test_delete_wiring_not_found(self, instance_manager): - """Test deleting non-existent wiring returns False.""" - result = instance_manager.delete_wiring("nonexistent") - assert result is False - - -class TestDefaults: - """Tests for default capability mappings.""" - - def test_get_defaults_empty(self, instance_manager): - """Test getting defaults when none set.""" - defaults = instance_manager.get_defaults() - assert defaults == {} - - def test_set_default(self, instance_manager): - """Test setting a default for a capability.""" - instance_manager.create_instance(InstanceCreate( - id="openai-default", - 
template_id="openai", - name="Default OpenAI", - deployment_target="cloud", - )) - - instance_manager.set_default("llm", "openai-default") - - defaults = instance_manager.get_defaults() - assert defaults["llm"] == "openai-default" - - def test_set_default_overwrites(self, instance_manager): - """Test that setting default overwrites previous.""" - instance_manager.create_instance(InstanceCreate( - id="first", - template_id="openai", - name="First", - deployment_target="cloud", - )) - instance_manager.create_instance(InstanceCreate( - id="second", - template_id="openai", - name="Second", - deployment_target="cloud", - )) - - instance_manager.set_default("llm", "first") - instance_manager.set_default("llm", "second") - - defaults = instance_manager.get_defaults() - assert defaults["llm"] == "second" - - -class TestPersistence: - """Tests for config file persistence.""" - - def test_instances_persist_to_file(self, temp_config_dir): - """Test that instances are saved to YAML file.""" - # Create empty files - (temp_config_dir / "instances.yaml").write_text("instances: {}\n") - (temp_config_dir / "wiring.yaml").write_text("defaults: {}\nwiring: []\n") - - manager = InstanceManager(config_dir=temp_config_dir) - manager.create_instance(InstanceCreate( - id="persistent", - template_id="openai", - name="Persistent Instance", - deployment_target="cloud", - )) - - # Create new manager to load from file - manager2 = InstanceManager(config_dir=temp_config_dir) - instance = manager2.get_instance("persistent") - - assert instance is not None - assert instance.name == "Persistent Instance" - - def test_wiring_persists_to_file(self, temp_config_dir): - """Test that wiring is saved to YAML file.""" - (temp_config_dir / "instances.yaml").write_text("instances: {}\n") - (temp_config_dir / "wiring.yaml").write_text("defaults: {}\nwiring: []\n") - - manager = InstanceManager(config_dir=temp_config_dir) - manager.create_instance(InstanceCreate( - id="provider", - template_id="openai", - 
name="Provider", - deployment_target="cloud", - )) - manager.create_wiring(WiringCreate( - source_instance_id="provider", - source_capability="llm", - target_instance_id="consumer", - target_capability="llm", - )) - - # Create new manager to load from file - manager2 = InstanceManager(config_dir=temp_config_dir) - wiring = manager2.list_wiring() - - assert len(wiring) == 1 - assert wiring[0].source_instance_id == "provider" - - def test_defaults_persist_to_file(self, temp_config_dir): - """Test that defaults are saved to YAML file.""" - (temp_config_dir / "instances.yaml").write_text("instances: {}\n") - (temp_config_dir / "wiring.yaml").write_text("defaults: {}\nwiring: []\n") - - manager = InstanceManager(config_dir=temp_config_dir) - manager.create_instance(InstanceCreate( - id="default-provider", - template_id="openai", - name="Default Provider", - deployment_target="cloud", - )) - manager.set_default("llm", "default-provider") - - # Create new manager to load from file - manager2 = InstanceManager(config_dir=temp_config_dir) - defaults = manager2.get_defaults() - - assert defaults["llm"] == "default-provider" - - -class TestEdgeCases: - """Tests for edge cases and error handling.""" - - def test_delete_instance_cleans_up_wiring(self, instance_manager): - """Test that deleting instance removes associated wiring.""" - instance_manager.create_instance(InstanceCreate( - id="provider", - template_id="openai", - name="Provider", - deployment_target="cloud", - )) - instance_manager.create_wiring(WiringCreate( - source_instance_id="provider", - source_capability="llm", - target_instance_id="consumer", - target_capability="llm", - )) - - # Delete the provider - wiring should be cleaned up - instance_manager.delete_instance("provider") - - # Wiring should be removed (cleaned up, not orphaned) - wiring = instance_manager.list_wiring() - assert len(wiring) == 0 - - def test_empty_config_not_stored(self, instance_manager): - """Test that empty config dict is handled 
correctly.""" - data = InstanceCreate( - id="empty-config", - template_id="openai", - name="Empty Config", - deployment_target="cloud", - config={}, - ) - - instance = instance_manager.create_instance(data) - - assert instance.config.values == {} - overrides = instance_manager.get_config_overrides("empty-config") - assert overrides == {} - - -if __name__ == "__main__": - pytest.main([__file__, "-v"]) diff --git a/ushadow/backend/tests/test_instances_router.py b/ushadow/backend/tests/test_instances_router.py deleted file mode 100644 index 5160c430..00000000 --- a/ushadow/backend/tests/test_instances_router.py +++ /dev/null @@ -1,544 +0,0 @@ -""" -Tests for the Instances Router API endpoints. - -Tests the endpoint functions directly, bypassing HTTP transport for cleaner unit tests. -This approach tests the business logic of each endpoint without complex dependency injection. - -Tests cover: -- Instance CRUD operations -- Deploy/undeploy operations -- Wiring CRUD operations -- Default capability mappings -""" - -import pytest -from unittest.mock import AsyncMock, MagicMock, patch -from fastapi import HTTPException - -# Add src to path for imports -import sys -from pathlib import Path -sys.path.insert(0, str(Path(__file__).parent.parent / "src")) - -from models.instance import ( - Instance, - InstanceConfig, - InstanceCreate, - InstanceSummary, - InstanceStatus, - InstanceUpdate, - Wiring, - WiringCreate, -) - - -# ============================================================================= -# Test Fixtures -# ============================================================================= - -@pytest.fixture -def mock_current_user(): - """Mock authenticated user.""" - return MagicMock(id="test-user", email="test@example.com") - - -@pytest.fixture -def mock_instance_manager(): - """Create a mock instance manager.""" - manager = MagicMock() - - # Default return values - manager.list_instances.return_value = [] - manager.get_instance.return_value = None - 
manager.list_wiring.return_value = [] - manager.get_defaults.return_value = {} - manager.get_config_overrides.return_value = {} - manager.get_wiring_for_instance.return_value = [] - manager.deploy_instance = AsyncMock(return_value=(True, "Success")) - manager.undeploy_instance = AsyncMock(return_value=(True, "Success")) - - return manager - - -@pytest.fixture -def sample_instance(): - """Create a sample instance for testing.""" - return Instance( - id="test-instance", - template_id="openai", - name="Test Instance", - deployment_target="cloud", - status=InstanceStatus.NOT_APPLICABLE, - config=InstanceConfig(values={}), - ) - - -@pytest.fixture -def sample_wiring(): - """Create a sample wiring for testing.""" - return Wiring( - id="wiring-1", - source_instance_id="openai-1", - source_capability="llm", - target_instance_id="chronicle", - target_capability="llm", - ) - - -# ============================================================================= -# Instance Endpoint Tests -# ============================================================================= - -class TestListInstances: - """Tests for list_instances endpoint.""" - - @pytest.mark.asyncio - async def test_list_instances_empty(self, mock_instance_manager, mock_current_user): - """Test listing when no instances exist.""" - mock_instance_manager.list_instances.return_value = [] - - with patch("src.routers.instances.get_instance_manager", return_value=mock_instance_manager): - from src.routers.instances import list_instances - result = await list_instances(current_user=mock_current_user) - - assert result == [] - - @pytest.mark.asyncio - async def test_list_instances_returns_summaries(self, mock_instance_manager, mock_current_user): - """Test listing returns instance summaries.""" - summaries = [ - InstanceSummary( - id="instance-1", - template_id="openai", - name="Instance 1", - deployment_target="cloud", - status=InstanceStatus.NOT_APPLICABLE, - ), - InstanceSummary( - id="instance-2", - template_id="ollama", 
- name="Instance 2", - deployment_target="local", - status=InstanceStatus.RUNNING, - ), - ] - mock_instance_manager.list_instances.return_value = summaries - - with patch("src.routers.instances.get_instance_manager", return_value=mock_instance_manager): - from src.routers.instances import list_instances - result = await list_instances(current_user=mock_current_user) - - assert len(result) == 2 - assert result[0].id == "instance-1" - assert result[1].id == "instance-2" - - -class TestGetInstance: - """Tests for get_instance endpoint.""" - - @pytest.mark.asyncio - async def test_get_instance_found(self, mock_instance_manager, mock_current_user, sample_instance): - """Test retrieving an existing instance.""" - mock_instance_manager.get_instance.return_value = sample_instance - mock_instance_manager.get_config_overrides.return_value = {} - - with patch("src.routers.instances.get_instance_manager", return_value=mock_instance_manager): - from src.routers.instances import get_instance - result = await get_instance(instance_id="test-instance", current_user=mock_current_user) - - assert result.id == "test-instance" - assert result.template_id == "openai" - - @pytest.mark.asyncio - async def test_get_instance_not_found(self, mock_instance_manager, mock_current_user): - """Test retrieving non-existent instance raises 404.""" - mock_instance_manager.get_instance.return_value = None - - with patch("src.routers.instances.get_instance_manager", return_value=mock_instance_manager): - from src.routers.instances import get_instance - with pytest.raises(HTTPException) as exc_info: - await get_instance(instance_id="nonexistent", current_user=mock_current_user) - - assert exc_info.value.status_code == 404 - - @pytest.mark.asyncio - async def test_get_instance_with_overrides(self, mock_instance_manager, mock_current_user, sample_instance): - """Test that config overrides are applied to result.""" - mock_instance_manager.get_instance.return_value = sample_instance - 
mock_instance_manager.get_config_overrides.return_value = {"model": "gpt-4"} - - with patch("src.routers.instances.get_instance_manager", return_value=mock_instance_manager): - from src.routers.instances import get_instance - result = await get_instance(instance_id="test-instance", current_user=mock_current_user) - - assert result.config.values == {"model": "gpt-4"} - - -class TestCreateInstance: - """Tests for create_instance endpoint.""" - - @pytest.mark.asyncio - async def test_create_instance_success(self, mock_instance_manager, mock_current_user, sample_instance): - """Test creating a new instance.""" - mock_instance_manager.create_instance.return_value = sample_instance - data = InstanceCreate( - id="test-instance", - template_id="openai", - name="Test Instance", - deployment_target="cloud", - ) - - with patch("src.routers.instances.get_instance_manager", return_value=mock_instance_manager): - from src.routers.instances import create_instance - result = await create_instance(data=data, current_user=mock_current_user) - - assert result.id == "test-instance" - - @pytest.mark.asyncio - async def test_create_instance_duplicate_raises(self, mock_instance_manager, mock_current_user): - """Test creating duplicate instance raises 400.""" - mock_instance_manager.create_instance.side_effect = ValueError("Instance already exists") - data = InstanceCreate( - id="existing", - template_id="openai", - name="Duplicate", - deployment_target="cloud", - ) - - with patch("src.routers.instances.get_instance_manager", return_value=mock_instance_manager): - from src.routers.instances import create_instance - with pytest.raises(HTTPException) as exc_info: - await create_instance(data=data, current_user=mock_current_user) - - assert exc_info.value.status_code == 400 - - -class TestUpdateInstance: - """Tests for update_instance endpoint.""" - - @pytest.mark.asyncio - async def test_update_instance_name(self, mock_instance_manager, mock_current_user, sample_instance): - """Test 
updating instance name.""" - updated = Instance( - id="test-instance", - template_id="openai", - name="Updated Name", - deployment_target="cloud", - status=InstanceStatus.NOT_APPLICABLE, - config=InstanceConfig(values={}), - ) - mock_instance_manager.get_instance.return_value = sample_instance - mock_instance_manager.update_instance.return_value = updated - data = InstanceUpdate(name="Updated Name") - - with patch("src.routers.instances.get_instance_manager", return_value=mock_instance_manager): - from src.routers.instances import update_instance - result = await update_instance( - instance_id="test-instance", data=data, current_user=mock_current_user - ) - - assert result.name == "Updated Name" - - @pytest.mark.asyncio - async def test_update_instance_not_found(self, mock_instance_manager, mock_current_user): - """Test updating non-existent instance raises 404.""" - mock_instance_manager.update_instance.side_effect = ValueError("Instance not found") - data = InstanceUpdate(name="New Name") - - with patch("src.routers.instances.get_instance_manager", return_value=mock_instance_manager): - from src.routers.instances import update_instance - with pytest.raises(HTTPException) as exc_info: - await update_instance( - instance_id="nonexistent", data=data, current_user=mock_current_user - ) - - assert exc_info.value.status_code == 404 - - -class TestDeleteInstance: - """Tests for delete_instance endpoint.""" - - @pytest.mark.asyncio - async def test_delete_instance_success(self, mock_instance_manager, mock_current_user): - """Test deleting an instance.""" - mock_instance_manager.delete_instance.return_value = True - - with patch("src.routers.instances.get_instance_manager", return_value=mock_instance_manager): - from src.routers.instances import delete_instance - result = await delete_instance(instance_id="test-instance", current_user=mock_current_user) - - assert result["success"] is True - - @pytest.mark.asyncio - async def test_delete_instance_not_found(self, 
mock_instance_manager, mock_current_user): - """Test deleting non-existent instance raises 404.""" - mock_instance_manager.delete_instance.return_value = False - - with patch("src.routers.instances.get_instance_manager", return_value=mock_instance_manager): - from src.routers.instances import delete_instance - with pytest.raises(HTTPException) as exc_info: - await delete_instance(instance_id="nonexistent", current_user=mock_current_user) - - assert exc_info.value.status_code == 404 - - -# ============================================================================= -# Deploy/Undeploy Endpoint Tests -# ============================================================================= - -class TestDeployInstance: - """Tests for deploy_instance endpoint.""" - - @pytest.mark.asyncio - async def test_deploy_instance_success(self, mock_instance_manager, mock_current_user): - """Test deploying an instance.""" - mock_instance_manager.deploy_instance = AsyncMock( - return_value=(True, "Instance deployed successfully") - ) - - with patch("src.routers.instances.get_instance_manager", return_value=mock_instance_manager): - from src.routers.instances import deploy_instance - result = await deploy_instance(instance_id="test-instance", current_user=mock_current_user) - - assert result["success"] is True - - @pytest.mark.asyncio - async def test_deploy_instance_failure(self, mock_instance_manager, mock_current_user): - """Test deploy failure raises 400.""" - mock_instance_manager.deploy_instance = AsyncMock( - return_value=(False, "Failed to deploy: container error") - ) - - with patch("src.routers.instances.get_instance_manager", return_value=mock_instance_manager): - from src.routers.instances import deploy_instance - with pytest.raises(HTTPException) as exc_info: - await deploy_instance(instance_id="test-instance", current_user=mock_current_user) - - assert exc_info.value.status_code == 400 - - -class TestUndeployInstance: - """Tests for undeploy_instance endpoint.""" - - 
@pytest.mark.asyncio - async def test_undeploy_instance_success(self, mock_instance_manager, mock_current_user): - """Test undeploying an instance.""" - mock_instance_manager.undeploy_instance = AsyncMock( - return_value=(True, "Instance stopped successfully") - ) - - with patch("src.routers.instances.get_instance_manager", return_value=mock_instance_manager): - from src.routers.instances import undeploy_instance - result = await undeploy_instance(instance_id="test-instance", current_user=mock_current_user) - - assert result["success"] is True - - @pytest.mark.asyncio - async def test_undeploy_instance_failure(self, mock_instance_manager, mock_current_user): - """Test undeploy failure raises 400.""" - mock_instance_manager.undeploy_instance = AsyncMock( - return_value=(False, "Instance not running") - ) - - with patch("src.routers.instances.get_instance_manager", return_value=mock_instance_manager): - from src.routers.instances import undeploy_instance - with pytest.raises(HTTPException) as exc_info: - await undeploy_instance(instance_id="test-instance", current_user=mock_current_user) - - assert exc_info.value.status_code == 400 - - -# ============================================================================= -# Wiring Endpoint Tests -# ============================================================================= - -class TestListWiring: - """Tests for list_wiring endpoint.""" - - @pytest.mark.asyncio - async def test_list_wiring_empty(self, mock_instance_manager, mock_current_user): - """Test listing when no wiring exists.""" - mock_instance_manager.list_wiring.return_value = [] - - with patch("src.routers.instances.get_instance_manager", return_value=mock_instance_manager): - from src.routers.instances import list_wiring - result = await list_wiring(current_user=mock_current_user) - - assert result == [] - - @pytest.mark.asyncio - async def test_list_wiring_returns_connections(self, mock_instance_manager, mock_current_user, sample_wiring): - """Test listing 
returns wiring connections.""" - mock_instance_manager.list_wiring.return_value = [sample_wiring] - - with patch("src.routers.instances.get_instance_manager", return_value=mock_instance_manager): - from src.routers.instances import list_wiring - result = await list_wiring(current_user=mock_current_user) - - assert len(result) == 1 - assert result[0].source_instance_id == "openai-1" - - -class TestGetDefaults: - """Tests for get_defaults endpoint.""" - - @pytest.mark.asyncio - async def test_get_defaults_empty(self, mock_instance_manager, mock_current_user): - """Test getting defaults when none set.""" - mock_instance_manager.get_defaults.return_value = {} - - with patch("src.routers.instances.get_instance_manager", return_value=mock_instance_manager): - from src.routers.instances import get_defaults - result = await get_defaults(current_user=mock_current_user) - - assert result == {} - - @pytest.mark.asyncio - async def test_get_defaults_returns_mappings(self, mock_instance_manager, mock_current_user): - """Test getting default capability mappings.""" - mock_instance_manager.get_defaults.return_value = { - "llm": "openai-1", - "embedding": "openai-embed", - } - - with patch("src.routers.instances.get_instance_manager", return_value=mock_instance_manager): - from src.routers.instances import get_defaults - result = await get_defaults(current_user=mock_current_user) - - assert result["llm"] == "openai-1" - assert result["embedding"] == "openai-embed" - - -class TestSetDefault: - """Tests for set_default endpoint.""" - - @pytest.mark.asyncio - async def test_set_default_success(self, mock_instance_manager, mock_current_user): - """Test setting a default for a capability.""" - mock_instance_manager.set_default.return_value = None - - with patch("src.routers.instances.get_instance_manager", return_value=mock_instance_manager): - from src.routers.instances import set_default - result = await set_default( - capability="llm", instance_id="openai-1", 
current_user=mock_current_user - ) - - assert result["success"] is True - assert result["capability"] == "llm" - - @pytest.mark.asyncio - async def test_set_default_invalid_instance(self, mock_instance_manager, mock_current_user): - """Test setting default with invalid instance raises 400.""" - mock_instance_manager.set_default.side_effect = ValueError("Instance not found") - - with patch("src.routers.instances.get_instance_manager", return_value=mock_instance_manager): - from src.routers.instances import set_default - with pytest.raises(HTTPException) as exc_info: - await set_default( - capability="llm", instance_id="nonexistent", current_user=mock_current_user - ) - - assert exc_info.value.status_code == 400 - - -class TestCreateWiring: - """Tests for create_wiring endpoint.""" - - @pytest.mark.asyncio - async def test_create_wiring_success(self, mock_instance_manager, mock_current_user, sample_wiring): - """Test creating a wiring connection.""" - mock_instance_manager.create_wiring.return_value = sample_wiring - data = WiringCreate( - source_instance_id="openai-1", - source_capability="llm", - target_instance_id="chronicle", - target_capability="llm", - ) - - with patch("src.routers.instances.get_instance_manager", return_value=mock_instance_manager): - from src.routers.instances import create_wiring - result = await create_wiring(data=data, current_user=mock_current_user) - - assert result.source_instance_id == "openai-1" - assert result.id is not None - - @pytest.mark.asyncio - async def test_create_wiring_invalid(self, mock_instance_manager, mock_current_user): - """Test creating invalid wiring raises 400.""" - mock_instance_manager.create_wiring.side_effect = ValueError("Source instance not found") - data = WiringCreate( - source_instance_id="nonexistent", - source_capability="llm", - target_instance_id="chronicle", - target_capability="llm", - ) - - with patch("src.routers.instances.get_instance_manager", return_value=mock_instance_manager): - from 
src.routers.instances import create_wiring - with pytest.raises(HTTPException) as exc_info: - await create_wiring(data=data, current_user=mock_current_user) - - assert exc_info.value.status_code == 400 - - -class TestDeleteWiring: - """Tests for delete_wiring endpoint.""" - - @pytest.mark.asyncio - async def test_delete_wiring_success(self, mock_instance_manager, mock_current_user): - """Test deleting a wiring connection.""" - mock_instance_manager.delete_wiring.return_value = True - - with patch("src.routers.instances.get_instance_manager", return_value=mock_instance_manager): - from src.routers.instances import delete_wiring - result = await delete_wiring(wiring_id="wiring-1", current_user=mock_current_user) - - assert result["success"] is True - - @pytest.mark.asyncio - async def test_delete_wiring_not_found(self, mock_instance_manager, mock_current_user): - """Test deleting non-existent wiring raises 404.""" - mock_instance_manager.delete_wiring.return_value = False - - with patch("src.routers.instances.get_instance_manager", return_value=mock_instance_manager): - from src.routers.instances import delete_wiring - with pytest.raises(HTTPException) as exc_info: - await delete_wiring(wiring_id="nonexistent", current_user=mock_current_user) - - assert exc_info.value.status_code == 404 - - -class TestGetInstanceWiring: - """Tests for get_instance_wiring endpoint.""" - - @pytest.mark.asyncio - async def test_get_instance_wiring_found( - self, mock_instance_manager, mock_current_user, sample_instance, sample_wiring - ): - """Test getting wiring for an instance.""" - mock_instance_manager.get_instance.return_value = sample_instance - mock_instance_manager.get_wiring_for_instance.return_value = [sample_wiring] - - with patch("src.routers.instances.get_instance_manager", return_value=mock_instance_manager): - from src.routers.instances import get_instance_wiring - result = await get_instance_wiring( - instance_id="test-instance", current_user=mock_current_user - ) - - 
assert len(result) == 1 - - @pytest.mark.asyncio - async def test_get_instance_wiring_instance_not_found(self, mock_instance_manager, mock_current_user): - """Test getting wiring for non-existent instance raises 404.""" - mock_instance_manager.get_instance.return_value = None - - with patch("src.routers.instances.get_instance_manager", return_value=mock_instance_manager): - from src.routers.instances import get_instance_wiring - with pytest.raises(HTTPException) as exc_info: - await get_instance_wiring(instance_id="nonexistent", current_user=mock_current_user) - - assert exc_info.value.status_code == 404 - - -if __name__ == "__main__": - pytest.main([__file__, "-v"]) diff --git a/ushadow/frontend/src/App.tsx b/ushadow/frontend/src/App.tsx index 070fbc1a..64bef73e 100644 --- a/ushadow/frontend/src/App.tsx +++ b/ushadow/frontend/src/App.tsx @@ -35,7 +35,7 @@ import AgentZeroPage from './pages/AgentZeroPage' import N8NPage from './pages/N8NPage' import ServicesPage from './pages/ServicesPage' import SettingsPage from './pages/SettingsPage' -import InstancesPage from './pages/InstancesPage' +import ServiceConfigsPage from './pages/ServiceConfigsPage' import MemoriesPage from './pages/MemoriesPage' import ClusterPage from './pages/ClusterPage' import SpeakerRecognitionPage from './pages/SpeakerRecognitionPage' @@ -103,7 +103,7 @@ function AppContent() { } /> } /> } /> - } /> + } /> } /> } /> } /> diff --git a/ushadow/frontend/src/components/DeployToK8sModal.tsx b/ushadow/frontend/src/components/DeployToK8sModal.tsx index 03ae1404..5ae651aa 100644 --- a/ushadow/frontend/src/components/DeployToK8sModal.tsx +++ b/ushadow/frontend/src/components/DeployToK8sModal.tsx @@ -2,7 +2,7 @@ import { useState, useEffect } from 'react' import { CheckCircle, Loader, ChevronRight } from 'lucide-react' import Modal from './Modal' import EnvVarEditor from './EnvVarEditor' -import { kubernetesApi, servicesApi, instancesApi, KubernetesCluster, EnvVarInfo, EnvVarConfig } from 
'../services/api' +import { kubernetesApi, servicesApi, svcConfigsApi, KubernetesCluster, EnvVarInfo, EnvVarConfig } from '../services/api' interface DeployToK8sModalProps { isOpen: boolean @@ -197,7 +197,8 @@ export default function DeployToK8sModal({ isOpen, onClose, cluster: initialClus // Generate instance ID for this deployment target (only lowercase, numbers, hyphens) const sanitizedServiceId = selectedService.service_id.replace(/[^a-z0-9-]/g, '-') - const instanceId = `${sanitizedServiceId}-k8s-${selectedCluster.cluster_id}-${namespace}` + const clusterName = selectedCluster.name.toLowerCase().replace(/[^a-z0-9-]/g, '-') + const instanceId = `${sanitizedServiceId}-${clusterName}` const deploymentTarget = `k8s://${selectedCluster.cluster_id}/${namespace}` // Convert env configs to instance config format @@ -219,17 +220,17 @@ export default function DeployToK8sModal({ isOpen, onClose, cluster: initialClus // Step 1: Create or update instance with this configuration try { // Try to get existing instance - await instancesApi.getInstance(instanceId) - // Instance exists - update it - await instancesApi.updateInstance(instanceId, { + await svcConfigsApi.getServiceConfig(instanceId) + // ServiceConfig exists - update it + await svcConfigsApi.updateServiceConfig(instanceId, { name: `${selectedService.display_name} (${selectedCluster.name}/${namespace})`, description: `K8s deployment to ${selectedCluster.name} in ${namespace} namespace`, config: configValues, deployment_target: deploymentTarget }) } catch { - // Instance doesn't exist - create it - await instancesApi.createInstance({ + // ServiceConfig doesn't exist - create it + await svcConfigsApi.createServiceConfig({ id: instanceId, template_id: selectedService.service_id, name: `${selectedService.display_name} (${selectedCluster.name}/${namespace})`, @@ -239,14 +240,14 @@ export default function DeployToK8sModal({ isOpen, onClose, cluster: initialClus }) } - // Step 2: Deploy the instance to K8s - // The backend 
will use centralized resolution which reads from the instance config + // Step 2: Deploy the service config to K8s + // The backend will use centralized resolution which reads from the service config config const deployResponse = await kubernetesApi.deployService( selectedCluster.cluster_id, { service_id: selectedService.service_id, namespace: namespace, - instance_id: instanceId + config_id: instanceId } ) diff --git a/ushadow/frontend/src/components/EnvVarEditor.tsx b/ushadow/frontend/src/components/EnvVarEditor.tsx index f98e87ce..09c67958 100644 --- a/ushadow/frontend/src/components/EnvVarEditor.tsx +++ b/ushadow/frontend/src/components/EnvVarEditor.tsx @@ -21,7 +21,7 @@ interface EnvVarEditorProps { * Used by: * - ServicesPage (for Docker service configuration) * - DeployToK8sModal (for K8s deployment configuration) - * - InstancesPage (for instance configuration) + * - ServiceConfigsPage (for instance configuration) */ export default function EnvVarEditor({ envVar, config, onChange }: EnvVarEditorProps) { const [editing, setEditing] = useState(false) @@ -177,9 +177,9 @@ export default function EnvVarEditor({ envVar, config, onChange }: EnvVarEditorP // Value input handleValueChange(e.target.value)} - placeholder="enter value" + placeholder={envVar.resolved_value ? 
`using: ${envVar.resolved_value}` : 'enter value'} className="flex-1 px-2 py-1.5 text-xs rounded border-0 bg-neutral-700/50 text-neutral-200 focus:outline-none focus:ring-1 focus:ring-primary-500 placeholder:text-neutral-500" autoFocus={editing} onBlur={() => { diff --git a/ushadow/frontend/src/components/TailscaleOriginBanner.tsx b/ushadow/frontend/src/components/TailscaleOriginBanner.tsx new file mode 100644 index 00000000..6a66725f --- /dev/null +++ b/ushadow/frontend/src/components/TailscaleOriginBanner.tsx @@ -0,0 +1,105 @@ +import { useState, useEffect } from 'react' +import { ExternalLink, X } from 'lucide-react' +import { api } from '../services/api' + +interface TailscaleStatus { + authenticated: boolean + hostname: string | null +} + +/** + * Shows a banner when user is on localhost but Tailscale is configured. + * Suggests switching to the Tailscale URL for consistent auth across devices. + */ +export default function TailscaleOriginBanner() { + const [show, setShow] = useState(false) + const [tailscaleHostname, setTailscaleHostname] = useState(null) + const [dismissed, setDismissed] = useState(false) + + useEffect(() => { + checkOriginMismatch() + }, []) + + const checkOriginMismatch = async () => { + // Check if user dismissed this session + if (dismissed) return + + // Only show on localhost/127.0.0.1 + const isLocalhost = window.location.hostname === 'localhost' || + window.location.hostname === '127.0.0.1' + if (!isLocalhost) return + + try { + // Check if Tailscale is configured + const response = await api.get('/api/tailscale/container/status') + + if (response.data.authenticated && response.data.hostname) { + setTailscaleHostname(response.data.hostname) + setShow(true) + } + } catch (error) { + // Tailscale not configured or error - don't show banner + console.debug('Tailscale status check failed:', error) + } + } + + const handleSwitchToTailscale = () => { + if (!tailscaleHostname) return + + // Transfer to Tailscale URL, preserving the current 
path + const tailscaleUrl = `https://${tailscaleHostname}${window.location.pathname}${window.location.search}` + window.location.href = tailscaleUrl + } + + const handleDismiss = () => { + setDismissed(true) + setShow(false) + } + + if (!show || !tailscaleHostname) return null + + return ( +
+
+
+
+ +
+

+ Tailscale is configured.{' '} + Switch to + https://{tailscaleHostname} + for secure access from any device. +

+

+ Your auth token on localhost won't work on the Tailscale URL (browser security). +

+
+
+ +
+ + +
+
+
+
+ ) +} diff --git a/ushadow/frontend/src/components/layout/Layout.tsx b/ushadow/frontend/src/components/layout/Layout.tsx index 0cf9501c..2a33dae0 100644 --- a/ushadow/frontend/src/components/layout/Layout.tsx +++ b/ushadow/frontend/src/components/layout/Layout.tsx @@ -11,6 +11,7 @@ import { useMobileQrCode } from '../../hooks/useQrCode' import FeatureFlagsDrawer from './FeatureFlagsDrawer' import { StatusBadge, type BadgeVariant } from '../StatusBadge' import Modal from '../Modal' +import TailscaleOriginBanner from '../TailscaleOriginBanner' import type { LucideIcon } from 'lucide-react' interface NavigationItem { @@ -65,7 +66,7 @@ export default function Layout() { { path: '/agent-zero', label: 'Agent Zero', icon: Bot, featureFlag: 'agent_zero' }, { path: '/n8n', label: 'n8n Workflows', icon: Workflow, featureFlag: 'n8n_workflows' }, { path: '/services', label: 'Services', icon: Server }, - { path: '/instances', label: 'Instances', icon: Layers, featureFlag: 'instances_management' }, + { path: '/instances', label: 'ServiceConfigs', icon: Layers, featureFlag: 'instances_management' }, ...(isEnabled('memories_page') ? [ { path: '/memories', label: 'Memories', icon: Brain }, ] : []), @@ -427,6 +428,9 @@ export default function Layout() { + {/* Tailscale Origin Banner */} + + {/* Main Container */}
diff --git a/ushadow/frontend/src/components/services/ServiceCard.tsx b/ushadow/frontend/src/components/services/ServiceCard.tsx index a40edc09..71a37b0b 100644 --- a/ushadow/frontend/src/components/services/ServiceCard.tsx +++ b/ushadow/frontend/src/components/services/ServiceCard.tsx @@ -9,7 +9,7 @@ import { Loader2, } from 'lucide-react' import { useServiceStatus } from '../../hooks/useServiceStatus' -import type { ServiceInstance, ContainerStatus } from '../../contexts/ServicesContext' +import type { ServiceServiceConfig, ContainerStatus } from '../../contexts/ServicesContext' import { ServiceStatusBadge } from './ServiceStatusBadge' import { ServiceConfigForm } from './ServiceConfigForm' @@ -19,7 +19,7 @@ import { ServiceConfigForm } from './ServiceConfigForm' interface ServiceCardProps { /** The service instance */ - service: ServiceInstance + service: ServiceServiceConfig /** Current saved config for this service */ config: Record /** Container status for this service (local services only) */ @@ -62,7 +62,7 @@ interface ServiceCardProps { // Helper Functions // ============================================================================ -function getBorderClasses(service: ServiceInstance, state: string): string { +function getBorderClasses(service: ServiceServiceConfig, state: string): string { // Disabled services get grayed out appearance if (!service.enabled) { return 'border-neutral-200 dark:border-neutral-700 bg-neutral-100 dark:bg-neutral-800/50 shadow-sm opacity-60' diff --git a/ushadow/frontend/src/components/services/ServiceCategoryList.tsx b/ushadow/frontend/src/components/services/ServiceCategoryList.tsx index 877800e9..0eed276b 100644 --- a/ushadow/frontend/src/components/services/ServiceCategoryList.tsx +++ b/ushadow/frontend/src/components/services/ServiceCategoryList.tsx @@ -1,6 +1,6 @@ import { ReactNode } from 'react' import { ChevronDown, ChevronRight } from 'lucide-react' -import type { ServiceInstance } from 
'../../contexts/ServicesContext' +import type { ServiceServiceConfig } from '../../contexts/ServicesContext' // ============================================================================ // Types @@ -16,13 +16,13 @@ interface ServiceCategoryListProps { /** Service categories to display */ categories: ServiceCategory[] /** Services grouped by category ID */ - servicesByCategory: Record + servicesByCategory: Record /** Set of expanded category IDs */ expandedCategories: Set /** Callback when category is toggled */ onToggleCategory: (categoryId: string) => void /** Render function for each service card */ - renderServiceCard: (service: ServiceInstance) => ReactNode + renderServiceCard: (service: ServiceServiceConfig) => ReactNode } // ============================================================================ diff --git a/ushadow/frontend/src/components/services/ServiceConfigForm.tsx b/ushadow/frontend/src/components/services/ServiceConfigForm.tsx index 4b090663..b69aa4a0 100644 --- a/ushadow/frontend/src/components/services/ServiceConfigForm.tsx +++ b/ushadow/frontend/src/components/services/ServiceConfigForm.tsx @@ -1,6 +1,6 @@ import { useState } from 'react' import { Edit2, Save, X, Loader2, Plus, Trash2, Key, FileText } from 'lucide-react' -import type { ConfigField, ServiceInstance } from '../../contexts/ServicesContext' +import type { ConfigField, ServiceServiceConfig } from '../../contexts/ServicesContext' import { shouldShowField, maskValue } from '../../hooks/useServiceStatus' import { SecretInput, SettingField } from '../settings' @@ -10,7 +10,7 @@ import { SecretInput, SettingField } from '../settings' interface ServiceConfigFormProps { /** The service being configured */ - service: ServiceInstance + service: ServiceServiceConfig /** Current saved config values */ config: Record /** Whether we're in edit mode */ diff --git a/ushadow/frontend/src/components/services/ServiceStatsCards.tsx b/ushadow/frontend/src/components/services/ServiceStatsCards.tsx 
index ba36cdd1..37f321b2 100644 --- a/ushadow/frontend/src/components/services/ServiceStatsCards.tsx +++ b/ushadow/frontend/src/components/services/ServiceStatsCards.tsx @@ -22,7 +22,7 @@ interface ServiceStatsCardsProps { * * @example * diff --git a/ushadow/frontend/src/components/services/ServiceStatusBadge.tsx b/ushadow/frontend/src/components/services/ServiceStatusBadge.tsx index 3a88fce9..5b9f901d 100644 --- a/ushadow/frontend/src/components/services/ServiceStatusBadge.tsx +++ b/ushadow/frontend/src/components/services/ServiceStatusBadge.tsx @@ -1,6 +1,6 @@ import { Loader2, PlayCircle, StopCircle, LucideIcon } from 'lucide-react' import type { ServiceStatusResult } from '../../hooks/useServiceStatus' -import type { ServiceInstance } from '../../contexts/ServicesContext' +import type { ServiceServiceConfig } from '../../contexts/ServicesContext' // ============================================================================ // Types @@ -8,7 +8,7 @@ import type { ServiceInstance } from '../../contexts/ServicesContext' interface ServiceStatusBadgeProps { /** The service instance */ - service: ServiceInstance + service: ServiceServiceConfig /** Computed status from useServiceStatus hook */ status: ServiceStatusResult /** Whether service is currently starting */ diff --git a/ushadow/frontend/src/components/wiring/WiringBoard.tsx b/ushadow/frontend/src/components/wiring/WiringBoard.tsx index a559127c..8f4b4dd5 100644 --- a/ushadow/frontend/src/components/wiring/WiringBoard.tsx +++ b/ushadow/frontend/src/components/wiring/WiringBoard.tsx @@ -73,9 +73,9 @@ interface ConsumerInfo { interface WiringInfo { id: string - source_instance_id: string + source_config_id: string source_capability: string - target_instance_id: string + target_config_id: string target_capability: string } @@ -92,8 +92,8 @@ interface WiringBoardProps { onProviderDrop: (dropInfo: DropInfo) => void onDeleteWiring: (consumerId: string, capability: string) => Promise onEditProvider: (providerId: 
string, isTemplate: boolean) => void - onCreateInstance: (templateId: string) => void - onDeleteInstance: (instanceId: string) => void + onCreateServiceConfig: (templateId: string) => void + onDeleteServiceConfig: (instanceId: string) => void onStartProvider?: (providerId: string, isTemplate: boolean) => Promise onStopProvider?: (providerId: string, isTemplate: boolean) => Promise // Consumer/Service callbacks @@ -110,8 +110,8 @@ export default function WiringBoard({ onProviderDrop, onDeleteWiring, onEditProvider, - onCreateInstance, - onDeleteInstance, + onCreateServiceConfig, + onDeleteServiceConfig, onStartProvider, onStopProvider, onEditConsumer, @@ -214,11 +214,11 @@ export default function WiringBoard({ // Get provider for a specific consumer's capability slot const getProviderForSlot = (consumerId: string, capability: string) => { const wire = wiring.find( - (w) => w.target_instance_id === consumerId && w.target_capability === capability + (w) => w.target_config_id === consumerId && w.target_capability === capability ) if (wire) { return { - provider: providers.find((p) => p.id === wire.source_instance_id), + provider: providers.find((p) => p.id === wire.source_config_id), capability, } } @@ -262,7 +262,7 @@ export default function WiringBoard({ {Object.values(templates).map(({ template, instances }) => { if (!template) return null // Skip if template not loaded const templateConnectionCount = wiring.filter( - (w) => w.source_instance_id === template.id + (w) => w.source_config_id === template.id ).length return (
@@ -271,7 +271,7 @@ export default function WiringBoard({ provider={template} connectionCount={templateConnectionCount} onEdit={() => onEditProvider(template.id, true)} - onCreateInstance={() => onCreateInstance(template.id)} + onCreateServiceConfig={() => onCreateServiceConfig(template.id)} onStart={onStartProvider ? () => onStartProvider(template.id, true) : undefined} onStop={onStopProvider ? () => onStopProvider(template.id, true) : undefined} /> @@ -280,7 +280,7 @@ export default function WiringBoard({
{instances.map((instance) => { const instanceConnectionCount = wiring.filter( - (w) => w.source_instance_id === instance.id + (w) => w.source_config_id === instance.id ).length return ( onEditProvider(instance.id, false)} - onDelete={() => onDeleteInstance(instance.id)} + onDelete={() => onDeleteServiceConfig(instance.id)} onStart={onStartProvider ? () => onStartProvider(instance.id, false) : undefined} onStop={onStopProvider ? () => onStopProvider(instance.id, false) : undefined} templateProvider={template} @@ -392,14 +392,14 @@ interface DraggableProviderProps { provider: ProviderInfo connectionCount: number onEdit: () => void - onCreateInstance?: () => void // Only for templates + onCreateServiceConfig?: () => void // Only for templates onDelete?: () => void // Only for instances onStart?: () => Promise onStop?: () => Promise templateProvider?: ProviderInfo // Parent template for instances } -function DraggableProvider({ provider, connectionCount, onEdit, onCreateInstance, onDelete, onStart, onStop, templateProvider }: DraggableProviderProps) { +function DraggableProvider({ provider, connectionCount, onEdit, onCreateServiceConfig, onDelete, onStart, onStop }: DraggableProviderProps) { const [isStarting, setIsStarting] = useState(false) const { attributes, listeners, setNodeRef, isDragging } = useDraggable({ id: provider.id, @@ -529,9 +529,9 @@ function DraggableProvider({ provider, connectionCount, onEdit, onCreateInstance > - {provider.isTemplate && onCreateInstance && ( + {provider.isTemplate && onCreateServiceConfig && (
)} - {/* Templates - Compose Services with Instances */} + {/* Templates - Compose Services with ServiceConfigs */} {composeTemplates.length > 0 && (

@@ -1981,7 +1981,7 @@ export default function InstancesPage() {
{composeTemplates.map((template) => { - const templateInstances = instancesByTemplate[template.id] || [] + const templateServiceConfigs = instancesByTemplate[template.id] || [] const isExpanded = expandedTemplates.has(template.id) return ( @@ -2019,16 +2019,16 @@ export default function InstancesPage() { )}
- {templateInstances.length > 0 && ( + {templateServiceConfigs.length > 0 && ( - {templateInstances.length} {templateInstances.length === 1 ? 'instance' : 'instances'} + {templateServiceConfigs.length} {templateServiceConfigs.length === 1 ? 'instance' : 'instances'} )}
- {/* Service Instances (indented) */} - {isExpanded && templateInstances.length > 0 && ( + {/* Service ServiceConfigs (indented) */} + {isExpanded && templateServiceConfigs.length > 0 && (
- {templateInstances.map((instance) => { + {templateServiceConfigs.map((instance) => { const details = instanceDetails[instance.id] const isRunning = details?.status === 'running' @@ -2086,7 +2086,7 @@ export default function InstancesPage() { e.stopPropagation() handleRestartService(instance.id) }} - disabled={restartingInstance === instance.id} + disabled={restartingServiceConfig === instance.id} className="p-1.5 text-neutral-600 hover:text-neutral-900 dark:text-neutral-400 dark:hover:text-neutral-100 rounded hover:bg-neutral-200 dark:hover:bg-neutral-700" title="Restart" > @@ -2108,7 +2108,7 @@ export default function InstancesPage() {
@@ -2188,21 +2188,21 @@ export default function InstancesPage() { onProviderDrop={handleProviderDrop} onDeleteWiring={handleDeleteWiringFromBoard} onEditProvider={handleEditProviderFromBoard} - onCreateInstance={handleCreateInstanceFromBoard} - onDeleteInstance={handleDeleteInstance} + onCreateServiceConfig={handleCreateServiceConfigFromBoard} + onDeleteServiceConfig={handleDeleteServiceConfig} onStartProvider={async (providerId, isTemplate) => { if (isTemplate) { // For templates, we can't deploy them directly - need to create instance first // This case shouldn't happen as templates don't have start buttons in current UI return } - await handleDeployInstance(providerId) + await handleDeployServiceConfig(providerId) }} onStopProvider={async (providerId, isTemplate) => { if (isTemplate) { return } - await handleUndeployInstance(providerId) + await handleUndeployServiceConfig(providerId) }} onEditConsumer={handleEditConsumer} onStartConsumer={handleStartConsumer} @@ -2212,89 +2212,89 @@ export default function InstancesPage() {

- {/* Unified Create Instance Modal (used by both + button and drag-drop) */} + {/* Unified Create ServiceConfig Modal (used by both + button and drag-drop) */} setCreateInstanceState({ ...createInstanceState, isOpen: false })} - title={createInstanceState.wiring ? 'Connect Provider' : 'Create Instance'} - titleIcon={createInstanceState.wiring ? : } + isOpen={createServiceConfigState.isOpen} + onClose={() => setCreateServiceConfigState({ ...createServiceConfigState, isOpen: false })} + title={createServiceConfigState.wiring ? 'Connect Provider' : 'Create ServiceConfig'} + titleIcon={createServiceConfigState.wiring ? : } maxWidth="lg" testId="create-instance-modal" > - {createInstanceState.template && ( + {createServiceConfigState.template && (
{/* Wiring connection visual (only shown for drag-drop path) */} - {createInstanceState.wiring && ( + {createServiceConfigState.wiring && (

- {createInstanceState.template.name} + {createServiceConfigState.template.name}

-

{createInstanceState.wiring.capability}

+

{createServiceConfigState.wiring.capability}

- {createInstanceState.wiring.consumerName} + {createServiceConfigState.wiring.consumerName}

-

{createInstanceState.wiring.capability} slot

+

{createServiceConfigState.wiring.capability} slot

)} {/* Template info (only shown for + button path) */} - {!createInstanceState.wiring && ( + {!createServiceConfigState.wiring && (
- {createInstanceState.template.source === 'compose' ? ( + {createServiceConfigState.template.source === 'compose' ? ( ) : ( )}

- {createInstanceState.template.name} + {createServiceConfigState.template.name}

-

{createInstanceState.template.description}

+

{createServiceConfigState.template.description}

)} - {/* Instance Name */} + {/* ServiceConfig Name */}
- setCreateInstanceState((prev) => ({ + setCreateServiceConfigState((prev) => ({ ...prev, form: { ...prev.form, name: e.target.value }, })) } className="input w-full text-sm" - placeholder={createInstanceState.form.id} + placeholder={createServiceConfigState.form.id} data-testid="create-instance-name" />
{/* Config fields using ConfigFieldRow */} - {createInstanceState.template.config_schema && createInstanceState.template.config_schema.length > 0 && ( + {createServiceConfigState.template.config_schema && createServiceConfigState.template.config_schema.length > 0 && (
- {createInstanceState.template.config_schema.map((field: any) => ( + {createServiceConfigState.template.config_schema.map((field: any) => ( - setCreateInstanceState((prev) => ({ + setCreateServiceConfigState((prev) => ({ ...prev, form: { ...prev.form, @@ -2310,49 +2310,49 @@ export default function InstancesPage() { {/* Help text */}

- {createInstanceState.wiring - ? 'Instance will be created and connected to the service slot.' + {createServiceConfigState.wiring + ? 'ServiceConfig will be created and connected to the service slot.' : 'Leave fields blank to use default settings. Only modified values will be stored.'}

{/* Modal Footer */}
)} - {/* Edit Provider/Instance Modal */} + {/* Edit Provider/ServiceConfig Modal */} setEditingProvider(null)} - title={editingProvider?.isTemplate ? 'Edit Provider' : 'Edit Instance'} + title={editingProvider?.isTemplate ? 'Edit Provider' : 'Edit ServiceConfig'} titleIcon={} maxWidth="lg" testId="edit-provider-modal" > {editingProvider && editingProvider.template && (
- {/* Provider/Instance name */} + {/* Provider/ServiceConfig name */}

{editingProvider.name} @@ -2497,12 +2497,12 @@ export default function InstancesPage() { {/* Confirmation Dialog */} setConfirmDialog({ isOpen: false, instanceId: null })} /> @@ -2878,7 +2878,7 @@ function TemplateCard({ template, isExpanded, onToggle, onCreate, onRemove }: Te data-testid={`create-from-template-${template.id}`} > - Create Instance + Create ServiceConfig )}

diff --git a/ushadow/frontend/src/services/api.ts b/ushadow/frontend/src/services/api.ts index 4214f304..20d9a32d 100644 --- a/ushadow/frontend/src/services/api.ts +++ b/ushadow/frontend/src/services/api.ts @@ -573,7 +573,7 @@ export const kubernetesApi = { `/api/kubernetes/${clusterId}/envmap`, { namespace: 'default', ...data } ), - deployService: (clusterId: string, data: { service_id: string; namespace?: string; k8s_spec?: any; instance_id?: string }) => + deployService: (clusterId: string, data: { service_id: string; namespace?: string; k8s_spec?: any; config_id?: string }) => api.post<{ success: boolean; message: string; service_id: string; namespace: string }>( `/api/kubernetes/${clusterId}/deploy`, { namespace: 'default', ...data } @@ -1064,14 +1064,14 @@ export const memoriesApi = { } // ============================================================================= -// Instances API (templates, instances, wiring) +// ServiceConfigs API (templates, instances, wiring) // ============================================================================= /** Template source - where the template was discovered from */ export type TemplateSource = 'compose' | 'provider' -/** Instance status */ -export type InstanceStatus = 'pending' | 'deploying' | 'running' | 'stopped' | 'error' | 'n/a' +/** ServiceConfig status */ +export type ServiceConfigStatus = 'pending' | 'deploying' | 'running' | 'stopped' | 'error' | 'n/a' /** Template - discovered from compose or provider files */ export interface Template { @@ -1104,28 +1104,28 @@ export interface Template { installed: boolean // Whether service is installed (for compose services) } -/** Instance config values */ -export interface InstanceConfig { +/** ServiceConfig config values */ +export interface ConfigValues { values: Record } -/** Instance outputs after deployment */ -export interface InstanceOutputs { +/** ServiceConfig outputs after deployment */ +export interface ServiceOutputs { access_url?: string env_vars: Record 
capability_values: Record } -/** Instance - configured deployment of a template */ -export interface Instance { +/** ServiceConfig - configured deployment of a template */ +export interface ServiceConfig { id: string template_id: string name: string description?: string - config: InstanceConfig + config: ConfigValues deployment_target?: string - status: InstanceStatus - outputs: InstanceOutputs + status: ServiceConfigStatus + outputs: ServiceOutputs container_id?: string container_name?: string deployment_id?: string @@ -1144,12 +1144,12 @@ export interface Instance { next_sync_at?: string } -/** Instance summary for list views */ -export interface InstanceSummary { +/** ServiceConfig summary for list views */ +export interface ServiceConfigSummary { id: string template_id: string name: string - status: InstanceStatus + status: ServiceConfigStatus provides?: string deployment_target?: string access_url?: string @@ -1158,15 +1158,15 @@ export interface InstanceSummary { /** Wiring connection between instances */ export interface Wiring { id: string - source_instance_id: string + source_config_id: string source_capability: string - target_instance_id: string + target_config_id: string target_capability: string created_at?: string } /** Request to create an instance */ -export interface InstanceCreateRequest { +export interface ServiceConfigCreateRequest { id: string template_id: string name: string @@ -1176,7 +1176,7 @@ export interface InstanceCreateRequest { } /** Request to update an instance */ -export interface InstanceUpdateRequest { +export interface ServiceConfigUpdateRequest { name?: string description?: string config?: Record @@ -1185,75 +1185,75 @@ export interface InstanceUpdateRequest { /** Request to create wiring */ export interface WiringCreateRequest { - source_instance_id: string + source_config_id: string source_capability: string - target_instance_id: string + target_config_id: string target_capability: string } -export const instancesApi = { 
+export const svcConfigsApi = { // Templates /** List all templates (compose services + providers) */ getTemplates: (source?: TemplateSource) => - api.get('/api/instances/templates', { params: source ? { source } : {} }), + api.get('/api/svc-configs/templates', { params: source ? { source } : {} }), /** Get a template by ID */ getTemplate: (templateId: string) => - api.get