diff --git a/.env.example b/.env.example index dc45a80..9b5937c 100644 --- a/.env.example +++ b/.env.example @@ -1,176 +1,41 @@ # Code Interpreter API Configuration +# Only settings you're likely to change are listed here. +# All other settings have sensible defaults — see docs/CONFIGURATION.md for the full list. -# API Configuration -API_HOST=0.0.0.0 -API_PORT=8000 -API_DEBUG=false -API_RELOAD=false - -# SSL/HTTPS Configuration -ENABLE_HTTPS=false -HTTPS_PORT=443 -SSL_REDIRECT=false - -# Docker: Path to directory containing cert.pem and key.pem on the host -# The directory is mounted to /app/ssl/ inside the container automatically. -# Default is ./ssl (relative to docker-compose.yml) -# SSL_CERTS_PATH=/path/to/your/ssl/certs - -# Non-Docker only: Absolute paths to certificate files (not needed for Docker) -# SSL_CERT_FILE=/path/to/cert.pem -# SSL_KEY_FILE=/path/to/key.pem -# SSL_CA_CERTS=/path/to/ca.pem - -# Authentication Configuration +# ── Authentication ────────────────────────────────────────────── API_KEY=your-secure-api-key-here-change-this-in-production # API_KEYS=key1,key2,key3 # Additional API keys (comma-separated) -API_KEY_HEADER=x-api-key -API_KEY_CACHE_TTL=300 +# MASTER_API_KEY=your-secure-master-key # Required for admin dashboard CLI -# API Key Management Configuration -# MASTER_API_KEY=your-secure-master-key # Required for CLI key management -RATE_LIMIT_ENABLED=true - -# Redis Configuration +# ── Redis ─────────────────────────────────────────────────────── REDIS_HOST=localhost REDIS_PORT=6379 -REDIS_PASSWORD= -REDIS_DB=0 -# Alternative: Use Redis URL instead of individual settings -# REDIS_URL=redis://localhost:6379/0 -REDIS_MAX_CONNECTIONS=20 -REDIS_SOCKET_TIMEOUT=5 -REDIS_SOCKET_CONNECT_TIMEOUT=5 +# REDIS_PASSWORD= +# REDIS_URL=redis://localhost:6379/0 # Alternative to individual settings -# MinIO/S3 Configuration +# ── MinIO / S3 ───────────────────────────────────────────────── MINIO_ENDPOINT=localhost:9000 MINIO_ACCESS_KEY=minioadmin 
MINIO_SECRET_KEY=minioadmin -MINIO_SECURE=false -MINIO_BUCKET=code-interpreter-files -MINIO_REGION=us-east-1 - -# Docker Configuration -DOCKER_IMAGE_REGISTRY=code-interpreter -# DOCKER_BASE_URL=unix://var/run/docker.sock -DOCKER_TIMEOUT=60 -DOCKER_NETWORK_MODE=none -DOCKER_READ_ONLY=true - -# Resource Limits - Execution -MAX_EXECUTION_TIME=120 -MAX_MEMORY_MB=512 -MAX_CPUS=1 -MAX_PIDS=512 -MAX_OPEN_FILES=1024 - -# Resource Limits - Files -MAX_FILE_SIZE_MB=10 -MAX_TOTAL_FILE_SIZE_MB=50 -MAX_FILES_PER_SESSION=50 -MAX_OUTPUT_FILES=10 -MAX_FILENAME_LENGTH=255 - -# Resource Limits - Sessions -MAX_CONCURRENT_EXECUTIONS=10 -MAX_SESSIONS_PER_ENTITY=100 - -# Session Configuration -# TTL applies only to MinIO-stored session data (files/metadata). Containers are ephemeral per execution. -SESSION_TTL_HOURS=24 -SESSION_CLEANUP_INTERVAL_MINUTES=60 -SESSION_ID_LENGTH=32 +# MINIO_SECURE=false +# MINIO_BUCKET=code-interpreter-files -# MinIO Orphan Cleanup (optional) -# Enable periodic pruning of MinIO objects older than TTL with missing Redis sessions -ENABLE_ORPHAN_MINIO_CLEANUP=true +# ── Execution Limits ─────────────────────────────────────────── +# MAX_EXECUTION_TIME=30 # Seconds (default: 30) +# MAX_MEMORY_MB=512 # Per-execution memory limit -# Container Pool Configuration -CONTAINER_POOL_ENABLED=true -CONTAINER_POOL_WARMUP_ON_STARTUP=true +# ── Sandbox Pool (Python REPL) ───────────────────────────────── +# SANDBOX_POOL_ENABLED=true +# SANDBOX_POOL_PY=5 # Number of pre-warmed Python REPLs +# REPL_ENABLED=true -# Per-language pool sizes (0 = on-demand only, no pre-warming) -# Only set the languages you want to pre-warm -CONTAINER_POOL_PY=5 # Python -CONTAINER_POOL_JS=2 # JavaScript -# CONTAINER_POOL_TS=0 # TypeScript (default: 0) -# CONTAINER_POOL_GO=0 # Go (default: 0) -# CONTAINER_POOL_JAVA=0 # Java (default: 0) -# CONTAINER_POOL_C=0 # C (default: 0) -# CONTAINER_POOL_CPP=0 # C++ (default: 0) -# CONTAINER_POOL_PHP=0 # PHP (default: 0) -# CONTAINER_POOL_RS=0 # Rust (default: 
0) -# CONTAINER_POOL_R=0 # R (default: 0) -# CONTAINER_POOL_F90=0 # Fortran (default: 0) -# CONTAINER_POOL_D=0 # D (default: 0) - -# Pool optimization settings -CONTAINER_POOL_PARALLEL_BATCH=5 -CONTAINER_POOL_REPLENISH_INTERVAL=2 -CONTAINER_POOL_EXHAUSTION_TRIGGER=true - -# REPL Configuration (Python Fast Execution) -# Pre-warmed Python interpreter for ~20-40ms execution latency -REPL_ENABLED=true -REPL_WARMUP_TIMEOUT_SECONDS=15 -REPL_HEALTH_CHECK_TIMEOUT_SECONDS=5 - -# State Persistence Configuration (Python) -# Enables Python variable/function persistence across executions within same session -STATE_PERSISTENCE_ENABLED=true -# Redis hot storage TTL (default: 2 hours) -STATE_TTL_SECONDS=7200 -# Maximum serialized state size -STATE_MAX_SIZE_MB=50 -# Capture state even on execution failure -STATE_CAPTURE_ON_ERROR=false - -# State Archival Configuration (Python) -# Archives inactive states from Redis to MinIO for long-term storage -STATE_ARCHIVE_ENABLED=true -# Archive to MinIO after this inactivity period (default: 1 hour) -STATE_ARCHIVE_AFTER_SECONDS=3600 -# Keep archived states in MinIO for this many days (default: 1 day / 24 hours) -STATE_ARCHIVE_TTL_DAYS=1 -# How often to check for states to archive -STATE_ARCHIVE_CHECK_INTERVAL_SECONDS=300 - -# Detailed Metrics Configuration -# Track per-API-key, per-language execution metrics -DETAILED_METRICS_ENABLED=true -# Maximum metrics to buffer in memory -METRICS_BUFFER_SIZE=10000 -# Archive metrics to MinIO for long-term analysis -METRICS_ARCHIVE_ENABLED=true -# Keep archived metrics for this many days -METRICS_ARCHIVE_RETENTION_DAYS=90 - -# Security Configuration -ENABLE_NETWORK_ISOLATION=true -ENABLE_FILESYSTEM_ISOLATION=true - -# WAN Network Access Configuration -# When enabled, execution containers can access the public internet -# but are blocked from accessing host, other containers, and private networks -# IMPORTANT: Requires NET_ADMIN capability for iptables management -ENABLE_WAN_ACCESS=false 
-WAN_NETWORK_NAME=code-interpreter-wan -# WAN_DNS_SERVERS=8.8.8.8,1.1.1.1,8.8.4.4 - -# Logging Configuration -LOG_LEVEL=INFO -LOG_FORMAT=json -# LOG_FILE=/var/log/code-interpreter-api.log -LOG_MAX_SIZE_MB=100 -LOG_BACKUP_COUNT=5 -ENABLE_ACCESS_LOGS=true -ENABLE_SECURITY_LOGS=true - -# Health Check Configuration -HEALTH_CHECK_INTERVAL=30 -HEALTH_CHECK_TIMEOUT=5 +# ── SSL/HTTPS ────────────────────────────────────────────────── +# ENABLE_HTTPS=false +# HTTPS_PORT=443 +# SSL_CERT_FILE=/path/to/cert.pem +# SSL_KEY_FILE=/path/to/key.pem -# Development Configuration -ENABLE_CORS=false -# CORS_ORIGINS=http://localhost:3000,http://localhost:8080 -ENABLE_DOCS=false \ No newline at end of file +# ── Logging ──────────────────────────────────────────────────── +# LOG_LEVEL=INFO # INFO = clean (1 log per execution); DEBUG = full detail +# LOG_FORMAT=json # json (structured) or text (human-readable, colored) +# ENABLE_ACCESS_LOGS=false # Set true to enable uvicorn per-request access logs diff --git a/.github/workflows/docker-publish.yml b/.github/workflows/docker-publish.yml index 81f62a1..4327eca 100644 --- a/.github/workflows/docker-publish.yml +++ b/.github/workflows/docker-publish.yml @@ -6,6 +6,7 @@ on: tags: ["v*.*.*"] paths: - 'Dockerfile' + - 'docker/**' - 'src/**' - 'requirements.txt' - 'pyproject.toml' diff --git a/.github/workflows/execution-env-publish.yml b/.github/workflows/execution-env-publish.yml deleted file mode 100644 index 8b681e0..0000000 --- a/.github/workflows/execution-env-publish.yml +++ /dev/null @@ -1,127 +0,0 @@ -name: Execution Env Build and Publish - -on: - push: - branches: ["main", "dev"] - paths: - - "docker/**" - - ".github/workflows/execution-env-publish.yml" - workflow_dispatch: - inputs: - build_all: - description: 'Force build all images' - type: boolean - default: false - -env: - REGISTRY: ghcr.io - IMAGE_BASE: ${{ github.repository }} - ALL_LANGUAGES: '["python", "nodejs", "go", "java", "c-cpp", "php", "rust", "fortran", "r", "d"]' - 
-jobs: - filter: - runs-on: ubuntu-latest - outputs: - changes: ${{ steps.determine-targets.outputs.languages }} - steps: - - uses: actions/checkout@v4 - - uses: dorny/paths-filter@v3 - id: changes - with: - filters: | - rebuild_all: - - '.github/workflows/execution-env-publish.yml' - - 'docker/entrypoint.sh' - python: - - 'docker/python.Dockerfile' - - 'docker/repl_server.py' - - 'docker/requirements/**' - nodejs: - - 'docker/nodejs.Dockerfile' - go: - - 'docker/go.Dockerfile' - java: - - 'docker/java.Dockerfile' - c-cpp: - - 'docker/c-cpp.Dockerfile' - php: - - 'docker/php.Dockerfile' - rust: - - 'docker/rust.Dockerfile' - fortran: - - 'docker/fortran.Dockerfile' - r: - - 'docker/r.Dockerfile' - d: - - 'docker/d.Dockerfile' - - - name: Determine targets - id: determine-targets - run: | - if [ "${{ github.event_name }}" == "workflow_dispatch" ] && [ "${{ github.event.inputs.build_all }}" == "true" ]; then - echo 'languages=${{ env.ALL_LANGUAGES }}' >> $GITHUB_OUTPUT - elif [ "${{ steps.changes.outputs.rebuild_all }}" == "true" ]; then - # If workflow or shared files changed, build everything - echo 'languages=${{ env.ALL_LANGUAGES }}' >> $GITHUB_OUTPUT - else - # Filter out rebuild_all from the changes list - CHANGES='${{ steps.changes.outputs.changes }}' - FILTERED=$(echo "$CHANGES" | jq -c '[.[] | select(. 
!= "rebuild_all")]') - echo "languages=$FILTERED" >> $GITHUB_OUTPUT - fi - - build-images: - needs: filter - if: ${{ needs.filter.outputs.changes != '[]' && needs.filter.outputs.changes != '' }} - runs-on: ubuntu-latest - strategy: - fail-fast: false - matrix: - language: ${{ fromJSON(needs.filter.outputs.changes) }} - - permissions: - contents: read - packages: write - - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - - name: Set up QEMU - uses: docker/setup-qemu-action@v3 - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - - - name: Log into registry ${{ env.REGISTRY }} - uses: docker/login-action@v3 - with: - registry: ${{ env.REGISTRY }} - username: ${{ github.actor }} - password: ${{ secrets.GITHUB_TOKEN }} - - - name: Extract Docker metadata - id: meta - uses: docker/metadata-action@v5 - with: - # Image name format: ghcr.io/owner/repo/python - images: ${{ env.REGISTRY }}/${{ env.IMAGE_BASE }}/${{ matrix.language }} - tags: | - type=ref,event=branch - type=semver,pattern={{version}} - type=sha,format=long - type=raw,value=latest,enable={{is_default_branch}} - - - name: Build and push ${{ matrix.language }} image - uses: docker/build-push-action@v6 - with: - context: docker - file: docker/${{ matrix.language }}.Dockerfile - push: true - platforms: linux/amd64,linux/arm64 - tags: ${{ steps.meta.outputs.tags }} - labels: ${{ steps.meta.outputs.labels }} - cache-from: type=gha - cache-to: type=gha,mode=max - build-args: | - BUILD_DATE=$(date -u +'%Y-%m-%dT%H:%M:%SZ') diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 521afb0..e64994b 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -20,7 +20,7 @@ jobs: run: | python -m pip install --upgrade pip pip install -r requirements.txt - pip install pytest pytest-asyncio pytest-cov pytest-mock bandit + pip install flake8 black mypy pytest pytest-asyncio pytest-cov pytest-mock bandit - name: Lint with flake8 run: | @@ -45,3 +45,20 @@ 
jobs: - name: Run Unit Tests run: | pytest tests/unit/ + + docker-build: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Validate Docker build + uses: docker/build-push-action@v6 + with: + context: . + push: false + load: true + cache-from: type=gha + cache-to: type=gha,mode=max diff --git a/CHANGELOG.md b/CHANGELOG.md index c0f71d1..59ed3bb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,18 +8,23 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] ### Added -- Container hardening with host information masking to prevent information leakage -- Optional WAN-only network access mode for containers with private IP blocking +- nsjail-based sandboxing for code execution (replaces Docker socket-based approach) +- Single unified Docker image with all 12 language runtimes - Hour and day periods for execution heatmap visualizations - MyPy type checking integration with comprehensive type hints - Dynamic Content Security Policy headers based on request path ### Changed +- Migrated from per-language Docker containers to nsjail sandboxes for isolation +- Replaced ContainerPool/Manager/Executor with SandboxPool/Manager/Executor +- Simplified Docker setup: single Dockerfile and docker-compose.yml - Improved heatmap UI styling for better visualization -- Enhanced development environment with source directory mounting in docker-compose - Updated Pydantic settings configuration for better type safety ### Removed +- Per-language Docker images and build-images.sh script +- Docker SDK dependency (no Docker socket needed) +- docker-compose.ghcr.yml (single compose file now) - Deprecated baseline performance documentation files - Legacy deployment scripts @@ -30,24 +35,22 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 #### Core Features - Multi-language code execution supporting 12 languages: Python, 
JavaScript, TypeScript, Go, Java, C, C++, PHP, Rust, R, Fortran, and D - FastAPI-based REST API with interactive documentation -- Docker-based sandboxed execution environments with comprehensive security controls +- Sandboxed execution environments with comprehensive security controls - Redis-based session management with automatic cleanup - MinIO/S3-compatible storage integration for persistent file storage #### Performance Features - REPL mode for Python with pre-warmed interpreter achieving 20-40ms execution latency -- Container pooling system with pre-warmed containers for ~3ms acquisition time -- Per-language container pool sizing for optimized resource allocation +- Sandbox pooling system with pre-warmed sandboxes for ~3ms acquisition time - Thread-safe execution supporting 10+ concurrent requests - State persistence for Python sessions with Redis and MinIO archival #### Security Features - API key-based authentication with rate limiting -- Network isolation with `network_mode: none` by default -- Read-only filesystem in containers -- All Linux capabilities dropped (`cap_drop: ALL`) -- Tmpfs mounts with `noexec,nosuid` flags -- No-new-privileges security option +- nsjail-based sandbox isolation with PID, mount, and network namespaces +- Seccomp syscall filtering +- Cgroup-based resource limits for CPU, memory, and PID count +- Non-root code execution (uid 1001) - Resource limits for CPU, memory, and execution time - Input validation and sanitization diff --git a/Dockerfile b/Dockerfile index 96e40f4..ebfa4c1 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,82 +1,313 @@ -# Multi-stage build for Code Interpreter API -FROM python:3.11-slim as builder - -# Set build arguments -ARG BUILD_DATE -ARG VERSION=1.0.0 -ARG VCS_REF - -# Add metadata -LABEL maintainer="LibreCodeInterpreter Contributors" \ - org.opencontainers.image.title="LibreCodeInterpreter" \ - org.opencontainers.image.description="Secure API for executing code in isolated environments" \ - 
org.opencontainers.image.version="${VERSION}" \ - org.opencontainers.image.created="${BUILD_DATE}" \ - org.opencontainers.image.revision="${VCS_REF}" \ - org.opencontainers.image.source="https://github.com/LibreCodeInterpreter/LibreCodeInterpreter" \ - org.opencontainers.image.licenses="Apache-2.0" - -# Install system dependencies for building -RUN apt-get update && apt-get install -y \ - build-essential \ - curl \ +# syntax=docker/dockerfile:1.4 +# Unified Dockerfile: All 12 language runtimes + nsjail + API +# This image supports sandboxed code execution without requiring Docker-in-Docker. + +FROM ubuntu:24.04 + +ARG DEBIAN_FRONTEND=noninteractive + +# ============================================ +# System dependencies + nsjail +# ============================================ +RUN apt-get update && apt-get install -y --no-install-recommends \ + # Build tools for nsjail + git cmake build-essential pkg-config \ + libprotobuf-dev protobuf-compiler \ + libnl-3-dev libnl-route-3-dev \ + flex bison \ + # Common system tools + curl wget ca-certificates gnupg software-properties-common \ + # Shared libraries needed across languages + libssl-dev libffi-dev libxml2-dev libxslt-dev zlib1g-dev \ && rm -rf /var/lib/apt/lists/* -# Set working directory -WORKDIR /app +# Build nsjail from source +RUN git clone --depth 1 https://github.com/google/nsjail.git /tmp/nsjail && \ + cd /tmp/nsjail && make -j$(nproc) && \ + cp /tmp/nsjail/nsjail /usr/local/bin/nsjail && \ + chmod +x /usr/local/bin/nsjail && \ + rm -rf /tmp/nsjail + +# ============================================ +# Python 3.12 (primary runtime) +# ============================================ +RUN apt-get update && apt-get install -y --no-install-recommends \ + python3 python3-pip python3-venv python3-dev \ + python3-tk \ + # Libraries needed for Python packages (from python.Dockerfile) + gcc g++ make pkg-config \ + libcairo2-dev libpango1.0-dev libgdk-pixbuf-2.0-dev \ + libjpeg-dev libpng-dev libtiff-dev libopenjp2-7-dev \ 
+ libfreetype6-dev liblcms2-dev libwebp-dev \ + tcl8.6-dev tk8.6-dev \ + poppler-utils tesseract-ocr pandoc \ + portaudio19-dev flac ffmpeg \ + libpulse-dev libsdl2-dev libsdl2-mixer-dev libsdl2-image-dev libsdl2-ttf-dev \ + antiword unrtf \ + && rm -rf /var/lib/apt/lists/* -# Copy requirements first for better caching -COPY requirements.txt . +# Copy Python requirements files +COPY docker/requirements/python-core.txt /tmp/python-core.txt +COPY docker/requirements/python-analysis.txt /tmp/python-analysis.txt +COPY docker/requirements/python-visualization.txt /tmp/python-visualization.txt +COPY docker/requirements/python-documents.txt /tmp/python-documents.txt +COPY docker/requirements/python-utilities.txt /tmp/python-utilities.txt +COPY docker/requirements/python-new.txt /tmp/python-new.txt -# Install Python dependencies -RUN pip install --no-cache-dir --upgrade pip && \ - pip install --no-cache-dir -r requirements.txt +# Install Python build tooling +RUN --mount=type=cache,target=/root/.cache/pip \ + python3 -m pip install --break-system-packages --ignore-installed \ + "pip<24.1" "setuptools<70" wheel "packaging<24" -# Production stage -FROM python:3.11-slim as production +# Install Python packages in layers (most stable first) +RUN --mount=type=cache,target=/root/.cache/pip \ + pip install --break-system-packages -r /tmp/python-core.txt +RUN --mount=type=cache,target=/root/.cache/pip \ + pip install --break-system-packages -r /tmp/python-analysis.txt +RUN --mount=type=cache,target=/root/.cache/pip \ + pip install --break-system-packages -r /tmp/python-visualization.txt +RUN --mount=type=cache,target=/root/.cache/pip \ + pip install --break-system-packages -r /tmp/python-documents.txt +RUN --mount=type=cache,target=/root/.cache/pip \ + pip install --break-system-packages -r /tmp/python-utilities.txt +RUN --mount=type=cache,target=/root/.cache/pip \ + pip install --break-system-packages -r /tmp/python-new.txt -# Install runtime dependencies -RUN apt-get update && 
apt-get install -y \ - curl \ - && rm -rf /var/lib/apt/lists/* \ - && apt-get clean +RUN rm -f /tmp/python-*.txt -# Create non-root user with explicit UID 1000 for consistent volume permissions -# The docker group GID 988 matches the host's docker group for socket access -RUN groupadd -r -g 1000 appuser && useradd -r -u 1000 -g appuser appuser && \ - groupadd -g 988 docker && usermod -aG docker appuser +ENV PYTHONUNBUFFERED=1 \ + PYTHONDONTWRITEBYTECODE=1 \ + PYTHONPATH=/mnt/data -# Set working directory -WORKDIR /app +# ============================================ +# Node.js (for JavaScript / TypeScript) +# ============================================ +RUN curl -fsSL https://deb.nodesource.com/setup_22.x | bash - && \ + apt-get install -y --no-install-recommends nodejs && \ + rm -rf /var/lib/apt/lists/* -# Copy Python packages from builder stage -COPY --from=builder /usr/local/lib/python3.11/site-packages /usr/local/lib/python3.11/site-packages -COPY --from=builder /usr/local/bin /usr/local/bin +# Copy Node.js package list and install globally +COPY docker/requirements/nodejs.txt /tmp/nodejs.txt +RUN --mount=type=cache,target=/root/.npm \ + cat /tmp/nodejs.txt | grep -v '^#' | grep -v '^$' | xargs npm install -g && \ + rm -f /tmp/nodejs.txt -# Copy application code -COPY src/ ./src/ -COPY dashboard/ ./dashboard/ +ENV NODE_ENV=sandbox \ + NODE_PATH=/usr/local/lib/node_modules -COPY .env.example . 
+# ============================================ +# Go +# ============================================ +ARG GO_VERSION=1.23.6 +RUN curl -fsSL "https://go.dev/dl/go${GO_VERSION}.linux-$(dpkg --print-architecture).tar.gz" \ + | tar -C /usr/local -xzf - -# Create necessary directories with correct ownership -RUN mkdir -p /app/logs /app/data /app/ssl && \ - chown -R 1000:1000 /app +ENV PATH="/usr/local/go/bin:${PATH}" \ + GOPATH="/usr/local/gopath" \ + GO111MODULE=on \ + GOPROXY=https://proxy.golang.org,direct \ + GOSUMDB=sum.golang.org -# Switch to non-root user -USER appuser +# Pre-download Go modules +COPY docker/requirements/go.mod /tmp/gosetup/go.mod +RUN --mount=type=cache,target=/usr/local/gopath/pkg/mod \ + cd /tmp/gosetup && go mod download && \ + cd / && rm -rf /tmp/gosetup -# Set environment variables -ENV PYTHONPATH=/app \ - PYTHONUNBUFFERED=1 \ - PYTHONDONTWRITEBYTECODE=1 +# ============================================ +# Java (JDK) +# ============================================ +RUN apt-get update && apt-get install -y --no-install-recommends \ + default-jdk \ + && rm -rf /var/lib/apt/lists/* -# Health check - try HTTPS first, then common HTTP ports -HEALTHCHECK --interval=30s --timeout=15s --start-period=10s --retries=3 \ - CMD curl -f -k https://localhost:443/health 2>/dev/null || curl -f http://localhost:8000/health 2>/dev/null || curl -f http://localhost:80/health || exit 1 +# Download Java libraries (from java.Dockerfile) +RUN mkdir -p /opt/java/lib && cd /opt/java/lib && \ + # Apache Commons + wget -q https://repo1.maven.org/maven2/org/apache/commons/commons-csv/1.10.0/commons-csv-1.10.0.jar && \ + wget -q https://repo1.maven.org/maven2/org/apache/commons/commons-lang3/3.14.0/commons-lang3-3.14.0.jar && \ + wget -q https://repo1.maven.org/maven2/org/apache/commons/commons-math3/3.6.1/commons-math3-3.6.1.jar && \ + wget -q https://repo1.maven.org/maven2/org/apache/commons/commons-collections4/4.4/commons-collections4-4.4.jar && \ + wget -q 
https://repo1.maven.org/maven2/org/apache/commons/commons-compress/1.25.0/commons-compress-1.25.0.jar && \ + wget -q https://repo1.maven.org/maven2/org/apache/commons/commons-text/1.11.0/commons-text-1.11.0.jar && \ + # Jackson JSON + wget -q https://repo1.maven.org/maven2/com/fasterxml/jackson/core/jackson-core/2.16.0/jackson-core-2.16.0.jar && \ + wget -q https://repo1.maven.org/maven2/com/fasterxml/jackson/core/jackson-databind/2.16.0/jackson-databind-2.16.0.jar && \ + wget -q https://repo1.maven.org/maven2/com/fasterxml/jackson/core/jackson-annotations/2.16.0/jackson-annotations-2.16.0.jar && \ + # Apache POI (Excel) + wget -q https://repo1.maven.org/maven2/org/apache/poi/poi/5.2.5/poi-5.2.5.jar && \ + wget -q https://repo1.maven.org/maven2/org/apache/poi/poi-ooxml/5.2.5/poi-ooxml-5.2.5.jar && \ + wget -q https://repo1.maven.org/maven2/org/apache/poi/poi-ooxml-lite/5.2.5/poi-ooxml-lite-5.2.5.jar && \ + wget -q https://repo1.maven.org/maven2/org/apache/xmlbeans/xmlbeans/5.2.0/xmlbeans-5.2.0.jar && \ + # Apache PDFBox + wget -q https://repo1.maven.org/maven2/org/apache/pdfbox/pdfbox/3.0.1/pdfbox-3.0.1.jar && \ + wget -q https://repo1.maven.org/maven2/org/apache/pdfbox/fontbox/3.0.1/fontbox-3.0.1.jar && \ + # Google Guava + wget -q https://repo1.maven.org/maven2/com/google/guava/guava/33.0.0-jre/guava-33.0.0-jre.jar && \ + # Gson + wget -q https://repo1.maven.org/maven2/com/google/code/gson/gson/2.10.1/gson-2.10.1.jar && \ + # Joda-Time + wget -q https://repo1.maven.org/maven2/joda-time/joda-time/2.12.5/joda-time-2.12.5.jar -# Expose ports -EXPOSE 8000 443 +ENV JAVA_OPTS="-Xmx512m -Xms128m" \ + CLASSPATH="/mnt/data:/opt/java/lib/*" + +# ============================================ +# C/C++ (GCC) +# ============================================ +RUN apt-get update && apt-get install -y --no-install-recommends \ + cmake \ + # Math and science libraries + libgsl-dev libblas-dev liblapack-dev \ + # File handling libraries + libzip-dev \ + # JSON library + 
nlohmann-json3-dev \ + # CSV library + libcsv-dev \ + && rm -rf /var/lib/apt/lists/* + +ENV CC=gcc \ + CXX=g++ \ + PKG_CONFIG_PATH=/usr/lib/x86_64-linux-gnu/pkgconfig + +# ============================================ +# PHP +# ============================================ +RUN apt-get update && apt-get install -y --no-install-recommends \ + php php-cli php-common \ + php-xml php-zip php-gd php-mbstring \ + php-curl php-json \ + libonig-dev unzip \ + && rm -rf /var/lib/apt/lists/* + +# Install Composer +RUN curl -sS https://getcomposer.org/installer | php -- --install-dir=/usr/local/bin --filename=composer + +# Install PHP packages globally via Composer +ENV COMPOSER_HOME=/opt/composer/global +RUN mkdir -p /opt/composer/global && \ + composer global require \ + league/csv \ + phpoffice/phpspreadsheet \ + league/flysystem \ + intervention/image \ + ramsey/uuid \ + nesbot/carbon \ + markrogoyski/math-php \ + guzzlehttp/guzzle \ + symfony/yaml \ + symfony/console \ + --optimize-autoloader + +ENV PHP_INI_SCAN_DIR="/etc/php/8.3/cli/conf.d" + +# ============================================ +# Rust +# ============================================ +ENV RUSTUP_HOME=/usr/local/rustup \ + CARGO_HOME=/usr/local/cargo +RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | \ + sh -s -- -y --default-toolchain stable --profile minimal && \ + chmod -R a+r /usr/local/cargo /usr/local/rustup + +ENV PATH="/usr/local/cargo/bin:${PATH}" + +# Pre-compile Rust crates +COPY docker/requirements/rust-Cargo.toml /tmp/rust-cache/Cargo.toml +RUN mkdir -p /tmp/rust-cache/src && echo 'fn main() {}' > /tmp/rust-cache/src/main.rs && \ + cd /tmp/rust-cache && cargo build --release || true && \ + rm -rf /tmp/rust-cache -# Default command - use Python to run main.py which handles HTTP/HTTPS logic -CMD ["python", "-m", "src.main"] \ No newline at end of file +# ============================================ +# R +# ============================================ +RUN apt-get update && apt-get install 
-y --no-install-recommends \ + r-base r-base-dev \ + libcurl4-openssl-dev \ + libfontconfig1-dev libharfbuzz-dev libfribidi-dev \ + libtiff5-dev libjpeg-dev libcairo2-dev \ + libxt-dev libx11-dev \ + && rm -rf /var/lib/apt/lists/* + +# Install R packages +RUN R -e "options(repos = c(CRAN = 'https://cloud.r-project.org')); \ + install.packages(c( \ + 'dplyr', 'tidyr', 'data.table', 'magrittr', \ + 'ggplot2', 'lattice', 'scales', 'Cairo', \ + 'readr', 'readxl', 'writexl', 'jsonlite', 'xml2', \ + 'MASS', 'survival', 'lubridate', 'stringr', 'glue' \ + ))" + +ENV R_LIBS_USER=/usr/local/lib/R/site-library + +# ============================================ +# Fortran (gfortran) +# ============================================ +RUN apt-get update && apt-get install -y --no-install-recommends \ + gfortran \ + libnetcdf-dev libhdf5-dev \ + && rm -rf /var/lib/apt/lists/* + +ENV FORTRAN_COMPILER=gfortran \ + FC=gfortran \ + F77=gfortran \ + F90=gfortran \ + F95=gfortran + +# ============================================ +# D Language (LDC) +# ============================================ +RUN apt-get update && apt-get install -y --no-install-recommends \ + ldc \ + binutils \ + && rm -rf /var/lib/apt/lists/* + +# ============================================ +# REPL Server + entrypoint +# ============================================ +COPY docker/repl_server.py /opt/repl_server.py +COPY docker/entrypoint.sh /opt/entrypoint.sh +RUN chmod +x /opt/repl_server.py /opt/entrypoint.sh + +# ============================================ +# Sandbox directory structure +# ============================================ +RUN mkdir -p /var/lib/code-interpreter/sandboxes && \ + mkdir -p /mnt/data && \ + mkdir -p /tmp/empty_proc + +# Create non-root user for code execution (uid 1001) +RUN groupadd -g 1001 codeuser && \ + useradd -u 1001 -g codeuser -m codeuser && \ + chown -R codeuser:codeuser /mnt/data + +# ============================================ +# API installation +# 
============================================ +WORKDIR /app + +# Install API dependencies (excluding docker package) +COPY requirements.txt /tmp/requirements.txt +RUN --mount=type=cache,target=/root/.cache/pip \ + grep -v '^docker==' /tmp/requirements.txt | \ + grep -v '^requests-unixsocket==' | \ + pip install --break-system-packages --ignore-installed -r /dev/stdin && \ + rm -f /tmp/requirements.txt + +# Copy API source code and dashboard +COPY src/ /app/src/ +COPY dashboard/ /app/dashboard/ + +# ============================================ +# Final PATH and environment +# ============================================ +ENV PATH="/usr/local/cargo/bin:/usr/local/go/bin:/opt/composer/global/vendor/bin:/usr/local/bin:/usr/bin:/bin:/usr/sbin:/sbin:${PATH}" \ + SANDBOX_BASE_DIR=/var/lib/code-interpreter/sandboxes + +# Security: strip SUID/SGID bits from all binaries +RUN find / -path /proc -prune -o -path /sys -prune -o \ + \( -perm -4000 -o -perm -2000 \) -type f -exec chmod u-s,g-s {} + 2>/dev/null || true + +EXPOSE 8000 443 +CMD ["python3", "-m", "src.main"] diff --git a/README.md b/README.md index 572feaf..67bdc6c 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ [![Python Version](https://img.shields.io/badge/python-3.11%2B-blue.svg)](https://www.python.org/downloads/) [![CI Status](https://github.com/usnavy13/LibreCodeInterpreter/actions/workflows/lint.yml/badge.svg)](https://github.com/usnavy13/LibreCodeInterpreter/actions/workflows/lint.yml) -A secure, open-source code interpreter API that provides sandboxed code execution in isolated Docker containers. Compatible with LibreChat's Code Interpreter API. +A secure, open-source code interpreter API that provides sandboxed code execution using nsjail for isolation. Compatible with LibreChat's Code Interpreter API. ## Quick Start @@ -24,43 +24,20 @@ Get up and running in minutes by building the execution environment. # The default settings work out-of-the-box for local development ``` -3. 
**Prepare execution environment images** +3. **Build the unified Docker image** - You can either build the images locally (recommended) or pull pre-built images from GitHub Container Registry. - - **Option A: Build locally (Recommended)** ```bash - # Build Python only (minimal) - ./docker/build-images.sh -l python - - # Or build all 12 languages - ./docker/build-images.sh -p + docker build -t code-interpreter:nsjail . ``` - **Option B: Pull from GHCR** - ```bash - # Pull Python only - docker pull ghcr.io/usnavy13/librecodeinterpreter/python:latest - - # Or pull the API and all languages - docker pull ghcr.io/usnavy13/librecodeinterpreter:latest - for lang in python nodejs go java c-cpp php rust r fortran d; do - docker pull ghcr.io/usnavy13/librecodeinterpreter/$lang:latest - done - ``` + This builds a single image containing all 12 language runtimes and nsjail for sandboxed execution. 4. **Start the API** - **Option A: Using local images (if you built them)** ```bash docker compose up -d ``` - **Option B: Using pre-built images (if you pulled them)** - ```bash - docker compose -f docker-compose.ghcr.yml up -d - ``` - The API will be available at `http://localhost:8000`. Visit `http://localhost:8000/docs` for the interactive API documentation. @@ -72,17 +49,17 @@ A built-in admin dashboard is available at `http://localhost:8000/admin-dashboar - **Overview**: Real-time execution metrics, success rates, and performance graphs - **API Keys**: Create, view, and manage API keys with rate limiting -- **System Health**: Monitor Redis, MinIO, Docker, and container pool status +- **System Health**: Monitor Redis, MinIO, and sandbox pool status The dashboard requires the master API key for authentication. 
## Features - **Multi-language Support**: Execute code in 12 languages - Python, JavaScript, TypeScript, Go, Java, C, C++, PHP, Rust, R, Fortran, and D -- **Sub-50ms Python Execution**: Pre-warmed REPL containers achieve ~20-40ms latency for simple Python code -- **Container Pool**: Pre-warmed containers provide ~3ms acquisition time (vs 500-2000ms cold start) +- **Sub-50ms Python Execution**: Pre-warmed REPL sandboxes achieve ~20-40ms latency for simple Python code +- **Sandbox Pool**: Pre-warmed nsjail sandboxes provide ~3ms acquisition time (vs 500-2000ms cold start) - **High Concurrency**: Thread-safe execution supporting 10+ concurrent requests -- **Secure Execution**: Containerized sandboxed environments with comprehensive resource limits +- **Secure Execution**: nsjail-based sandboxed environments with namespace isolation, seccomp, and resource limits - **File Management**: Upload, download, and manage files within execution sessions - **Session Management**: Redis-based session handling with automatic cleanup - **S3-Compatible Storage**: MinIO integration for persistent file storage @@ -97,11 +74,11 @@ The dashboard requires the master API key for authentication. ## Architecture -The LibreCodeInterpreter is built with a focus on security, speed, and scalability. It uses a combination of **FastAPI** for the web layer, **Docker** for sandboxed execution, and **Redis** for session management. +The LibreCodeInterpreter is built with a focus on security, speed, and scalability. It uses a combination of **FastAPI** for the web layer, **nsjail** for sandboxed execution, and **Redis** for session management. Key features include: -- **Container Pooling**: Pre-warmed containers for sub-50ms execution. +- **Sandbox Pooling**: Pre-warmed nsjail sandboxes for sub-50ms execution. - **Stateless Execution**: Each execution is isolated and ephemeral. - **Session Persistence**: Optional state persistence for Python sessions. 
@@ -134,7 +111,7 @@ The service is highly configurable via environment variables. | **API** | Host, port, and security settings. | | **Storage** | Redis and MinIO/S3 connection details. | | **Resources** | Per-execution memory, CPU, and time limits. | -| **Pools** | Container pool sizing and warmup settings. | +| **Pools** | Sandbox pool sizing and warmup settings. | A full list of configuration options and a production checklist can be found in [CONFIGURATION.md](docs/CONFIGURATION.md). @@ -152,13 +129,13 @@ For comprehensive testing details, see [TESTING.md](docs/TESTING.md). ## Security -- All code execution happens in isolated Docker containers -- Network access is disabled by default (`network_mode: none`) -- Containers run with read-only filesystem -- All Linux capabilities are dropped (`cap_drop: ALL`) -- `/tmp` is mounted as tmpfs with `noexec,nosuid` flags -- `no-new-privileges` security option prevents privilege escalation -- Resource limits prevent CPU, memory, and process exhaustion +- All code execution happens in nsjail sandboxes with namespace isolation +- PID, mount, and network namespaces isolate each execution +- Seccomp syscall filtering restricts available system calls +- Cgroup-based resource limits prevent CPU, memory, and process exhaustion +- rlimits restrict file sizes, open file descriptors, etc. 
+- Code runs as non-root user (uid 1001) +- Read-only bind mounts for language runtimes and libraries - API key authentication protects all endpoints - Input validation prevents injection attacks diff --git a/docker-compose.ghcr.yml b/docker-compose.ghcr.yml deleted file mode 100644 index 1df2818..0000000 --- a/docker-compose.ghcr.yml +++ /dev/null @@ -1,159 +0,0 @@ -services: - # Code Interpreter API - api: - image: ghcr.io/usnavy13/librecodeinterpreter:latest - container_name: code-interpreter-api - user: "1000:988" # Run as user with docker group access - cap_add: - - NET_ADMIN # Required for iptables management when WAN access is enabled - ports: - - "${API_PORT:-8000}:8000" - - "${HTTPS_PORT:-443}:443" - env_file: - - .env - environment: - # Container-specific overrides (these override .env values) - - API_HOST=0.0.0.0 - - API_PORT=8000 - - DOCKER_IMAGE_REGISTRY=ghcr.io/usnavy13/librecodeinterpreter - - DOCKER_IMAGE_TAG=${DOCKER_IMAGE_TAG:-latest} - - # Service discovery (container names) - - REDIS_HOST=redis - - REDIS_PORT=6379 - - MINIO_ENDPOINT=minio:9000 - - # Docker socket path inside container - - DOCKER_BASE_URL=unix://var/run/docker.sock - - # SSL paths inside container - - SSL_CERT_FILE=/app/ssl/cert.pem - - SSL_KEY_FILE=/app/ssl/key.pem - volumes: - - /var/run/docker.sock:/var/run/docker.sock - - ./logs:/app/logs - - app-data:/app/data - - ${SSL_CERTS_PATH:-./ssl}:/app/ssl - depends_on: - - redis - - minio - networks: - - code-interpreter-network - restart: unless-stopped - stop_grace_period: 30s # Allow time to cleanup pooled containers - healthcheck: - test: - [ - "CMD", - "sh", - "-c", - "curl -f -k https://localhost:443/health 2>/dev/null || curl -f http://localhost:8000/health 2>/dev/null || curl -f http://localhost:80/health || exit 1", - ] - interval: 30s - timeout: 15s - retries: 3 - start_period: 15s - - # Redis for session management - redis: - image: redis:7-alpine - container_name: code-interpreter-redis - ports: - # Expose to localhost 
for CLI tools - - "127.0.0.1:6379:6379" - environment: - - REDIS_PASSWORD=${REDIS_PASSWORD:-} - command: > - sh -c " - if [ -n \"$$REDIS_PASSWORD\" ]; then - redis-server --requirepass $$REDIS_PASSWORD --appendonly yes --appendfsync everysec - else - redis-server --appendonly yes --appendfsync everysec - fi - " - volumes: - - redis-data:/data - - ./docker/redis/redis.conf:/usr/local/etc/redis/redis.conf:ro - networks: - - code-interpreter-network - restart: unless-stopped - healthcheck: - test: ["CMD", "redis-cli", "ping"] - interval: 30s - timeout: 10s - retries: 3 - - # MinIO for file storage - minio: - image: minio/minio:latest - container_name: code-interpreter-minio - ports: - # API port for local testing - - "127.0.0.1:9000:9000" - # Console only, bound to localhost (access via SSH tunnel) - - "127.0.0.1:${MINIO_CONSOLE_PORT:-9001}:9001" - environment: - - MINIO_ROOT_USER=${MINIO_ACCESS_KEY:-minioadmin} - - MINIO_ROOT_PASSWORD=${MINIO_SECRET_KEY:-minioadmin} - - MINIO_BROWSER_REDIRECT_URL=http://localhost:9001 - command: server /data --console-address ":9001" - volumes: - - minio-data:/data - networks: - - code-interpreter-network - restart: unless-stopped - healthcheck: - test: ["CMD", "curl", "-f", "http://localhost:9000/minio/health/live"] - interval: 30s - timeout: 10s - retries: 3 - - # MinIO bucket initialization - minio-init: - image: minio/mc:latest - container_name: code-interpreter-minio-init - depends_on: - - minio - environment: - - MINIO_ENDPOINT=minio:9000 - - MINIO_ACCESS_KEY=${MINIO_ACCESS_KEY:-minioadmin} - - MINIO_SECRET_KEY=${MINIO_SECRET_KEY:-minioadmin} - - MINIO_BUCKET=${MINIO_BUCKET:-code-interpreter-files} - entrypoint: > - /bin/sh -c " echo 'Waiting for MinIO to be ready...'; until mc alias set minio http://$$MINIO_ENDPOINT $$MINIO_ACCESS_KEY $$MINIO_SECRET_KEY; do - echo 'MinIO not ready, waiting...'; - sleep 2; - done; echo 'MinIO is ready. 
Creating bucket if it does not exist...'; mc mb minio/$$MINIO_BUCKET --ignore-existing; echo 'Bucket setup complete.'; " - networks: - - code-interpreter-network - -volumes: - redis-data: - driver: local - minio-data: - driver: local - app-data: - driver: local - -networks: - code-interpreter-network: - driver: bridge - ipam: - config: - - subnet: 172.20.0.0/16 - - # WAN-only network for execution containers that need internet access - # This network is only created when ENABLE_WAN_ACCESS=true - code-interpreter-wan: - driver: bridge - ipam: - config: - - subnet: 172.30.0.0/16 - driver_opts: - # Enable NAT for outbound internet access - com.docker.network.bridge.enable_ip_masquerade: "true" - # Disable inter-container communication - com.docker.network.bridge.enable_icc: "false" - labels: - com.code-interpreter.managed: "true" - com.code-interpreter.type: "wan-access" diff --git a/docker-compose.yml b/docker-compose.yml index 0df23be..61d4ca4 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,166 +1,103 @@ services: - # Code Interpreter API + # Code Interpreter API (unified image with nsjail sandboxing) api: - build: - context: . - dockerfile: Dockerfile - args: - BUILD_DATE: ${BUILD_DATE:-$(date -u +'%Y-%m-%dT%H:%M:%SZ')} - VERSION: ${VERSION:-1.0.0} - VCS_REF: ${VCS_REF:-$(git rev-parse --short HEAD)} + build: . 
+ image: code-interpreter:nsjail container_name: code-interpreter-api - user: "1000:988" # Run as user with docker group access + restart: unless-stopped + # tini init process reaps zombie nsjail/python child processes + init: true + # nsjail requires these capabilities to create namespaces and cgroups cap_add: - - NET_ADMIN # Required for iptables management when WAN access is enabled + - SYS_ADMIN + security_opt: + - apparmor:unconfined ports: - "${API_PORT:-8000}:8000" - "${HTTPS_PORT:-443}:443" env_file: - .env environment: - # Container-specific overrides (these override .env values) - - API_HOST=0.0.0.0 - - API_PORT=8000 - - # Service discovery (container names) + # Container-specific overrides (service discovery within compose network) - REDIS_HOST=redis - - REDIS_PORT=6379 - MINIO_ENDPOINT=minio:9000 - - # Docker socket path inside container - - DOCKER_BASE_URL=unix://var/run/docker.sock - - # SSL paths inside container - - SSL_CERT_FILE=/app/ssl/cert.pem - - SSL_KEY_FILE=/app/ssl/key.pem volumes: - - /var/run/docker.sock:/var/run/docker.sock - - ./logs:/app/logs - - app-data:/app/data - - ${SSL_CERTS_PATH:-./ssl}:/app/ssl - - ./dashboard:/app/dashboard - - ./src:/app/src - - ./docker:/app/docker:ro # Seccomp profile for container security + - sandbox-data:/var/lib/code-interpreter/sandboxes + - ${SSL_CERTS_PATH:-./ssl}:/app/ssl:ro + tmpfs: + - /app/data:size=100m depends_on: - - redis - - minio - networks: - - code-interpreter-network - restart: unless-stopped - stop_grace_period: 30s # Allow time to cleanup pooled containers + redis: + condition: service_healthy + minio-init: + condition: service_completed_successfully healthcheck: - test: - [ - "CMD", - "sh", - "-c", - "curl -f -k https://localhost:443/health 2>/dev/null || curl -f http://localhost:8000/health 2>/dev/null || curl -f http://localhost:80/health || exit 1", - ] + test: ["CMD", "curl", "-fsk", "https://localhost:443/health"] interval: 30s timeout: 15s retries: 3 - start_period: 15s + 
start_period: 30s + # No /var/run/docker.sock mount needed # Redis for session management redis: image: redis:7-alpine container_name: code-interpreter-redis + restart: unless-stopped ports: - # Expose to localhost for CLI tools - - "127.0.0.1:6379:6379" - environment: - - REDIS_PASSWORD=${REDIS_PASSWORD:-} + - "127.0.0.1:${REDIS_PORT:-6379}:6379" command: > - sh -c " - if [ -n \"$$REDIS_PASSWORD\" ]; then - redis-server --requirepass $$REDIS_PASSWORD --appendonly yes --appendfsync everysec - else - redis-server --appendonly yes --appendfsync everysec - fi - " + redis-server + --appendonly yes + --appendfsync everysec + --maxmemory 256mb + --maxmemory-policy allkeys-lru volumes: - redis-data:/data - - ./docker/redis/redis.conf:/usr/local/etc/redis/redis.conf:ro - networks: - - code-interpreter-network - restart: unless-stopped healthcheck: test: ["CMD", "redis-cli", "ping"] - interval: 30s - timeout: 10s - retries: 3 + interval: 10s + timeout: 5s + retries: 5 # MinIO for file storage minio: image: minio/minio:latest container_name: code-interpreter-minio + restart: unless-stopped ports: - # API port for local testing - - "127.0.0.1:9000:9000" - # Console only, bound to localhost (access via SSH tunnel) + - "127.0.0.1:${MINIO_PORT:-9000}:9000" - "127.0.0.1:${MINIO_CONSOLE_PORT:-9001}:9001" environment: - - MINIO_ROOT_USER=${MINIO_ACCESS_KEY:-minioadmin} - - MINIO_ROOT_PASSWORD=${MINIO_SECRET_KEY:-minioadmin} - - MINIO_BROWSER_REDIRECT_URL=http://localhost:9001 + MINIO_ROOT_USER: ${MINIO_ACCESS_KEY:-minioadmin} + MINIO_ROOT_PASSWORD: ${MINIO_SECRET_KEY:-minioadmin} command: server /data --console-address ":9001" volumes: - minio-data:/data - networks: - - code-interpreter-network - restart: unless-stopped healthcheck: test: ["CMD", "curl", "-f", "http://localhost:9000/minio/health/live"] - interval: 30s - timeout: 10s - retries: 3 + interval: 10s + timeout: 5s + retries: 5 # MinIO bucket initialization minio-init: image: minio/mc:latest - container_name: 
code-interpreter-minio-init depends_on: - - minio - environment: - - MINIO_ENDPOINT=minio:9000 - - MINIO_ACCESS_KEY=${MINIO_ACCESS_KEY:-minioadmin} - - MINIO_SECRET_KEY=${MINIO_SECRET_KEY:-minioadmin} - - MINIO_BUCKET=${MINIO_BUCKET:-code-interpreter-files} + minio: + condition: service_healthy entrypoint: > - /bin/sh -c " echo 'Waiting for MinIO to be ready...'; until mc alias set minio http://$$MINIO_ENDPOINT $$MINIO_ACCESS_KEY $$MINIO_SECRET_KEY; do - echo 'MinIO not ready, waiting...'; - sleep 2; - done; echo 'MinIO is ready. Creating bucket if it does not exist...'; mc mb minio/$$MINIO_BUCKET --ignore-existing; echo 'Bucket setup complete.'; " - networks: - - code-interpreter-network + /bin/sh -c " + mc alias set myminio http://minio:9000 $${MINIO_ACCESS_KEY:-minioadmin} $${MINIO_SECRET_KEY:-minioadmin}; + mc mb --ignore-existing myminio/$${MINIO_BUCKET:-code-interpreter-files}; + exit 0; + " + environment: + MINIO_ACCESS_KEY: ${MINIO_ACCESS_KEY:-minioadmin} + MINIO_SECRET_KEY: ${MINIO_SECRET_KEY:-minioadmin} + MINIO_BUCKET: ${MINIO_BUCKET:-code-interpreter-files} volumes: + sandbox-data: redis-data: - driver: local minio-data: - driver: local - app-data: - driver: local - -networks: - code-interpreter-network: - driver: bridge - ipam: - config: - - subnet: 172.20.0.0/16 - - # WAN-only network for execution containers that need internet access - # This network is only created when ENABLE_WAN_ACCESS=true - code-interpreter-wan: - driver: bridge - ipam: - config: - - subnet: 172.30.0.0/16 - driver_opts: - # Enable NAT for outbound internet access - com.docker.network.bridge.enable_ip_masquerade: "true" - # Disable inter-container communication - com.docker.network.bridge.enable_icc: "false" - labels: - com.code-interpreter.managed: "true" - com.code-interpreter.type: "wan-access" diff --git a/docker/build-images.sh b/docker/build-images.sh deleted file mode 100755 index 1d9e63b..0000000 --- a/docker/build-images.sh +++ /dev/null @@ -1,362 +0,0 @@ -#!/bin/bash -# 
Build script for Code Interpreter execution environment images -# Supports BuildKit caching and parallel builds - -set -e - -# Enable BuildKit for better caching -export DOCKER_BUILDKIT=1 -export BUILDKIT_PROGRESS=plain - -# Configuration -REGISTRY=${REGISTRY:-"code-interpreter"} -VERSION=${VERSION:-"latest"} -BUILD_DATE=$(date -u +'%Y-%m-%dT%H:%M:%SZ') -VCS_REF=$(git rev-parse --short HEAD 2>/dev/null || echo "unknown") -SINGLE_LANGUAGE="" -PARALLEL_BUILD=false - -# Colors for output -RED='\033[0;31m' -GREEN='\033[0;32m' -YELLOW='\033[1;33m' -BLUE='\033[0;34m' -CYAN='\033[0;36m' -NC='\033[0m' # No Color - -# Logging functions -log_info() { - echo -e "${BLUE}[INFO]${NC} $1" -} - -log_success() { - echo -e "${GREEN}[SUCCESS]${NC} $1" -} - -log_warning() { - echo -e "${YELLOW}[WARNING]${NC} $1" -} - -log_error() { - echo -e "${RED}[ERROR]${NC} $1" -} - -log_timing() { - echo -e "${CYAN}[TIMING]${NC} $1" -} - -# Build function with timing -build_image() { - local language=$1 - local dockerfile=$2 - local image_name="${REGISTRY}/${language}:${VERSION}" - local start_time=$(date +%s) - - log_info "Building ${language} execution environment..." 
- - if docker build \ - --file "${dockerfile}" \ - --tag "${image_name}" \ - --build-arg BUILD_DATE="${BUILD_DATE}" \ - --build-arg VERSION="${VERSION}" \ - --build-arg VCS_REF="${VCS_REF}" \ - --build-arg BUILDKIT_INLINE_CACHE=1 \ - --label "org.opencontainers.image.title=Code Interpreter ${language^} Environment" \ - --label "org.opencontainers.image.description=Secure execution environment for ${language} code" \ - --label "org.opencontainers.image.version=${VERSION}" \ - --label "org.opencontainers.image.created=${BUILD_DATE}" \ - --label "org.opencontainers.image.revision=${VCS_REF}" \ - .; then - local end_time=$(date +%s) - local duration=$((end_time - start_time)) - log_success "Built ${image_name} in ${duration}s" - return 0 - else - log_error "Failed to build ${image_name}" - return 1 - fi -} - -# Parallel build function -build_all_parallel() { - local pids=() - local languages=("python" "nodejs" "go" "java" "c-cpp" "php" "rust" "fortran" "r" "d") - local dockerfiles=("python.Dockerfile" "nodejs.Dockerfile" "go.Dockerfile" "java.Dockerfile" "c-cpp.Dockerfile" "php.Dockerfile" "rust.Dockerfile" "fortran.Dockerfile" "r.Dockerfile" "d.Dockerfile") - local log_dir="/tmp/code-interpreter-build-$$" - - mkdir -p "${log_dir}" - - log_info "Starting parallel builds for ${#languages[@]} images..." - log_info "Build logs in: ${log_dir}" - echo - - local start_time=$(date +%s) - - for i in "${!languages[@]}"; do - local lang="${languages[$i]}" - local dockerfile="${dockerfiles[$i]}" - ( - build_image "${lang}" "${dockerfile}" > "${log_dir}/${lang}.log" 2>&1 - echo $? > "${log_dir}/${lang}.exit" - ) & - pids+=($!) - log_info "Started build for ${lang} (PID: $!)" - done - - echo - log_info "Waiting for all builds to complete..." 
- - # Wait for all builds - local failed=() - for i in "${!pids[@]}"; do - local lang="${languages[$i]}" - wait "${pids[$i]}" 2>/dev/null || true - local exit_code=$(cat "${log_dir}/${lang}.exit" 2>/dev/null || echo "1") - if [ "${exit_code}" != "0" ]; then - failed+=("${lang}") - log_error "${lang} build failed" - else - log_success "${lang} build completed" - fi - done - - local end_time=$(date +%s) - local total_duration=$((end_time - start_time)) - - echo - log_timing "Total parallel build time: ${total_duration}s" - - # Show logs for failed builds - if [ ${#failed[@]} -gt 0 ]; then - echo - log_error "Failed builds: ${failed[*]}" - for lang in "${failed[@]}"; do - echo - log_error "=== ${lang} build log ===" - cat "${log_dir}/${lang}.log" - done - rm -rf "${log_dir}" - return 1 - fi - - rm -rf "${log_dir}" - log_success "All parallel builds completed in ${total_duration}s" - return 0 -} - -# Sequential build function -build_all_sequential() { - local failed_builds=() - local start_time=$(date +%s) - - # Python - if ! build_image "python" "python.Dockerfile"; then - failed_builds+=("python") - fi - - # Node.js - if ! build_image "nodejs" "nodejs.Dockerfile"; then - failed_builds+=("nodejs") - fi - - # Go - if ! build_image "go" "go.Dockerfile"; then - failed_builds+=("go") - fi - - # Java - if ! build_image "java" "java.Dockerfile"; then - failed_builds+=("java") - fi - - # C/C++ - if ! build_image "c-cpp" "c-cpp.Dockerfile"; then - failed_builds+=("c-cpp") - fi - - # PHP - if ! build_image "php" "php.Dockerfile"; then - failed_builds+=("php") - fi - - # Rust - if ! build_image "rust" "rust.Dockerfile"; then - failed_builds+=("rust") - fi - - # Fortran - if ! build_image "fortran" "fortran.Dockerfile"; then - failed_builds+=("fortran") - fi - - # R - if ! build_image "r" "r.Dockerfile"; then - failed_builds+=("r") - fi - - # D - if ! 
build_image "d" "d.Dockerfile"; then - failed_builds+=("d") - fi - - local end_time=$(date +%s) - local total_duration=$((end_time - start_time)) - - echo - log_timing "Total sequential build time: ${total_duration}s" - - if [ ${#failed_builds[@]} -gt 0 ]; then - log_error "Failed to build the following images: ${failed_builds[*]}" - return 1 - fi - - return 0 -} - -# Build single language -build_single() { - case "${SINGLE_LANGUAGE}" in - python) - build_image "python" "python.Dockerfile" - ;; - nodejs) - build_image "nodejs" "nodejs.Dockerfile" - ;; - go) - build_image "go" "go.Dockerfile" - ;; - java) - build_image "java" "java.Dockerfile" - ;; - c-cpp) - build_image "c-cpp" "c-cpp.Dockerfile" - ;; - php) - build_image "php" "php.Dockerfile" - ;; - rust) - build_image "rust" "rust.Dockerfile" - ;; - fortran) - build_image "fortran" "fortran.Dockerfile" - ;; - r) - build_image "r" "r.Dockerfile" - ;; - d) - build_image "d" "d.Dockerfile" - ;; - *) - log_error "Unknown language: ${SINGLE_LANGUAGE}" - show_help - exit 1 - ;; - esac -} - -# Main execution -main() { - log_info "Starting Code Interpreter execution environment builds..." - log_info "Registry: ${REGISTRY}" - log_info "Version: ${VERSION}" - log_info "Build Date: ${BUILD_DATE}" - log_info "VCS Ref: ${VCS_REF}" - log_info "BuildKit: enabled" - log_info "Parallel: ${PARALLEL_BUILD}" - echo - - # Change to docker directory - cd "$(dirname "$0")" - - # Build images - if [ -n "${SINGLE_LANGUAGE}" ]; then - build_single - elif [ "${PARALLEL_BUILD}" = true ]; then - build_all_parallel - else - build_all_sequential - fi - - local build_result=$? - - echo - - # Summary - if [ ${build_result} -eq 0 ]; then - log_success "All execution environment images built successfully!" 
- else - exit 1 - fi - - # List built images - echo - log_info "Built images:" - docker images "${REGISTRY}/*:${VERSION}" --format "table {{.Repository}}\t{{.Tag}}\t{{.Size}}\t{{.CreatedAt}}" -} - -# Help function -show_help() { - cat << EOF -Code Interpreter Execution Environment Builder - -Usage: $0 [OPTIONS] - -Options: - -r, --registry REGISTRY Set the Docker registry/namespace (default: code-interpreter) - -v, --version VERSION Set the image version tag (default: latest) - -l, --language LANGUAGE Build only the specified language image - -p, --parallel Build all images in parallel (faster but more resource intensive) - -h, --help Show this help message - -Supported languages: - python, nodejs, go, java, c-cpp, php, rust, fortran, r, d - -Environment Variables: - REGISTRY Docker registry/namespace - VERSION Image version tag - DOCKER_BUILDKIT BuildKit enabled by default (set to 0 to disable) - -Examples: - $0 # Build all images sequentially - $0 -p # Build all images in parallel - $0 -r myregistry -v 1.0.0 # Build with custom registry and version - $0 -l python # Build only the Python image - $0 -p -v 2.0.0 # Parallel build with version tag - REGISTRY=myregistry VERSION=1.0.0 $0 # Build with environment variables - -EOF -} - -# Parse command line arguments -while [[ $# -gt 0 ]]; do - case $1 in - -r|--registry) - REGISTRY="$2" - shift 2 - ;; - -v|--version) - VERSION="$2" - shift 2 - ;; - -l|--language) - SINGLE_LANGUAGE="$2" - shift 2 - ;; - -p|--parallel) - PARALLEL_BUILD=true - shift - ;; - -h|--help) - show_help - exit 0 - ;; - *) - log_error "Unknown option: $1" - show_help - exit 1 - ;; - esac -done - -# Run main function -main diff --git a/docker/c-cpp.Dockerfile b/docker/c-cpp.Dockerfile deleted file mode 100644 index 4939f4e..0000000 --- a/docker/c-cpp.Dockerfile +++ /dev/null @@ -1,43 +0,0 @@ -# syntax=docker/dockerfile:1.4 -# C/C++ execution environment with BuildKit optimizations -# Pin to specific version for reproducibility -FROM gcc:15-bookworm - 
-# Install essential development tools and libraries -RUN apt-get update && apt-get install -y --no-install-recommends \ - make \ - cmake \ - # Math and science libraries - libgsl-dev \ - libblas-dev \ - liblapack-dev \ - # File handling libraries - libzip-dev \ - zlib1g-dev \ - # JSON library - nlohmann-json3-dev \ - # CSV library - libcsv-dev \ - # Additional utilities - pkg-config \ - && rm -rf /var/lib/apt/lists/* - -# Create non-root user -RUN groupadd -g 1001 codeuser && \ - useradd -r -u 1001 -g codeuser codeuser - -# Set working directory and ensure ownership -WORKDIR /mnt/data -RUN chown -R codeuser:codeuser /mnt/data - -# Switch to non-root user -USER codeuser - -# Set environment variables for C/C++ development -ENV CC=gcc \ - CXX=g++ \ - PKG_CONFIG_PATH=/usr/lib/x86_64-linux-gnu/pkgconfig - -# Default command with sanitized environment -ENTRYPOINT ["/usr/bin/env","-i","PATH=/usr/local/bin:/usr/bin:/bin","HOME=/tmp","TMPDIR=/tmp","CC=gcc","CXX=g++","PKG_CONFIG_PATH=/usr/lib/x86_64-linux-gnu/pkgconfig"] -CMD ["/bin/bash"] diff --git a/docker/d.Dockerfile b/docker/d.Dockerfile deleted file mode 100644 index a564ca3..0000000 --- a/docker/d.Dockerfile +++ /dev/null @@ -1,32 +0,0 @@ -# syntax=docker/dockerfile:1.4 -# D execution environment with BuildKit optimizations -FROM ubuntu:24.04 - -ARG BUILD_DATE -ARG VERSION -ARG VCS_REF - -LABEL org.opencontainers.image.title="Code Interpreter D Environment" \ - org.opencontainers.image.description="Secure execution environment for D (ldc2) code" \ - org.opencontainers.image.version="${VERSION}" \ - org.opencontainers.image.created="${BUILD_DATE}" \ - org.opencontainers.image.revision="${VCS_REF}" - -# Install toolchain (ldc2) and basics; works on amd64 and arm64 -RUN apt-get update && apt-get install -y --no-install-recommends \ - ca-certificates curl wget xz-utils git \ - build-essential make binutils \ - ldc \ - && rm -rf /var/lib/apt/lists/* - -# Create non-root user (uid:1001) consistent with other images -RUN 
useradd -m -u 1001 runner && mkdir -p /mnt/data && chown -R runner:runner /mnt/data - -WORKDIR /mnt/data - -# Switch to non-root user -USER 1001:1001 - -# Default command with sanitized environment -ENTRYPOINT ["/usr/bin/env","-i","PATH=/usr/local/bin:/usr/bin:/bin","HOME=/tmp","TMPDIR=/tmp"] -CMD ["ldc2", "--version"] diff --git a/docker/fortran.Dockerfile b/docker/fortran.Dockerfile deleted file mode 100644 index a9d160c..0000000 --- a/docker/fortran.Dockerfile +++ /dev/null @@ -1,47 +0,0 @@ -# syntax=docker/dockerfile:1.4 -# Fortran execution environment with BuildKit optimizations -FROM ubuntu:24.04 - -# Prevent interactive prompts during package installation -ENV DEBIAN_FRONTEND=noninteractive - -# Install system dependencies and Fortran compiler -RUN apt-get update && apt-get install -y --no-install-recommends \ - gfortran-12 \ - gcc \ - g++ \ - make \ - cmake \ - libblas-dev \ - liblapack-dev \ - libnetcdf-dev \ - libhdf5-dev \ - build-essential \ - && apt-get clean \ - && rm -rf /var/lib/apt/lists/* - -# Set gfortran-12 as the default fortran compiler -RUN update-alternatives --install /usr/bin/gfortran gfortran /usr/bin/gfortran-12 100 \ - && update-alternatives --install /usr/bin/f95 f95 /usr/bin/gfortran-12 100 - -# Create non-root user -RUN groupadd -g 1001 codeuser && \ - useradd -r -u 1001 -g codeuser codeuser - -# Set working directory and ensure ownership -WORKDIR /mnt/data -RUN chown -R codeuser:codeuser /mnt/data - -# Switch to non-root user -USER codeuser - -# Set environment variables -ENV FORTRAN_COMPILER=gfortran \ - FC=gfortran \ - F77=gfortran \ - F90=gfortran \ - F95=gfortran - -# Default command with sanitized environment -ENTRYPOINT ["/usr/bin/env","-i","PATH=/usr/local/bin:/usr/bin:/bin","HOME=/tmp","TMPDIR=/tmp","FORTRAN_COMPILER=gfortran","FC=gfortran","F77=gfortran","F90=gfortran","F95=gfortran"] -CMD ["gfortran", "--version"] diff --git a/docker/go.Dockerfile b/docker/go.Dockerfile deleted file mode 100644 index bd41361..0000000 --- 
a/docker/go.Dockerfile +++ /dev/null @@ -1,41 +0,0 @@ -# syntax=docker/dockerfile:1.4 -# Go execution environment with BuildKit optimizations -FROM golang:1.26-alpine - -# Install common tools -RUN apk add --no-cache \ - git \ - make \ - gcc \ - musl-dev - -# Copy go.mod for pre-downloading -COPY requirements/go.mod /tmp/gosetup/go.mod - -# Pre-download common Go packages with cache mount -RUN --mount=type=cache,target=/go/pkg/mod \ - cd /tmp/gosetup && \ - go mod download && \ - cd / && rm -rf /tmp/gosetup - -# Create non-root user -RUN addgroup -g 1001 -S codeuser && \ - adduser -S codeuser -u 1001 -G codeuser - -# Set working directory -WORKDIR /mnt/data - -# Ensure ownership of working directory -RUN chown -R codeuser:codeuser /mnt/data - -# Switch to non-root user -USER codeuser - -# Set environment variables -ENV GO111MODULE=on \ - GOPROXY=https://proxy.golang.org,direct \ - GOSUMDB=sum.golang.org - -# Default command with sanitized environment -ENTRYPOINT ["/usr/bin/env","-i","PATH=/usr/local/go/bin:/usr/local/bin:/usr/bin:/bin","HOME=/tmp","TMPDIR=/tmp","GO111MODULE=on","GOPROXY=https://proxy.golang.org,direct","GOSUMDB=sum.golang.org","GOCACHE=/tmp/go-build"] -CMD ["go"] diff --git a/docker/java.Dockerfile b/docker/java.Dockerfile deleted file mode 100644 index 61af866..0000000 --- a/docker/java.Dockerfile +++ /dev/null @@ -1,60 +0,0 @@ -# syntax=docker/dockerfile:1.4 -# Java execution environment with BuildKit optimizations -FROM eclipse-temurin:25-jdk - -# Install common tools -RUN apt-get update && apt-get install -y --no-install-recommends \ - curl \ - wget \ - && rm -rf /var/lib/apt/lists/* - -# Create library directory -RUN mkdir -p /opt/java/lib - -# Download all JARs in a single layer (reduces layers, faster builds) -RUN cd /opt/java/lib && \ - # Apache Commons - wget -q https://repo1.maven.org/maven2/org/apache/commons/commons-csv/1.10.0/commons-csv-1.10.0.jar && \ - wget -q 
https://repo1.maven.org/maven2/org/apache/commons/commons-lang3/3.14.0/commons-lang3-3.14.0.jar && \ - wget -q https://repo1.maven.org/maven2/org/apache/commons/commons-math3/3.6.1/commons-math3-3.6.1.jar && \ - wget -q https://repo1.maven.org/maven2/org/apache/commons/commons-collections4/4.4/commons-collections4-4.4.jar && \ - wget -q https://repo1.maven.org/maven2/org/apache/commons/commons-compress/1.25.0/commons-compress-1.25.0.jar && \ - wget -q https://repo1.maven.org/maven2/org/apache/commons/commons-text/1.11.0/commons-text-1.11.0.jar && \ - # Jackson JSON - wget -q https://repo1.maven.org/maven2/com/fasterxml/jackson/core/jackson-core/2.16.0/jackson-core-2.16.0.jar && \ - wget -q https://repo1.maven.org/maven2/com/fasterxml/jackson/core/jackson-databind/2.16.0/jackson-databind-2.16.0.jar && \ - wget -q https://repo1.maven.org/maven2/com/fasterxml/jackson/core/jackson-annotations/2.16.0/jackson-annotations-2.16.0.jar && \ - # Apache POI (Excel) - wget -q https://repo1.maven.org/maven2/org/apache/poi/poi/5.2.5/poi-5.2.5.jar && \ - wget -q https://repo1.maven.org/maven2/org/apache/poi/poi-ooxml/5.2.5/poi-ooxml-5.2.5.jar && \ - wget -q https://repo1.maven.org/maven2/org/apache/poi/poi-ooxml-lite/5.2.5/poi-ooxml-lite-5.2.5.jar && \ - wget -q https://repo1.maven.org/maven2/org/apache/xmlbeans/xmlbeans/5.2.0/xmlbeans-5.2.0.jar && \ - # Apache PDFBox - wget -q https://repo1.maven.org/maven2/org/apache/pdfbox/pdfbox/3.0.1/pdfbox-3.0.1.jar && \ - wget -q https://repo1.maven.org/maven2/org/apache/pdfbox/fontbox/3.0.1/fontbox-3.0.1.jar && \ - # Google Guava - wget -q https://repo1.maven.org/maven2/com/google/guava/guava/33.0.0-jre/guava-33.0.0-jre.jar && \ - # NEW: Gson (alternative JSON) - wget -q https://repo1.maven.org/maven2/com/google/code/gson/gson/2.10.1/gson-2.10.1.jar && \ - # NEW: Joda-Time - wget -q https://repo1.maven.org/maven2/joda-time/joda-time/2.12.5/joda-time-2.12.5.jar - -# Create non-root user -RUN groupadd -r codeuser && useradd -r -g codeuser 
codeuser - -# Set working directory -WORKDIR /mnt/data - -# Ensure ownership of working directory -RUN chown -R codeuser:codeuser /mnt/data - -# Switch to non-root user -USER codeuser - -# Set environment variables with updated CLASSPATH -ENV JAVA_OPTS="-Xmx512m -Xms128m" \ - CLASSPATH="/mnt/data:/opt/java/lib/*" - -# Default command with sanitized environment (include Java bin path) -ENTRYPOINT ["/usr/bin/env","-i","PATH=/opt/java/openjdk/bin:/usr/local/bin:/usr/bin:/bin","HOME=/tmp","TMPDIR=/tmp","CLASSPATH=/mnt/data:/opt/java/lib/*","JAVA_OPTS=-Xmx512m -Xms128m"] -CMD ["java", "--version"] diff --git a/docker/nodejs.Dockerfile b/docker/nodejs.Dockerfile deleted file mode 100644 index 74f7b62..0000000 --- a/docker/nodejs.Dockerfile +++ /dev/null @@ -1,42 +0,0 @@ -# syntax=docker/dockerfile:1.4 -# Node.js execution environment with BuildKit optimizations -FROM node:25-alpine - -# Install common build tools -RUN apk add --no-cache \ - python3 \ - make \ - g++ \ - git - -# Copy package list -COPY requirements/nodejs.txt /tmp/nodejs.txt - -# Install packages with cache mount -# Read packages from file and install globally -RUN --mount=type=cache,target=/root/.npm \ - cat /tmp/nodejs.txt | grep -v '^#' | grep -v '^$' | xargs npm install -g - -# Clean up -RUN rm -f /tmp/nodejs.txt - -# Create non-root user -RUN addgroup -g 1001 -S codeuser && \ - adduser -S codeuser -u 1001 -G codeuser - -# Set working directory -WORKDIR /mnt/data - -# Ensure ownership of working directory -RUN chown -R codeuser:codeuser /mnt/data - -# Switch to non-root user -USER codeuser - -# Set environment variables -ENV NODE_ENV=sandbox \ - NODE_PATH=/usr/local/lib/node_modules - -# Default command with sanitized environment -ENTRYPOINT ["/usr/bin/env","-i","PATH=/usr/local/bin:/usr/bin:/bin","HOME=/tmp","TMPDIR=/tmp","NODE_PATH=/usr/local/lib/node_modules"] -CMD ["node"] diff --git a/docker/nsjail-base.cfg b/docker/nsjail-base.cfg new file mode 100644 index 0000000..c7bb713 --- /dev/null +++ 
b/docker/nsjail-base.cfg @@ -0,0 +1,231 @@ +# nsjail base configuration for code execution sandboxing +# +# This file documents the default sandbox policy used by LibreCodeInterpreter. +# Runtime configuration is generated programmatically by +# src/services/sandbox/nsjail.py via CLI arguments. This file serves as a +# reference for the security posture and can be used directly with nsjail +# for testing/debugging. + +name: "code-interpreter-sandbox" +description: "Sandboxed code execution environment" + +# Execute command once and exit +mode: ONCE + +# ============================================ +# Namespace isolation +# ============================================ +clone_newuser: true +clone_newnet: true +clone_newns: true +clone_newpid: true +clone_newipc: true +clone_newuts: true +clone_newcgroup: true + +# ============================================ +# Resource limits +# ============================================ + +# Virtual memory limit: type HARD inherits the current hard RLIMIT_AS; the rlimit_as value (MiB) below only applies with type VALUE +rlimit_as_type: HARD +rlimit_as: 512 + +# Max output file size (MB) +rlimit_fsize: 100 + +# Max number of open file descriptors +rlimit_nofile: 256 + +# Execution time limit (seconds) +time_limit: 30 + +# Max number of processes/threads +cgroup_pids_max: 64 + +# Memory cgroup limit (bytes) — 512 MB +cgroup_mem_max: 536870912 + +# ============================================ +# Security settings +# ============================================ + +# Drop all capabilities +keep_caps: false + +# Hostname visible inside sandbox +hostname: "sandbox" + +# Disable /proc mount for security +disable_proc: true + +# Run as non-root inside sandbox +uid_mapping { + inside_id: "1001" + outside_id: "1001" + count: 1 +} + +gid_mapping { + inside_id: "1001" + outside_id: "1001" + count: 1 +} + +# ============================================ +# Filesystem mounts (read-only system paths) +# ============================================ + +# System binaries and libraries +mount { + src: "/usr" + dst: "/usr" + 
is_bind: true + rw: false +} + +mount { + src: "/lib" + dst: "/lib" + is_bind: true + rw: false +} + +mount { + src: "/lib64" + dst: "/lib64" + is_bind: true + rw: false + mandatory: false +} + +mount { + src: "/bin" + dst: "/bin" + is_bind: true + rw: false +} + +mount { + src: "/sbin" + dst: "/sbin" + is_bind: true + rw: false +} + +# SSL certificates +mount { + src: "/etc/ssl" + dst: "/etc/ssl" + is_bind: true + rw: false + mandatory: false +} + +mount { + src: "/etc/alternatives" + dst: "/etc/alternatives" + is_bind: true + rw: false + mandatory: false +} + +# Timezone data +mount { + src: "/usr/share/zoneinfo" + dst: "/usr/share/zoneinfo" + is_bind: true + rw: false + mandatory: false +} + +# Language-specific paths (read-only) +# Go +mount { + src: "/usr/local/go" + dst: "/usr/local/go" + is_bind: true + rw: false + mandatory: false +} + +# Rust/Cargo +mount { + src: "/usr/local/cargo" + dst: "/usr/local/cargo" + is_bind: true + rw: false + mandatory: false +} + +mount { + src: "/usr/local/rustup" + dst: "/usr/local/rustup" + is_bind: true + rw: false + mandatory: false +} + +# Go module cache +mount { + src: "/usr/local/gopath" + dst: "/usr/local/gopath" + is_bind: true + rw: false + mandatory: false +} + +# Java libraries +mount { + src: "/opt/java" + dst: "/opt/java" + is_bind: true + rw: false + mandatory: false +} + +# PHP Composer packages +mount { + src: "/opt/composer" + dst: "/opt/composer" + is_bind: true + rw: false + mandatory: false +} + +# Node.js global modules +mount { + src: "/usr/local/lib/node_modules" + dst: "/usr/local/lib/node_modules" + is_bind: true + rw: false + mandatory: false +} + +# REPL server +mount { + src: "/opt/repl_server.py" + dst: "/opt/repl_server.py" + is_bind: true + rw: false + mandatory: false +} + +# Writable tmpfs for temporary files +mount { + dst: "/tmp" + fstype: "tmpfs" + rw: true + options: "size=104857600" +} + +# Working directory — bound at runtime to the sandbox data dir +# mount { +# src: "" +# dst: 
"/mnt/data" +# is_bind: true +# rw: true +# } + +# Default working directory +cwd: "/mnt/data" diff --git a/docker/php.Dockerfile b/docker/php.Dockerfile deleted file mode 100644 index 8db5c8b..0000000 --- a/docker/php.Dockerfile +++ /dev/null @@ -1,72 +0,0 @@ -# syntax=docker/dockerfile:1.4 -# PHP execution environment with BuildKit optimizations -FROM php:8.5-cli - -# Install system dependencies and PHP extensions -RUN apt-get update && apt-get install -y --no-install-recommends \ - libzip-dev \ - libpng-dev \ - libjpeg-dev \ - libfreetype6-dev \ - libonig-dev \ - libxml2-dev \ - unzip \ - git \ - && docker-php-ext-configure gd --with-freetype --with-jpeg \ - && docker-php-ext-install -j$(nproc) \ - xml \ - zip \ - gd \ - mbstring \ - && apt-get clean \ - && rm -rf /var/lib/apt/lists/* - -# Install Composer -RUN curl -sS https://getcomposer.org/installer | php -- --install-dir=/usr/local/bin --filename=composer - -# Create non-root user -RUN groupadd -g 1001 codeuser && \ - useradd -r -u 1001 -g codeuser codeuser - -# Create global composer directory and set permissions -RUN mkdir -p /opt/composer/global && \ - chown -R codeuser:codeuser /opt/composer - -# Switch to non-root user for package installation -USER codeuser - -# Set composer home directory -ENV COMPOSER_HOME=/opt/composer/global - -# Pre-install PHP packages globally with cache mount -RUN --mount=type=cache,target=/opt/composer/global/cache,uid=1001,gid=1001 \ - composer global require \ - league/csv \ - phpoffice/phpspreadsheet \ - league/flysystem \ - intervention/image \ - ramsey/uuid \ - nesbot/carbon \ - markrogoyski/math-php \ - guzzlehttp/guzzle \ - symfony/yaml \ - symfony/console \ - --optimize-autoloader - -# Switch back to root to set up directories and final permissions -USER root - -# Set working directory and ensure ownership -WORKDIR /mnt/data -RUN chown -R codeuser:codeuser /mnt/data - -# Switch to non-root user for execution -USER codeuser - -# Set environment variables -ENV 
PATH="/opt/composer/global/vendor/bin:${PATH}" \ - PHP_INI_SCAN_DIR="/usr/local/etc/php/conf.d" - -# Default command with sanitized environment -ENTRYPOINT ["/usr/bin/env","-i","PATH=/opt/composer/global/vendor/bin:/usr/local/bin:/usr/bin:/bin","HOME=/tmp","TMPDIR=/tmp","COMPOSER_HOME=/opt/composer/global","PHP_INI_SCAN_DIR=/usr/local/etc/php/conf.d"] -CMD ["php", "-a"] diff --git a/docker/python.Dockerfile b/docker/python.Dockerfile deleted file mode 100644 index 207a6c6..0000000 --- a/docker/python.Dockerfile +++ /dev/null @@ -1,120 +0,0 @@ -# syntax=docker/dockerfile:1.4 -# Python execution environment with BuildKit optimizations -FROM python:3.14-slim - -# Install common packages for data science and general use -RUN apt-get update && apt-get install -y --no-install-recommends \ - gcc \ - g++ \ - make \ - pkg-config \ - libxml2-dev \ - libxslt-dev \ - libffi-dev \ - libcairo2-dev \ - libpango1.0-dev \ - libgdk-pixbuf-2.0-dev \ - libssl-dev \ - libjpeg-dev \ - libpng-dev \ - libtiff-dev \ - libopenjp2-7-dev \ - libfreetype6-dev \ - liblcms2-dev \ - libwebp-dev \ - tcl8.6-dev \ - tk8.6-dev \ - python3-tk \ - python3-dev \ - poppler-utils \ - tesseract-ocr \ - pandoc \ - portaudio19-dev \ - flac \ - ffmpeg \ - libpulse-dev \ - libsdl2-dev \ - libsdl2-mixer-dev \ - libsdl2-image-dev \ - libsdl2-ttf-dev \ - antiword \ - unrtf \ - && rm -rf /var/lib/apt/lists/* - -# Configure pip and build tools -ENV PIP_NO_BUILD_ISOLATION=1 \ - PIP_DISABLE_PIP_VERSION_CHECK=1 - -# Install pip and build tooling with compatible versions -RUN --mount=type=cache,target=/root/.cache/pip \ - python -m pip install \ - "pip<24.1" \ - "setuptools<70" \ - wheel \ - "packaging<24" - -# Copy requirements files -COPY requirements/python-core.txt /tmp/python-core.txt -COPY requirements/python-analysis.txt /tmp/python-analysis.txt -COPY requirements/python-visualization.txt /tmp/python-visualization.txt -COPY requirements/python-documents.txt /tmp/python-documents.txt -COPY 
requirements/python-utilities.txt /tmp/python-utilities.txt -COPY requirements/python-new.txt /tmp/python-new.txt - -# Layer 1: Core data packages (most stable, rarely changes) -RUN --mount=type=cache,target=/root/.cache/pip \ - pip install -r /tmp/python-core.txt - -# Layer 2: Analysis packages (math, science, ML) -RUN --mount=type=cache,target=/root/.cache/pip \ - pip install -r /tmp/python-analysis.txt - -# Layer 3: Visualization packages -RUN --mount=type=cache,target=/root/.cache/pip \ - pip install -r /tmp/python-visualization.txt - -# Layer 4: Document processing packages -RUN --mount=type=cache,target=/root/.cache/pip \ - pip install -r /tmp/python-documents.txt - -# Layer 5: Utility packages -RUN --mount=type=cache,target=/root/.cache/pip \ - pip install -r /tmp/python-utilities.txt - -# Layer 6: NEW packages (changes most frequently) -RUN --mount=type=cache,target=/root/.cache/pip \ - pip install -r /tmp/python-new.txt - -# Clean up requirements files -RUN rm -f /tmp/python-*.txt - -# Create non-root user -RUN groupadd -r codeuser && useradd -r -g codeuser codeuser - -# Set working directory -WORKDIR /mnt/data - -# Ensure ownership of working directory -RUN chown -R codeuser:codeuser /mnt/data - -# Add REPL server and entrypoint scripts -COPY repl_server.py /opt/repl_server.py -COPY entrypoint.sh /opt/entrypoint.sh -RUN chmod +x /opt/repl_server.py /opt/entrypoint.sh - -# Ensure /opt is accessible -RUN chown -R codeuser:codeuser /opt - -# Switch to non-root user -USER codeuser - -# Set environment variables -ENV PYTHONUNBUFFERED=1 \ - PYTHONDONTWRITEBYTECODE=1 \ - PYTHONPATH=/mnt/data - -# Use the entrypoint script which handles REPL mode -# REPL_MODE=true -> runs repl_server.py -# REPL_MODE unset/false -> runs default command -ENTRYPOINT ["/opt/entrypoint.sh"] -CMD ["tail", "-f", "/dev/null"] diff --git a/docker/r.Dockerfile b/docker/r.Dockerfile deleted file mode 100644 index 116c358..0000000 --- a/docker/r.Dockerfile +++ /dev/null @@ -1,49 +0,0 @@ -# 
syntax=docker/dockerfile:1.4 -# R execution environment with BuildKit optimizations -FROM r-base:4.4.3 - -# Install system dependencies for R packages (including Cairo) -RUN apt-get update && apt-get install -y --no-install-recommends \ - libcurl4-openssl-dev \ - libssl-dev \ - libxml2-dev \ - libfontconfig1-dev \ - libharfbuzz-dev \ - libfribidi-dev \ - libfreetype6-dev \ - libpng-dev \ - libtiff5-dev \ - libjpeg-dev \ - libcairo2-dev \ - libxt-dev \ - libx11-dev \ - && rm -rf /var/lib/apt/lists/* - -# Install all R packages in a single layer using Posit Package Manager -# - amd64: Downloads pre-compiled binaries (~5 min) -# - arm64: Compiles from source but single layer avoids redundant dependency builds -RUN R -e "options(repos = c(CRAN = 'https://packagemanager.posit.co/cran/__linux__/bookworm/latest')); \ - install.packages(c( \ - 'dplyr', 'tidyr', 'data.table', 'magrittr', \ - 'ggplot2', 'lattice', 'scales', 'Cairo', \ - 'readr', 'readxl', 'writexl', 'jsonlite', 'xml2', \ - 'MASS', 'survival', 'lubridate', 'stringr', 'glue' \ - ))" - -# Create non-root user -RUN groupadd -g 1001 codeuser && \ - useradd -r -u 1001 -g codeuser codeuser - -# Set working directory and ensure ownership -WORKDIR /mnt/data -RUN chown -R codeuser:codeuser /mnt/data - -# Switch to non-root user -USER codeuser - -# Set environment variables -ENV R_LIBS_USER=/usr/local/lib/R/site-library - -# Default command with sanitized environment -ENTRYPOINT ["/usr/bin/env","-i","PATH=/usr/local/bin:/usr/bin:/bin","HOME=/tmp","TMPDIR=/tmp","R_LIBS_USER=/usr/local/lib/R/site-library"] -CMD ["R", "--version"] diff --git a/docker/rust.Dockerfile b/docker/rust.Dockerfile deleted file mode 100644 index 0eef6f8..0000000 --- a/docker/rust.Dockerfile +++ /dev/null @@ -1,50 +0,0 @@ -# syntax=docker/dockerfile:1.4 -# Rust execution environment with BuildKit optimizations -FROM rust:1.93-slim - -# Install system dependencies -RUN apt-get update && apt-get install -y --no-install-recommends \ - build-essential 
\ - pkg-config \ - libssl-dev \ - libfontconfig1-dev \ - libfreetype6-dev \ - && rm -rf /var/lib/apt/lists/* - -# Create a temporary project to pre-compile and cache crates -WORKDIR /tmp/rust-cache - -# Copy Cargo.toml for crate caching -COPY requirements/rust-Cargo.toml Cargo.toml - -# Create minimal src/main.rs (cargo init would fail since Cargo.toml exists) -RUN mkdir -p src && echo 'fn main() {}' > src/main.rs - -# Pre-compile crates with cache mounts -RUN --mount=type=cache,target=/usr/local/cargo/registry \ - --mount=type=cache,target=/tmp/rust-cache/target \ - cargo build --release || true - -# Clean up the temporary project but keep the cargo cache -WORKDIR / -RUN rm -rf /tmp/rust-cache - -# Create non-root user -RUN groupadd -g 1001 codeuser && \ - useradd -r -u 1001 -g codeuser codeuser - -# Set working directory and ensure ownership -WORKDIR /mnt/data -RUN chown -R codeuser:codeuser /mnt/data - -# Switch to non-root user -USER codeuser - -# Set environment variables -ENV CARGO_HOME=/usr/local/cargo \ - RUSTUP_HOME=/usr/local/rustup \ - PATH=/usr/local/cargo/bin:$PATH - -# Default command with sanitized environment -ENTRYPOINT ["/usr/bin/env","-i","PATH=/usr/local/cargo/bin:/usr/local/bin:/usr/bin:/bin","HOME=/tmp","TMPDIR=/tmp","CARGO_HOME=/usr/local/cargo","RUSTUP_HOME=/usr/local/rustup"] -CMD ["rustc", "--version"] diff --git a/docker/seccomp-sandbox.json b/docker/seccomp-sandbox.json deleted file mode 100644 index ee476ad..0000000 --- a/docker/seccomp-sandbox.json +++ /dev/null @@ -1,176 +0,0 @@ -{ - "defaultAction": "SCMP_ACT_ALLOW", - "defaultErrnoRet": 1, - "architectures": [ - "SCMP_ARCH_X86_64", - "SCMP_ARCH_X86", - "SCMP_ARCH_X32", - "SCMP_ARCH_AARCH64", - "SCMP_ARCH_ARM" - ], - "syscalls": [ - { - "names": [ - "ptrace" - ], - "action": "SCMP_ACT_ERRNO", - "errnoRet": 1, - "comment": "Block process tracing - caused container hang with PTRACE_TRACEME" - }, - { - "names": [ - "process_vm_readv", - "process_vm_writev" - ], - "action": 
"SCMP_ACT_ERRNO", - "errnoRet": 1, - "comment": "Block cross-process memory access" - }, - { - "names": [ - "personality" - ], - "action": "SCMP_ACT_ERRNO", - "errnoRet": 1, - "comment": "Block ASLR disabling" - }, - { - "names": [ - "mount", - "umount", - "umount2" - ], - "action": "SCMP_ACT_ERRNO", - "errnoRet": 1, - "comment": "Block filesystem manipulation (defense-in-depth, also blocked by CAP_SYS_ADMIN)" - }, - { - "names": [ - "pivot_root", - "chroot" - ], - "action": "SCMP_ACT_ERRNO", - "errnoRet": 1, - "comment": "Block container escape vectors (defense-in-depth, also blocked by CAP_SYS_CHROOT)" - }, - { - "names": [ - "reboot", - "kexec_load", - "kexec_file_load" - ], - "action": "SCMP_ACT_ERRNO", - "errnoRet": 1, - "comment": "Block system disruption (defense-in-depth, also blocked by CAP_SYS_BOOT)" - }, - { - "names": [ - "init_module", - "finit_module", - "delete_module" - ], - "action": "SCMP_ACT_ERRNO", - "errnoRet": 1, - "comment": "Block kernel module manipulation (defense-in-depth, also blocked by CAP_SYS_MODULE)" - }, - { - "names": [ - "acct" - ], - "action": "SCMP_ACT_ERRNO", - "errnoRet": 1, - "comment": "Block process accounting manipulation (defense-in-depth, also blocked by CAP_SYS_PACCT)" - }, - { - "names": [ - "swapon", - "swapoff" - ], - "action": "SCMP_ACT_ERRNO", - "errnoRet": 1, - "comment": "Block swap manipulation (defense-in-depth, also blocked by CAP_SYS_ADMIN)" - }, - { - "names": [ - "sethostname", - "setdomainname" - ], - "action": "SCMP_ACT_ERRNO", - "errnoRet": 1, - "comment": "Block host identity changes (defense-in-depth, also blocked by CAP_SYS_ADMIN)" - }, - { - "names": [ - "clock_settime", - "clock_adjtime", - "settimeofday", - "adjtimex" - ], - "action": "SCMP_ACT_ERRNO", - "errnoRet": 1, - "comment": "Block time manipulation (defense-in-depth, also blocked by CAP_SYS_TIME)" - }, - { - "names": [ - "iopl", - "ioperm" - ], - "action": "SCMP_ACT_ERRNO", - "errnoRet": 1, - "comment": "Block direct I/O port access 
(defense-in-depth, also blocked by CAP_SYS_RAWIO)" - }, - { - "names": [ - "create_module", - "get_kernel_syms", - "query_module" - ], - "action": "SCMP_ACT_ERRNO", - "errnoRet": 1, - "comment": "Block legacy kernel module syscalls" - }, - { - "names": [ - "unshare", - "setns" - ], - "action": "SCMP_ACT_ERRNO", - "errnoRet": 1, - "comment": "Block namespace manipulation" - }, - { - "names": [ - "userfaultfd" - ], - "action": "SCMP_ACT_ERRNO", - "errnoRet": 1, - "comment": "Block userfaultfd - can be used in exploits" - }, - { - "names": [ - "bpf" - ], - "action": "SCMP_ACT_ERRNO", - "errnoRet": 1, - "comment": "Block BPF - powerful and often exploited" - }, - { - "names": [ - "perf_event_open" - ], - "action": "SCMP_ACT_ERRNO", - "errnoRet": 1, - "comment": "Block perf events - can leak kernel info" - }, - { - "names": [ - "add_key", - "keyctl", - "request_key" - ], - "action": "SCMP_ACT_ERRNO", - "errnoRet": 1, - "comment": "Block kernel keyring manipulation" - } - ] -} diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md index dd3f9aa..188e67c 100644 --- a/docs/ARCHITECTURE.md +++ b/docs/ARCHITECTURE.md @@ -6,7 +6,8 @@ This document provides a comprehensive overview of the Code Interpreter API arch ``` ┌─────────────────────────────────────────────────────────────┐ - │ Code Interpreter API │ + │ Code Interpreter API Container │ + │ (single unified image with nsjail) │ │ │ ┌──────────┐ HTTPS/443 │ ┌─────────────┐ ┌─────────────────────────────────┐ │ │ Client │ ──────────────────────▶ │ FastAPI │───▶│ ExecutionOrchestrator │ │ @@ -18,44 +19,45 @@ This document provides a comprehensive overview of the Code Interpreter API arch │ ┌─────────────┐ ┌─────────────────────────────────┐ │ │ │ Middleware │ │ Services │ │ │ │ - Auth │ │ ┌─────────┐ ┌─────────────┐ │ │ - │ │ - Headers │ │ │Container│ │ Execution │ │ │ + │ │ - Headers │ │ │ Sandbox │ │ Execution │ │ │ │ │ - Logging │ │ │ Pool │ │ Runner │ │ │ │ │ - Metrics │ │ └────┬────┘ └──────┬──────┘ │ │ │ 
└─────────────┘ │ │ │ │ │ │ │ ▼ ▼ │ │ │ │ ┌──────────────────────────┐ │ │ - │ │ │ Container Manager │ │ │ + │ │ │ Sandbox Manager │ │ │ │ │ │ + REPL Executor │ │ │ + │ │ │ → nsjail (isolation) │ │ │ │ │ └──────────────────────────┘ │ │ │ └─────────────────────────────────┘ │ └────────────────────────────────┬──────────────────────────┘ │ - ┌──────────────────────────────────────────┼──────────────────────────────┐ - │ │ │ - ▼ ▼ ▼ - ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ - │ Redis │ │ Docker │ │ MinIO │ - │ │ │ Engine │ │ (S3-API) │ - │ - Sessions │ │ │ │ │ - │ - State │ │ ┌──────────┐ │ │ - Files │ - │ - Caching │ │ │Container │ │ │ - State │ - │ │ │ │ Pool │ │ │ Archives │ - └──────────────┘ │ └──────────┘ │ └──────────────┘ - └──────────────┘ + ┌──────────────────────┴──────────────────────┐ + │ │ + ▼ ▼ + ┌──────────────┐ ┌──────────────┐ + │ Redis │ │ MinIO │ + │ │ │ (S3-API) │ + │ - Sessions │ │ │ + │ - State │ │ - Files │ + │ - Caching │ │ - State │ + │ │ │ Archives │ + └──────────────┘ └──────────────┘ ``` +**Key architectural change:** The API, all language runtimes, and nsjail run inside a single Docker container. Code execution is isolated via nsjail sandboxes (PID/mount/network namespaces, seccomp, cgroups) rather than separate Docker containers. No Docker socket is mounted. + ## Core Components ### 1. 
API Layer (`src/api/`) The API layer contains thin endpoint handlers that delegate to the orchestrator: -| File | Purpose | -| ----------- | ----------------------------------------------------------------------- | -| `exec.py` | Code execution endpoint, delegates to `ExecutionOrchestrator` | -| `files.py` | File upload, download, list, and delete operations | -| `state.py` | Python state download, upload, info, and delete for client-side caching | -| `health.py` | Health checks and metrics endpoints | +| File | Purpose | +| ----------- | ------------------------------------------------------------- | +| `exec.py` | Code execution endpoint, delegates to `ExecutionOrchestrator` | +| `files.py` | File upload, download, and list operations | +| `health.py` | Health checks and metrics endpoints | **Design principle:** Endpoints are intentionally thin (~70 lines each). All business logic resides in services. @@ -68,24 +70,24 @@ Business logic is organized into focused services: | **ExecutionOrchestrator** | `orchestrator.py` | Coordinates execution workflow | | **SessionService** | `session.py` | Redis session management | | **FileService** | `file.py` | MinIO file storage | -| **StateService** | `state.py` | Python state persistence (Redis) | -| **StateArchivalService** | `state_archival.py` | State archival (MinIO) | +| **StateService** | `state.py` | Internal Python state persistence (Redis, no external API) | +| **StateArchivalService** | `state_archival.py` | Internal state archival (MinIO) | | **AuthService** | `auth.py` | API key authentication | | **HealthService** | `health.py` | Health checks | | **MetricsService** | `metrics.py` | Metrics collection | | **CleanupService** | `cleanup.py` | Background cleanup tasks | -### 3. Container Management (`src/services/container/`) +### 3. 
Sandbox Management (`src/services/sandbox/`) -Container lifecycle is managed by a dedicated package: +Sandbox lifecycle is managed by a dedicated package: -| Component | File | Purpose | -| --------------------- | ------------------ | -------------------------------------------------- | -| **ContainerManager** | `manager.py` | Container lifecycle (create, start, stop, destroy) | -| **ContainerPool** | `pool.py` | Pre-warmed container pool per language | -| **ContainerExecutor** | `executor.py` | Command execution in containers | -| **REPLExecutor** | `repl_executor.py` | Python REPL communication | -| **DockerClient** | `client.py` | Docker client factory | +| Component | File | Purpose | +| -------------------- | ------------------ | ---------------------------------------------------- | +| **SandboxManager** | `manager.py` | Sandbox lifecycle (create, destroy) | +| **SandboxPool** | `pool.py` | Pre-warmed Python REPL sandbox pool | +| **SandboxExecutor** | `executor.py` | Code execution in nsjail sandboxes | +| **REPLExecutor** | `repl_executor.py` | Python REPL communication | +| **NsjailConfig** | `nsjail.py` | nsjail CLI argument builder and SandboxInfo dataclass | ### 4. Execution Engine (`src/services/execution/`) @@ -107,7 +109,7 @@ class ExecutionStarted(Event): ... class SessionCreated(Event): ... class SessionDeleted(Event): ... class FileUploaded(Event): ... -class ContainerAcquiredFromPool(Event): ... +class SandboxAcquiredFromPool(Event): ... class PoolWarmedUp(Event): ... ``` @@ -142,25 +144,25 @@ await event_bus.publish(ExecutionCompleted(session_id=..., execution_id=...)) │ ├── 3c. Load state if session_id provided (StateService) │ - ├── 3d. Upload input files to container + ├── 3d. Upload input files to sandbox directory │ - ├── 3e. Acquire container from pool + ├── 3e. Acquire sandbox from pool │ │ - │ └── ContainerPool.acquire() → returns warm container + │ └── SandboxPool.acquire() → returns warm sandbox │ ├── 3f. 
Execute code │ │ │ ├── Python + REPL: REPLExecutor.execute() - │ │ └── Send JSON via Docker attach socket + │ │ └── Send JSON via stdin/stdout pipe │ │ - │ └── Other languages: ContainerExecutor.execute() - │ └── docker exec with timeout + │ └── Other languages: SandboxExecutor.execute() + │ └── nsjail subprocess with timeout │ ├── 3g. Save state if Python (StateService) │ - ├── 3h. Download output files from container + ├── 3h. Collect output files from sandbox directory │ - └── 3i. Destroy container immediately + └── 3i. Destroy sandbox immediately │ ▼ 4. Return ExecResponse with stdout, stderr, files, session_id @@ -187,76 +189,46 @@ await event_bus.publish(ExecutionCompleted(session_id=..., execution_id=...)) 4. Return session_id and file_id ``` -### State Persistence Flow +## Sandbox Lifecycle -``` -┌─────────────────────────────────────────────────────────────────────────────┐ -│ State Persistence Flow │ -└─────────────────────────────────────────────────────────────────────────────┘ +### Sandbox Pool -First Execution (no session_id): -───────────────────────────────── -1. Execute Python code → variables created in REPL namespace -2. REPL server serializes namespace with cloudpickle + lz4 -3. StateService stores compressed state in Redis (2-hour TTL) -4. Response includes session_id for future use - -Subsequent Execution (with session_id): -──────────────────────────────────────── -1. StateService loads state from Redis - └── If not in Redis, check MinIO archives -2. REPL server deserializes state into namespace -3. Execute Python code with existing variables -4. Save updated state to Redis - -Background Archival: -──────────────────── -1. CleanupService runs periodic check (every 5 min) -2. 
For states inactive > 1 hour: - └── StateArchivalService archives to MinIO (7-day TTL) -``` - ---- - -## Container Lifecycle - -### Container Pool - -The container pool pre-warms containers to eliminate cold start latency: +The sandbox pool pre-warms Python REPL sandboxes to eliminate cold start latency: ``` ┌────────────────────────────────────────────────────────────────────────────┐ -│ Container Pool │ +│ Sandbox Pool │ ├────────────────────────────────────────────────────────────────────────────┤ │ │ -│ Python Pool (min: 5, max: 20) JavaScript Pool (min: 2, max: 8) │ -│ ┌─────┐ ┌─────┐ ┌─────┐ ┌─────┐ ┌─────┐ ┌─────┐ │ -│ │REPL │ │REPL │ │REPL │ │REPL │ │ JS │ │ JS │ │ -│ │Ready│ │Ready│ │Ready│ │Ready│ │Ready│ │Ready│ │ -│ └─────┘ └─────┘ └─────┘ └─────┘ └─────┘ └─────┘ │ +│ Python REPL Pool (configurable size, default: 5) │ +│ ┌─────┐ ┌─────┐ ┌─────┐ ┌─────┐ ┌─────┐ │ +│ │REPL │ │REPL │ │REPL │ │REPL │ │REPL │ │ +│ │Ready│ │Ready│ │Ready│ │Ready│ │Ready│ │ +│ └─────┘ └─────┘ └─────┘ └─────┘ └─────┘ │ │ │ -│ Acquisition: O(1) ~3ms Acquisition: O(1) ~3ms │ +│ Acquisition: O(1) ~3ms │ +│ Non-Python languages: one-shot nsjail execution (no pooling) │ │ │ └────────────────────────────────────────────────────────────────────────────┘ Pool Lifecycle: ─────────────── -1. On startup: Pre-warm containers to min pool size -2. On acquire: Pop container from pool, mark as in-use -3. On execution complete: Destroy container (no reuse) -4. Background: Replenish pool to min size when below threshold +1. On startup: Pre-warm Python REPL sandboxes to configured pool size +2. On acquire: Pop sandbox from pool, mark as in-use +3. On execution complete: Destroy sandbox (no reuse) +4. 
Background: Replenish pool when below threshold ``` ### REPL Server -For Python, containers run a REPL server as PID 1: +For Python, sandboxes run a REPL server as the main process: ``` ┌─────────────────────────────────────────────────────────────────────────────┐ -│ Python Container │ +│ nsjail Sandbox (Python) │ ├─────────────────────────────────────────────────────────────────────────────┤ │ │ -│ PID 1: repl_server.py │ +│ repl_server.py (running inside nsjail) │ │ ┌───────────────────────────────────────────────────────────────────┐ │ │ │ Pre-imported: numpy, pandas, matplotlib, scipy, sklearn, etc. │ │ │ │ │ │ @@ -265,16 +237,16 @@ For Python, containers run a REPL server as PID 1: │ │ Protocol: JSON-framed via stdin/stdout │ │ │ └───────────────────────────────────────────────────────────────────┘ │ │ │ -│ Communication: Docker attach socket (not exec) │ +│ Communication: stdin/stdout pipe (subprocess) │ │ │ └─────────────────────────────────────────────────────────────────────────────┘ REPL Execution (~20-40ms): ────────────────────────── -1. REPLExecutor sends JSON request via attach socket +1. REPLExecutor sends JSON request via stdin pipe 2. REPL server executes code in namespace 3. REPL server captures stdout, stderr, files -4. REPL server sends JSON response back +4. REPL server sends JSON response back via stdout 5. 
REPLExecutor parses response ``` @@ -344,7 +316,7 @@ Environment Variables (.env) │ │ │ Imports and merges: │ │ ├── api.py → API settings (host, port, debug) │ -│ ├── docker.py → Docker settings (base_url, timeout) │ +│ ├── sandbox.py → Sandbox settings (nsjail binary, base dir) │ │ ├── redis.py → Redis settings (host, port, pool) │ │ ├── minio.py → MinIO settings (endpoint, credentials) │ │ ├── security.py → Security settings (isolation, headers) │ @@ -378,23 +350,26 @@ settings.max_memory_mb ## Security Architecture -### Container Isolation +### nsjail Sandbox Isolation ``` ┌─────────────────────────────────────────────────────────────────────────────┐ -│ Container Security Layers │ +│ nsjail Security Layers │ ├─────────────────────────────────────────────────────────────────────────────┤ │ │ -│ 1. Network Isolation : network_mode: none (no network access) │ -│ 2. Filesystem Isolation : read_only: true, /tmp as tmpfs │ -│ 3. Capability Dropping : cap_drop: ALL │ -│ 4. Resource Limits : memory, cpu, pids, file descriptors │ -│ 5. Security Options : no-new-privileges:true │ -│ 6. tmpfs Options : noexec, nosuid │ +│ 1. PID Namespace : Each sandbox has its own PID 1 │ +│ 2. Mount Namespace : Minimal filesystem, read-only bind mounts │ +│ 3. Network Namespace : No network access │ +│ 4. Seccomp Filtering : Restricted syscalls │ +│ 5. Cgroup Limits : Memory, CPU, pids │ +│ 6. rlimits : File size, open files, stack size │ +│ 7. Non-root Execution : Code runs as uid 1001 (codeuser) │ │ │ └─────────────────────────────────────────────────────────────────────────────┘ ``` +**Note:** The API container requires `SYS_ADMIN` capability for nsjail to create namespaces and cgroups. No Docker socket is mounted. 
+ ### Authentication - All endpoints except `/health` require API key @@ -423,13 +398,16 @@ Response ← SecurityMiddleware ← AuthMiddleware ← LoggingMiddleware ← Met ## Key Files Reference -| Component | Primary File | Description | -| -------------- | ----------------------------------------- | ------------------------------------------------ | -| FastAPI App | `src/main.py` | Application entry point with lifespan management | -| Orchestrator | `src/services/orchestrator.py` | Execution workflow coordinator | -| Container Pool | `src/services/container/pool.py` | Pre-warmed container management | -| REPL Executor | `src/services/container/repl_executor.py` | Python REPL communication | -| REPL Server | `docker/repl_server.py` | In-container Python REPL | -| State Service | `src/services/state.py` | Python state persistence | -| Event Bus | `src/core/events.py` | Async event-driven communication | -| Settings | `src/config/__init__.py` | Unified configuration | +| Component | Primary File | Description | +| ---------------- | ---------------------------------------- | ------------------------------------------------ | +| FastAPI App | `src/main.py` | Application entry point with lifespan management | +| Orchestrator | `src/services/orchestrator.py` | Execution workflow coordinator | +| Sandbox Pool | `src/services/sandbox/pool.py` | Pre-warmed Python REPL sandbox management | +| Sandbox Manager | `src/services/sandbox/manager.py` | Sandbox lifecycle (create, destroy) | +| Sandbox Executor | `src/services/sandbox/executor.py` | Code execution in nsjail sandboxes | +| REPL Executor | `src/services/sandbox/repl_executor.py` | Python REPL communication | +| nsjail Config | `src/services/sandbox/nsjail.py` | nsjail CLI builder and SandboxInfo dataclass | +| REPL Server | `docker/repl_server.py` | In-sandbox Python REPL | +| State Service | `src/services/state.py` | Python state persistence | +| Event Bus | `src/core/events.py` | Async event-driven communication | +| 
Settings | `src/config/__init__.py` | Unified configuration | diff --git a/docs/CONFIGURATION.md b/docs/CONFIGURATION.md index 70c3a6c..d3b7adc 100644 --- a/docs/CONFIGURATION.md +++ b/docs/CONFIGURATION.md @@ -43,29 +43,15 @@ Controls the basic API server settings. Configures SSL/TLS support for secure HTTPS connections. -#### Docker Deployments - | Variable | Default | Description | | ---------------- | -------- | -------------------------------------------------------- | | `ENABLE_HTTPS` | `false` | Enable HTTPS/SSL support | | `HTTPS_PORT` | `443` | HTTPS server port | | `SSL_CERTS_PATH` | `./ssl` | Host path to directory containing `cert.pem` and `key.pem` | -| `SSL_REDIRECT` | `false` | Redirect HTTP traffic to HTTPS | - -> **Note:** When using Docker, the certificate files are automatically mapped to `/app/ssl/` inside the container. You only need to set `SSL_CERTS_PATH` to point to your certificates directory on the host. -#### Non-Docker Deployments - -| Variable | Default | Description | -| ---------------- | -------- | -------------------------------------------------------- | -| `ENABLE_HTTPS` | `false` | Enable HTTPS/SSL support | -| `HTTPS_PORT` | `443` | HTTPS server port | -| `SSL_CERT_FILE` | - | Absolute path to SSL certificate file (.pem) | -| `SSL_KEY_FILE` | - | Absolute path to SSL private key file (.pem) | -| `SSL_CA_CERTS` | - | Path to CA certificates file (optional) | -| `SSL_REDIRECT` | `false` | Redirect HTTP traffic to HTTPS | +> **Note:** The certificate files are automatically mapped to `/app/ssl/` inside the API container via `docker-compose.yml`. You only need to set `SSL_CERTS_PATH` to point to your certificates directory on the host. -**HTTPS Setup (Docker):** +**HTTPS Setup:** 1. **Generate or obtain SSL certificates**: @@ -82,8 +68,7 @@ Configures SSL/TLS support for secure HTTPS connections. 
```bash ENABLE_HTTPS=true HTTPS_PORT=443 - SSL_REDIRECT=true # Optional: redirect HTTP to HTTPS - + # If using the default ./ssl directory, no additional config needed. # If your certs are elsewhere, set the path: # SSL_CERTS_PATH=/path/to/your/ssl/certs @@ -91,28 +76,16 @@ Configures SSL/TLS support for secure HTTPS connections. The directory must contain files named `cert.pem` and `key.pem`. -3. **Deploy with Docker Compose**: +3. **Deploy with docker compose**: ```bash - docker-compose up -d + docker compose up -d ``` -**HTTPS Setup (Non-Docker):** - -```bash -ENABLE_HTTPS=true -HTTPS_PORT=443 -SSL_CERT_FILE=/absolute/path/to/cert.pem -SSL_KEY_FILE=/absolute/path/to/key.pem -SSL_REDIRECT=true -``` - **Security Notes:** - Use certificates from trusted Certificate Authorities in production - Keep private keys secure and never commit them to version control - Consider using Let's Encrypt for free SSL certificates -- Enable `SSL_REDIRECT` to automatically redirect HTTP to HTTPS - ### Authentication Configuration Manages API key authentication and security. @@ -121,8 +94,6 @@ Manages API key authentication and security. | ------------------- | -------------- | -------------------------------------- | | `API_KEY` | `test-api-key` | Primary API key (CHANGE IN PRODUCTION) | | `API_KEYS` | - | Additional API keys (comma-separated) | -| `API_KEY_HEADER` | `x-api-key` | HTTP header name for API key | -| `API_KEY_CACHE_TTL` | `300` | API key validation cache TTL (seconds) | **Security Notes:** @@ -162,87 +133,70 @@ MinIO provides S3-compatible object storage for files. | `MINIO_SECRET_KEY` | `minioadmin` | MinIO secret key | | `MINIO_SECURE` | `false` | Use HTTPS for MinIO connections | | `MINIO_BUCKET` | `code-interpreter-files` | Bucket name for file storage | -| `MINIO_REGION` | `us-east-1` | MinIO region | -### Docker Configuration +### Sandbox Configuration -Docker is used for secure code execution in containers. 
+nsjail is used for secure code execution in isolated sandboxes. -| Variable | Default | Description | -| --------------------- | ------- | -------------------------------------------- | -| `DOCKER_BASE_URL` | - | Docker daemon URL (auto-detected if not set) | -| `DOCKER_TIMEOUT` | `60` | Docker operation timeout (seconds) | -| `DOCKER_NETWORK_MODE` | `none` | Container network mode | -| `DOCKER_READ_ONLY` | `true` | Mount container filesystem as read-only | +| Variable | Default | Description | +| ---------------------------------- | ------------------------------------------- | ------------------------------------- | +| `NSJAIL_BINARY` | `nsjail` | Path to nsjail binary | +| `SANDBOX_BASE_DIR` | `/var/lib/code-interpreter/sandboxes` | Base directory for sandbox filesystems | +| `SANDBOX_TMPFS_SIZE_MB` | `100` | tmpfs size for sandbox /tmp (MB) | +| `SANDBOX_TTL_MINUTES` | `5` | Sandbox time-to-live | +| `SANDBOX_CLEANUP_INTERVAL_MINUTES` | `5` | Cleanup check interval | **Security Notes:** -- `DOCKER_NETWORK_MODE=none` provides maximum isolation -- `DOCKER_READ_ONLY=true` prevents container filesystem modifications +- nsjail provides PID, mount, and network namespace isolation +- Code runs as non-root user (uid 1001) inside the sandbox +- The API container requires `SYS_ADMIN` capability for nsjail namespace creation ### Resource Limits #### Execution Limits -| Variable | Default | Description | -| -------------------- | ------- | ---------------------------------------------------------------- | -| `MAX_EXECUTION_TIME` | `30` | Maximum code execution time (seconds) | -| `MAX_MEMORY_MB` | `512` | Maximum memory per execution (MB) | -| `MAX_CPUS` | `4.0` | Maximum CPU cores available to execution containers | -| `MAX_PIDS` | `512` | Per-container process limit (cgroup pids_limit, prevents fork bombs) | -| `MAX_OPEN_FILES` | `1024` | Maximum open files per container | +| Variable | Default | Description | +| -------------------- | ------- | 
------------------------------------- | +| `MAX_EXECUTION_TIME` | `30` | Maximum code execution time (seconds) | +| `MAX_MEMORY_MB` | `512` | Maximum memory per execution (MB) | #### File Limits -| Variable | Default | Description | -| ------------------------ | ------- | ---------------------------------------- | -| `MAX_FILE_SIZE_MB` | `10` | Maximum individual file size (MB) | -| `MAX_TOTAL_FILE_SIZE_MB` | `50` | Maximum total file size per session (MB) | -| `MAX_FILES_PER_SESSION` | `50` | Maximum files per session | -| `MAX_OUTPUT_FILES` | `10` | Maximum output files per execution | -| `MAX_FILENAME_LENGTH` | `255` | Maximum filename length | - -#### Session Limits - -| Variable | Default | Description | -| --------------------------- | ------- | ---------------------------------- | -| `MAX_CONCURRENT_EXECUTIONS` | `10` | Maximum concurrent code executions | -| `MAX_SESSIONS_PER_ENTITY` | `100` | Maximum sessions per entity | +| Variable | Default | Description | +| ----------------------- | ------- | ---------------------------------- | +| `MAX_FILE_SIZE_MB` | `10` | Maximum individual file size (MB) | +| `MAX_FILES_PER_SESSION` | `50` | Maximum files per session | +| `MAX_OUTPUT_FILES` | `10` | Maximum output files per execution | +| `MAX_FILENAME_LENGTH` | `255` | Maximum filename length | ### Session Configuration | Variable | Default | Description | | ---------------------------------- | ------- | ---------------------------- | | `SESSION_TTL_HOURS` | `24` | Session time-to-live (hours) | -| `SESSION_CLEANUP_INTERVAL_MINUTES` | `60` | Cleanup interval (minutes) | -| `SESSION_ID_LENGTH` | `32` | Session ID length | +| `SESSION_CLEANUP_INTERVAL_MINUTES` | `10` | Cleanup interval (minutes) | -### Container Pool Configuration +### Sandbox Pool Configuration -Pre-warmed containers significantly reduce execution latency by eliminating cold start time. 
+Pre-warmed Python REPL sandboxes reduce execution latency by eliminating interpreter startup and library import time. Only Python supports REPL pooling; all other languages use one-shot nsjail execution. | Variable | Default | Description | | ---------------------------------- | ------- | -------------------------------------- | -| `CONTAINER_POOL_ENABLED` | `true` | Enable container pooling | -| `CONTAINER_POOL_MIN_SIZE` | `2` | Default minimum pool size per language | -| `CONTAINER_POOL_MAX_SIZE` | `15` | Default maximum pool size per language | -| `CONTAINER_POOL_WARMUP_ON_STARTUP` | `true` | Pre-warm containers at startup | -| `CONTAINER_POOL_PY_MIN` | `5` | Minimum Python containers in pool | -| `CONTAINER_POOL_PY_MAX` | `20` | Maximum Python containers in pool | -| `CONTAINER_POOL_JS_MIN` | `2` | Minimum JavaScript containers in pool | -| `CONTAINER_POOL_JS_MAX` | `8` | Maximum JavaScript containers in pool | +| `SANDBOX_POOL_ENABLED` | `true` | Enable Python REPL pool | +| `SANDBOX_POOL_WARMUP_ON_STARTUP` | `true` | Pre-warm Python REPLs at startup | +| `SANDBOX_POOL_PY` | `5` | Number of pre-warmed Python REPLs | -**Note:** Containers are destroyed immediately after execution - there is no TTL-based cleanup. The pool is automatically replenished in the background. +**Note:** Sandboxes are destroyed immediately after execution. The pool is automatically replenished in the background. Non-Python languages do not use pooling. ### REPL Configuration (Python Fast Execution) -REPL mode keeps a Python interpreter running inside pooled containers with common libraries pre-imported, reducing execution latency from ~3,500ms to ~20-40ms. +REPL mode keeps a Python interpreter running inside pooled sandboxes with common libraries pre-imported, reducing execution latency from ~3,500ms to ~20-40ms. 
-| Variable | Default | Description | -| ----------------------------------- | ------- | --------------------------------------- | -| `REPL_ENABLED` | `true` | Enable pre-warmed Python REPL | -| `REPL_WARMUP_TIMEOUT_SECONDS` | `15` | Timeout for REPL server to become ready | -| `REPL_HEALTH_CHECK_TIMEOUT_SECONDS` | `5` | Timeout for REPL health checks | +| Variable | Default | Description | +| ----------------------------- | ------- | --------------------------------------- | +| `REPL_ENABLED` | `true` | Enable pre-warmed Python REPL | +| `REPL_WARMUP_TIMEOUT_SECONDS` | `15` | Timeout for REPL server to become ready | ### State Persistence Configuration (Python) @@ -252,7 +206,6 @@ Python sessions can persist variables, functions, and objects across executions | --------------------------- | ------- | ------------------------------------ | | `STATE_PERSISTENCE_ENABLED` | `true` | Enable Python state persistence | | `STATE_TTL_SECONDS` | `7200` | Redis hot storage TTL (2 hours) | -| `STATE_MAX_SIZE_MB` | `50` | Maximum serialized state size | | `STATE_CAPTURE_ON_ERROR` | `false` | Save state even on execution failure | ### State Archival Configuration (Python) @@ -270,27 +223,28 @@ Inactive states are automatically archived to MinIO for long-term storage. 
| Variable | Default | Description | | ----------------------------- | ------- | --------------------------------------- | -| `ENABLE_NETWORK_ISOLATION` | `true` | Enable network isolation for containers | +| `ENABLE_NETWORK_ISOLATION` | `true` | Enable network isolation for sandboxes | | `ENABLE_FILESYSTEM_ISOLATION` | `true` | Enable filesystem isolation | ### Logging Configuration -| Variable | Default | Description | -| ---------------------- | ------- | ------------------------------------------- | -| `LOG_LEVEL` | `INFO` | Logging level (DEBUG, INFO, WARNING, ERROR) | -| `LOG_FORMAT` | `json` | Log format (json or text) | -| `LOG_FILE` | - | Log file path (stdout if not set) | -| `LOG_MAX_SIZE_MB` | `100` | Maximum log file size (MB) | -| `LOG_BACKUP_COUNT` | `5` | Number of log file backups | -| `ENABLE_ACCESS_LOGS` | `true` | Enable HTTP access logs | -| `ENABLE_SECURITY_LOGS` | `true` | Enable security event logs | +| Variable | Default | Description | +| ---------------------- | ------- | ----------------------------------------------- | +| `LOG_LEVEL` | `INFO` | Logging level (DEBUG, INFO, WARNING, ERROR) | +| `LOG_FORMAT` | `json` | Log format (`json` or `text`) | +| `LOG_FILE` | - | Log file path (stdout if not set) | +| `LOG_MAX_SIZE_MB` | `100` | Maximum log file size (MB) | +| `LOG_BACKUP_COUNT` | `5` | Number of log file backups | +| `ENABLE_ACCESS_LOGS` | `false` | Enable uvicorn HTTP access logs | +| `ENABLE_SECURITY_LOGS` | `true` | Enable security event logs | + +**Log level guide:** -### Health Check Configuration +- **`INFO`** (default) — Clean, readable output. Logs startup/shutdown lifecycle, one entry per code execution (request + response), session cleanup summaries, warnings, and errors. Internal details like sandbox creation, REPL warmup, state persistence, file operations, and pool replenishment are suppressed. +- **`DEBUG`** — Full detail. 
Adds per-request internals: sandbox acquisition, REPL readiness, state save/load, file mounting, session reuse lookups, pool warmup cycles, and all HTTP request/response logging. +- **`WARNING`** / **`ERROR`** — Only problems. -| Variable | Default | Description | -| ----------------------- | ------- | ------------------------------- | -| `HEALTH_CHECK_INTERVAL` | `30` | Health check interval (seconds) | -| `HEALTH_CHECK_TIMEOUT` | `5` | Health check timeout (seconds) | +**Request logging:** The `RequestLoggingMiddleware` handles HTTP request logging with status-aware levels — 5xx responses log at ERROR, 4xx at WARNING, and 2xx/3xx at DEBUG. This replaces uvicorn's native access logs (disabled by default). Set `ENABLE_ACCESS_LOGS=true` to re-enable uvicorn's access logs if needed. ### Development Configuration @@ -302,32 +256,22 @@ Inactive states are automatically archived to MinIO for long-term storage. ## Language-Specific Configuration -Each supported programming language has its own configuration for container images and resource multipliers: +All 12 language runtimes are pre-installed in the unified Docker image. No per-language images are needed. ### Supported Languages -- **Python** (`py`): `python:3.11-slim` -- **Node.js** (`js`): `node:18-alpine` -- **TypeScript** (`ts`): `node:18-alpine` -- **Go** (`go`): `golang:1.21-alpine` -- **Java** (`java`): `openjdk:11-jre-slim` -- **C** (`c`): `gcc:latest` -- **C++** (`cpp`): `gcc:latest` -- **PHP** (`php`): `php:8.2-cli-alpine` -- **Rust** (`rs`): `rust:1.70-slim` -- **R** (`r`): `r-base:latest` -- **Fortran** (`f90`): `gcc:latest` -- **D** (`d`): `dlang2/dmd-ubuntu:latest` - -### Custom Language Images - -You can override default images using environment variables: - -```bash -LANG_PYTHON_IMAGE=python:3.12-slim -LANG_NODEJS_IMAGE=node:20-alpine -LANG_JAVA_IMAGE=openjdk:17-jre-slim -``` +- **Python** (`py`): Python 3.12 with numpy, pandas, matplotlib, scipy, sklearn, etc. 
+- **Node.js** (`js`): Node.js 22 +- **TypeScript** (`ts`): Node.js 22 with TypeScript +- **Go** (`go`): Go 1.23 +- **Java** (`java`): OpenJDK (default-jdk) +- **C** (`c`): GCC +- **C++** (`cpp`): G++ +- **PHP** (`php`): PHP 8.3 +- **Rust** (`rs`): Rust (stable) +- **R** (`r`): R with dplyr, ggplot2, data.table, etc. +- **Fortran** (`f90`): gfortran +- **D** (`d`): LDC ## Configuration Management Tools @@ -374,8 +318,7 @@ if validate_configuration(): - [ ] Change default API key to a secure random value - [ ] Enable network isolation (`ENABLE_NETWORK_ISOLATION=true`) - [ ] Enable filesystem isolation (`ENABLE_FILESYSTEM_ISOLATION=true`) -- [ ] Set Docker network mode to `none` -- [ ] Enable read-only container filesystems +- [ ] Ensure nsjail sandbox isolation is active - [ ] Review and adjust resource limits ### Performance @@ -385,13 +328,12 @@ if validate_configuration(): - [ ] Set reasonable execution timeouts - [ ] Configure log rotation - [ ] Enable REPL mode for Python (`REPL_ENABLED=true`) -- [ ] Configure container pool sizes based on language usage +- [ ] Configure sandbox pool size based on expected Python usage - [ ] Review state persistence TTL settings ### State Persistence (Python) - [ ] Configure `STATE_TTL_SECONDS` based on session patterns -- [ ] Set `STATE_MAX_SIZE_MB` limit appropriate for use case - [ ] Enable state archival for long-term session resumption - [ ] Configure archival TTL (`STATE_ARCHIVE_TTL_DAYS`) @@ -406,7 +348,7 @@ if validate_configuration(): - [ ] Secure Redis with authentication - [ ] Secure MinIO with proper access keys -- [ ] Configure Docker daemon security +- [ ] Ensure SYS_ADMIN capability is set for nsjail - [ ] Set up backup for Redis and MinIO data ## Troubleshooting @@ -431,10 +373,10 @@ python config_manager.py validate - Check access key and secret key - Ensure bucket exists or can be created -3. 
**Docker Connection Failed** - - Verify Docker daemon is running - - Check Docker socket permissions - - Ensure user has Docker access +3. **Sandbox Execution Failed** + - Verify nsjail binary is available + - Check that the API container has SYS_ADMIN capability + - Ensure sandbox base directory exists and is writable 4. **Resource Limit Errors** - Check system resources available @@ -443,11 +385,12 @@ python config_manager.py validate ### Debug Mode -Enable debug mode for detailed logging: +Enable verbose logging for troubleshooting: ```bash -API_DEBUG=true -LOG_LEVEL=DEBUG +LOG_LEVEL=DEBUG # Shows all internal operations (sandbox, REPL, state, files) +ENABLE_ACCESS_LOGS=true # Re-enables uvicorn per-request access logs +API_DEBUG=true # Enables /config endpoint and verbose error responses ``` **Warning:** Disable debug mode in production as it may expose sensitive information. @@ -462,6 +405,7 @@ API_RELOAD=true ENABLE_CORS=true ENABLE_DOCS=true LOG_LEVEL=DEBUG +ENABLE_ACCESS_LOGS=true ``` ### Testing @@ -484,4 +428,6 @@ ENABLE_DOCS=false LOG_LEVEL=INFO LOG_FORMAT=json ENABLE_SECURITY_LOGS=true +# ENABLE_ACCESS_LOGS defaults to false — request logging middleware +# handles this with status-aware levels (errors at WARNING/ERROR) ``` diff --git a/docs/DEVELOPMENT.md b/docs/DEVELOPMENT.md index 7cbb20a..5b77100 100644 --- a/docs/DEVELOPMENT.md +++ b/docs/DEVELOPMENT.md @@ -7,7 +7,7 @@ This document provides detailed instructions for setting up the development envi ### Prerequisites - Python 3.11+ -- Docker Engine +- Docker and docker compose (for running the API container, Redis, and MinIO) - Redis - MinIO (or S3-compatible storage) @@ -43,7 +43,7 @@ This document provides detailed instructions for setting up the development envi 5. **Start infrastructure services** ```bash - docker-compose up -d + docker compose up -d ``` 6. **Run the API server** @@ -61,23 +61,20 @@ For detailed testing instructions, please refer to [TESTING.md](TESTING.md). 
# Run unit tests pytest tests/unit/ -# Run integration tests (requires Docker/Redis/MinIO) +# Run integration tests (requires running API container, Redis, MinIO) pytest tests/integration/ # Run all tests with coverage pytest --cov=src tests/ ``` -## Building Docker Images +## Building the Docker Image -The API requires language-specific execution images. +The API uses a single unified Docker image containing all 12 language runtimes and nsjail. ```bash -# Build all language execution images -cd docker && ./build-images.sh -p && cd .. - -# Build a single language image (e.g., Python) -cd docker && ./build-images.sh -l python && cd .. +# Build the unified image +docker build -t code-interpreter:nsjail . ``` -For more details on container management, see [ARCHITECTURE.md](ARCHITECTURE.md). +For more details on the sandbox architecture, see [ARCHITECTURE.md](ARCHITECTURE.md). diff --git a/docs/METRICS.md b/docs/METRICS.md index b40a4c7..b0ff1fb 100644 --- a/docs/METRICS.md +++ b/docs/METRICS.md @@ -18,12 +18,12 @@ Track per-execution, per-language, and per-API-key metrics. 
| `GET /metrics/detailed` | Summary with language breakdown | | `GET /metrics/by-language` | Per-language execution stats | | `GET /metrics/by-api-key/{hash}` | Per-API-key usage | -| `GET /metrics/pool` | Container pool hit rates | +| `GET /metrics/pool` | Sandbox pool hit rates | ## Tracked Metrics **Per-execution:** -- Language, execution time, memory usage, status, files generated, container source +- Language, execution time, memory usage, status, files generated, sandbox source **Per-language:** - Execution count, error rates, average execution times diff --git a/docs/PERFORMANCE.md b/docs/PERFORMANCE.md index 3bb25db..164b74c 100644 --- a/docs/PERFORMANCE.md +++ b/docs/PERFORMANCE.md @@ -6,14 +6,14 @@ This document provides performance benchmarks, tuning recommendations, and monit ### Baseline Metrics (With Optimizations) -The following metrics represent typical performance with all optimizations enabled (container pooling, REPL mode): +The following metrics represent typical performance with all optimizations enabled (sandbox pooling, REPL mode): | Metric | Value | Notes | | ------------------------------ | ---------- | ------------------------------- | | **Python execution (simple)** | 20-40ms | With REPL mode | | **Python execution (complex)** | 50-200ms | Depends on code complexity | -| **JavaScript execution** | 50-100ms | With container pool | -| **Container acquisition** | ~3ms | From pre-warmed pool | +| **JavaScript execution** | 50-100ms | One-shot nsjail execution | +| **Sandbox acquisition** | ~3ms | From pre-warmed pool | | **Cold start (no pool)** | 500-2000ms | First request or pool exhausted | | **State serialization** | 1-25ms | Depends on state size | | **File upload (1MB)** | 50-100ms | To MinIO | @@ -30,14 +30,14 @@ The following metrics represent typical performance with all optimizations enabl ## Optimization Features -### 1. Container Pool +### 1. 
Sandbox Pool -Pre-warmed containers eliminate cold start latency: +Pre-warmed Python REPL sandboxes eliminate cold start latency: ``` Without Pool: -Request → Create Container → Start → Execute → Destroy - [~500-2000ms] [~100ms] [~50ms] [~50ms] +Request → Create Sandbox → Start nsjail → Execute → Destroy + [~500-2000ms] [~100ms] [~50ms] [~50ms] Total: ~700-2200ms With Pool: @@ -50,12 +50,11 @@ Request → Acquire from Pool → Execute → Destroy → (Background: Replenish ```bash -CONTAINER_POOL_ENABLED=true +SANDBOX_POOL_ENABLED=true -CONTAINER_POOL_MIN_SIZE=2 # Default per language -CONTAINER_POOL_MAX_SIZE=15 # Default per language -CONTAINER_POOL_PY_MIN=5 # Python-specific minimum -CONTAINER_POOL_PY_MAX=20 # Python-specific maximum +SANDBOX_POOL_PY=5 # Number of pre-warmed Python REPLs ``` +**Note:** Only Python supports REPL pooling. All other languages use one-shot nsjail execution. + ### 2. REPL Mode (Python) Pre-warmed Python interpreter with common libraries: @@ -102,20 +101,20 @@ REDIS_SOCKET_CONNECT_TIMEOUT=5 ### Pool Size Recommendations -| Usage Pattern | Python Min/Max | JS Min/Max | Other Min/Max | -| ---------------------- | -------------- | ---------- | ------------- | -| Light (< 10 req/min) | 2/5 | 1/3 | 1/2 | -| Medium (10-50 req/min) | 5/15 | 2/8 | 2/5 | -| Heavy (> 50 req/min) | 10/30 | 5/15 | 3/10 | +| Usage Pattern | Python Pool Size | +| ---------------------- | ---------------- | +| Light (< 10 req/min) | 2-5 | +| Medium (10-50 req/min) | 5-15 | +| Heavy (> 50 req/min) | 10-30 | **Trade-offs:** -- Higher min = more memory usage, faster warm responses -- Higher max = handles bursts better, more resource usage +- Higher pool size = more memory usage, faster warm responses +- Non-Python languages use one-shot nsjail execution (no pooling) ### Memory Allocation -Each container uses memory: +Each sandbox uses memory: | Language | Base Memory | With Code | Recommendation | | ----------------- | ----------- | --------- | -------------- | @@ -128,7 +127,7 @@ Each container uses
memory: **Configuration:** ```bash -MAX_MEMORY_MB=512 # Default per container +MAX_MEMORY_MB=512 # Default per sandbox ``` ### State Persistence Tuning @@ -159,7 +158,7 @@ Request parsing ~1ms Authentication ~1ms Session lookup ~2ms State load (if exists) ~3ms -Container acquire ~3ms +Sandbox acquire ~3ms REPL communication ~5ms Code execution ~20ms State save ~3ms @@ -176,11 +175,11 @@ Component Time Request parsing ~1ms Authentication ~1ms Session lookup ~2ms -File upload to container ~10ms (1MB file) -Container acquire ~3ms +File upload to sandbox ~10ms (1MB file) +Sandbox acquire ~3ms Code execution ~50ms Output file detection ~5ms -File download from container ~10ms +File download from sandbox ~10ms MinIO upload ~20ms Response building ~2ms ────────────────────────────────── @@ -205,9 +204,9 @@ The API handles concurrent requests efficiently: **Bottlenecks at high concurrency:** -1. Container pool exhaustion (wait for replenishment) +1. Sandbox pool exhaustion (wait for replenishment) 2. Redis connection pool saturation -3. Docker daemon throughput +3. nsjail process throughput ### Horizontal Scaling @@ -216,7 +215,7 @@ For high-throughput deployments: 1. **Multiple API instances**: Load balance across instances 2. **Shared Redis**: All instances use same Redis for sessions/state 3. **Shared MinIO**: All instances use same MinIO for files -4. **Separate Docker hosts**: Distribute container load +4. **Separate hosts**: Distribute sandbox load across API instances ``` ┌─────────────────┐ @@ -226,7 +225,7 @@ For high-throughput deployments: ▼ ▼ ▼ ┌──────────┐ ┌──────────┐ ┌──────────┐ │ API 1 │ │ API 2 │ │ API 3 │ - │+ Docker │ │+ Docker │ │+ Docker │ + │+ nsjail │ │+ nsjail │ │+ nsjail │ └────┬─────┘ └────┬─────┘ └────┬─────┘ │ │ │ └───────────────┼───────────────┘ @@ -311,7 +310,7 @@ Recommended alert conditions: curl https://localhost/metrics | jq '.pool' ``` - If pool is frequently exhausted, increase `CONTAINER_POOL_MAX_SIZE`. 
+ If pool is frequently exhausted, increase `SANDBOX_POOL_PY`. 2. **Check Redis latency**: @@ -325,36 +324,35 @@ Recommended alert conditions: ```bash curl https://localhost/health/detailed | jq '.repl' ``` - If unhealthy, check REPL server logs in containers. + If unhealthy, check REPL server logs. ### Pool Exhaustion 1. **Increase pool size**: ```bash - CONTAINER_POOL_MAX_SIZE=30 - CONTAINER_POOL_PY_MAX=40 + SANDBOX_POOL_PY=15 ``` 2. **Check for slow executions**: - Long-running code blocks containers. Consider timeout reduction: + Long-running code blocks sandboxes. Consider timeout reduction: ```bash MAX_EXECUTION_TIME=15 ``` -3. **Check container cleanup**: - Containers should be destroyed immediately. Check for zombie containers: +3. **Check sandbox cleanup**: + Sandboxes should be destroyed immediately. Check for stale sandbox directories: ```bash - docker ps -a --filter "label=com.code-interpreter.managed=true" + ls -la /var/lib/code-interpreter/sandboxes/ ``` ### Memory Issues -1. **Check container memory**: +1. **Check API container memory**: ```bash - docker stats --no-stream + docker stats --no-stream code-interpreter-api ``` 2. **Reduce state size limit**: diff --git a/docs/PROGRAMMATIC_TOOL_CALLING.md b/docs/PROGRAMMATIC_TOOL_CALLING.md index 3974339..3e4b66f 100644 --- a/docs/PROGRAMMATIC_TOOL_CALLING.md +++ b/docs/PROGRAMMATIC_TOOL_CALLING.md @@ -45,7 +45,7 @@ X-API-Key: {api_key} ### Authentication -Same as `/exec` endpoint - uses `X-API-Key` header (also supports `Authorization: Bearer` and `Authorization: ApiKey`). +Same as `/exec` endpoint - uses `X-API-Key` header.
--- @@ -207,7 +207,7 @@ The server must maintain execution state between requests: class PausedExecution: id: str # continuation_token session_id: str - container_id: str # Keep container alive + sandbox_id: str # Keep sandbox alive python_state: bytes # Pickled execution state OR execution_socket: Any # Active socket connection pending_tool_calls: List[dict] @@ -220,7 +220,7 @@ class PausedExecution: - Redis with TTL (recommended) - In-memory with cleanup task -- Container stays running with paused coroutine +- Sandbox stays running with paused coroutine ### 2. Python Environment Setup @@ -231,9 +231,9 @@ The Python execution environment must: 3. **Capture stdout/stderr** - Buffer output across round-trips 4. **Handle imports** - Pre-import common libraries -### 3. Container Lifecycle +### 3. Sandbox Lifecycle -Unlike `/exec`, containers for programmatic execution must: +Unlike `/exec`, sandboxes for programmatic execution must: - **Stay alive** between round-trips - Have **longer TTL** (match request timeout, up to 5 minutes) @@ -455,7 +455,7 @@ async def retrieve_execution_state(execution_id: str) -> PausedExecution: - Return partial stdout/stderr on timeout - Continue execution even if some tools error -- Clean up container on any terminal state +- Clean up sandbox on any terminal state --- @@ -468,12 +468,12 @@ async def retrieve_execution_state(execution_id: str) -> PausedExecution: - **Validate round-trip count** to prevent replay - **Bind to session** to prevent cross-session attacks -### 2. Container Isolation +### 2. Sandbox Isolation -- Same isolation as `/exec` (network disabled, capabilities dropped) +- Same isolation as `/exec` (nsjail namespace isolation, seccomp, cgroups) - **Longer lifetime** requires monitoring for resource abuse - **Memory limits** still enforced -- Container destroyed on completion/error/timeout +- Sandbox destroyed on completion/error/timeout ### 3. 
Tool Injection @@ -502,7 +502,7 @@ async def retrieve_execution_state(execution_id: str) -> PausedExecution: 1. **Continuation tokens**: Generate and validate 2. **State storage**: Redis with TTL -3. **Container persistence**: Keep alive between rounds +3. **Sandbox persistence**: Keep alive between rounds 4. **Round-trip limits**: Enforce maximum 20 ### Phase 3: Production Hardening diff --git a/docs/REPL.md b/docs/REPL.md index 9ff53ad..81e35e9 100644 --- a/docs/REPL.md +++ b/docs/REPL.md @@ -4,7 +4,7 @@ This document describes the Python REPL (Read-Eval-Print Loop) server that enabl ## Overview -The REPL server is a Python process that runs inside Docker containers, keeping the Python interpreter warm with common libraries pre-imported. This eliminates the ~3 second Python startup overhead on each execution. +The REPL server is a Python process that runs inside nsjail sandboxes, keeping the Python interpreter warm with common libraries pre-imported. This eliminates the ~3 second Python startup overhead on each execution. 
### Performance Impact @@ -23,24 +23,24 @@ The REPL server is a Python process that runs inside Docker containers, keeping ``` ┌─────────────────────────────────────────────────────────────────────────────┐ -│ Host (API Server) │ +│ API Container (Host Process) │ │ │ -│ src/services/container/repl_executor.py │ +│ src/services/sandbox/repl_executor.py │ │ ┌───────────────────────────────────────────────────────────────────┐ │ │ │ REPLExecutor │ │ -│ │ - Communicates via Docker attach socket │ │ +│ │ - Communicates via stdin/stdout pipe │ │ │ │ - Sends JSON requests │ │ │ │ - Parses JSON responses │ │ │ │ - Handles timeouts and errors │ │ │ └───────────────────────────────────────────────────────────────────┘ │ │ │ │ └──────────────────────────────┼──────────────────────────────────────────────┘ - │ Docker attach socket + │ stdin/stdout pipe ▼ ┌─────────────────────────────────────────────────────────────────────────────┐ -│ Docker Container │ +│ nsjail Sandbox │ │ │ -│ docker/repl_server.py (PID 1) │ +│ docker/repl_server.py (main process) │ │ ┌───────────────────────────────────────────────────────────────────┐ │ │ │ REPL Server │ │ │ │ - Pre-imports: numpy, pandas, matplotlib, scipy, sklearn, etc. 
│ │ @@ -56,12 +56,12 @@ The REPL server is a Python process that runs inside Docker containers, keeping ### Key Files -| File | Location | Purpose | -| ------------------ | ----------------------------------------- | ------------------------------------ | -| `repl_server.py` | `docker/repl_server.py` | In-container REPL server | -| `repl_executor.py` | `src/services/container/repl_executor.py` | Host-side communication | -| `entrypoint.sh` | `docker/entrypoint.sh` | Mode-aware container startup | -| `runner.py` | `src/services/execution/runner.py` | Routes to REPL or standard execution | +| File | Location | Purpose | +| ------------------ | ---------------------------------------- | ------------------------------------ | +| `repl_server.py` | `docker/repl_server.py` | In-sandbox REPL server | +| `repl_executor.py` | `src/services/sandbox/repl_executor.py` | Host-side communication | +| `entrypoint.sh` | `docker/entrypoint.sh` | Mode-aware sandbox startup | +| `runner.py` | `src/services/execution/runner.py` | Routes to REPL or standard execution | --- @@ -69,9 +69,9 @@ The REPL server is a Python process that runs inside Docker containers, keeping ### Communication Channel -The REPL uses Docker attach socket (not exec) for communication: +The REPL uses stdin/stdout pipes for communication: -- **Attach socket**: Connects to container's stdin/stdout +- **Subprocess pipe**: Connects to sandbox process's stdin/stdout - **JSON framing**: Messages delimited by special markers - **Bidirectional**: Request in, response out @@ -173,8 +173,8 @@ import sys ### Notes -- Imports are done at container startup (during pool warmup) -- Import time is amortized across all requests to that container +- Imports are done at sandbox startup (during pool warmup) +- Import time is amortized across all requests to that sandbox - User can still import additional libraries in their code --- @@ -248,37 +248,37 @@ REPL_ENABLED=false When disabled: -- Python uses standard execution (docker 
exec) +- Python uses standard one-shot nsjail execution - Startup overhead ~3 seconds per request - State persistence still works (via file-based serialization) --- -## Container Lifecycle +## Sandbox Lifecycle ### Startup Sequence -1. Container created with `repl_server.py` as entrypoint +1. nsjail sandbox created with `repl_server.py` as the main process 2. REPL server initializes and pre-imports libraries (~10-15 seconds) 3. REPL server writes "ready" marker to stdout -4. Container pool marks container as available -5. Container waits for requests on stdin +4. Sandbox pool marks sandbox as available +5. Sandbox waits for requests on stdin ### Request Processing -1. REPLExecutor sends JSON request via attach socket +1. REPLExecutor sends JSON request via stdin pipe 2. REPL server reads until delimiter 3. REPL server executes code in namespace 4. REPL server captures output and state 5. REPL server sends JSON response 6. REPLExecutor parses response -### Container Destruction +### Sandbox Destruction After each request: -- Container is destroyed immediately -- No container reuse (fresh state each request) +- Sandbox is destroyed immediately +- No sandbox reuse (fresh state each request) - Pool replenishes in background --- @@ -295,7 +295,7 @@ try: timeout=execution_timeout ) except asyncio.TimeoutError: - # Container is killed, new container acquired for retry + # Sandbox is killed, new sandbox acquired for retry raise ExecutionTimeoutError("Execution timed out") ``` @@ -345,10 +345,10 @@ If state cannot be serialized: ### REPL Not Starting -1. **Check container logs**: +1. **Check API container logs**: ```bash - docker logs + docker logs code-interpreter-api ``` 2. **Check warmup timeout**: @@ -359,18 +359,18 @@ If state cannot be serialized: ``` 3. **Check memory**: - REPL requires ~150MB for pre-imports. Ensure container has enough memory. + REPL requires ~150MB for pre-imports. Ensure the API container has enough memory. ### High Latency -1. 
**Check attach socket health**: +1. **Check sandbox health**: ```bash - curl https://localhost/health/detailed | jq '.containers' + curl https://localhost/health/detailed | jq '.sandbox' ``` 2. **Check for blocking operations**: - User code with network or disk I/O can block the REPL. + User code with disk I/O can block the REPL. 3. **Check state size**: Large state causes serialization overhead. Keep state < 1MB for best performance. @@ -396,14 +396,14 @@ If state cannot be serialized: ### Testing REPL Locally ```bash -# Build Python image -cd docker && ./build-images.sh -l python +# Build the unified image +docker build -t code-interpreter:nsjail . -# Run container with REPL -docker run -it --rm code-interp-python:latest +# Start the API with docker compose +docker compose up -d -# In container, REPL server starts automatically -# Send test request via stdin +# Check REPL health +curl -sk https://localhost/health/detailed | jq '.sandbox' ``` ### Debugging REPL Server @@ -429,7 +429,7 @@ PRELOAD_MODULES = [ ] ``` -Remember to rebuild the Docker image after changes. +Remember to rebuild the unified Docker image after changes (`docker build -t code-interpreter:nsjail .`). --- diff --git a/docs/SECURITY.md b/docs/SECURITY.md index 45580dd..27d0725 100644 --- a/docs/SECURITY.md +++ b/docs/SECURITY.md @@ -12,18 +12,11 @@ All API endpoints (except health checks and documentation) require authenticatio #### Providing API Key -The API key can be provided in two ways: +The API key is provided via the `x-api-key` header: -1. **x-api-key header** (recommended): - - ```bash - curl -H "x-api-key: your-api-key" https://api.example.com/sessions - ``` - -2. 
**Authorization header**: - ```bash - curl -H "Authorization: Bearer your-api-key" https://api.example.com/sessions - ``` +```bash +curl -H "x-api-key: your-api-key" https://api.example.com/sessions +``` #### Configuration @@ -102,121 +95,24 @@ Code is analyzed for potentially dangerous patterns: - **File operations**: `open()`, `file()`, etc. - **Input functions**: `input()`, `raw_input()`, etc. -**Note**: Dangerous patterns generate warnings but don't block execution, as the code runs in isolated containers. - -#### Container Isolation - -- **Docker containers**: All code runs in isolated Docker containers -- **Resource limits**: Memory and CPU limits are enforced -- **Network isolation**: External network access is blocked by default -- **Filesystem isolation**: Limited filesystem access within containers - -#### Container Hardening (Host Info Protection) - -Containers are hardened to prevent information leakage about the host infrastructure. -This prevents reconnaissance attacks that could reveal details about your cloud provider, -kernel version, or internal network configuration. - -**Currently Implemented**: - -| Feature | Protection | -|---------|------------| -| Generic hostname | All containers use hostname "sandbox" instead of revealing host info | -| Empty DNS search domain | WAN containers have empty search domain to prevent Azure/cloud domain leakage | -| Public DNS only | WAN containers use only public DNS (8.8.8.8, 1.1.1.1) | - -**Configuration**: - -```bash -# Enable/disable host info masking (default: true) -CONTAINER_MASK_HOST_INFO=true - -# Custom generic hostname (default: sandbox) -CONTAINER_GENERIC_HOSTNAME=sandbox -``` - -**Note**: Kernel version (`/proc/version`) and CPU/memory info (`/proc/cpuinfo`, `/proc/meminfo`) -remain accessible because many libraries depend on them. The hostname and DNS hardening above -addresses the primary concern of revealing cloud provider and internal network details. 
- -### WAN-Only Network Access - -The Code Interpreter API supports an optional WAN-only network mode that allows -execution containers to access the public internet while maintaining strict -isolation from internal networks. - -#### Overview - -When enabled via `ENABLE_WAN_ACCESS=true`, execution containers are connected -to a special Docker network that: - -1. **Allows**: Outbound connections to public internet IPs (all ports) -2. **Blocks**: Access to private IP ranges, Docker host, and other containers - -#### Blocked IP Ranges - -The following ranges are blocked via iptables rules: - -| Range | Description | -|-------|-------------| -| `10.0.0.0/8` | Class A private network | -| `172.16.0.0/12` | Class B private network (includes Docker networks) | -| `192.168.0.0/16` | Class C private network | -| `169.254.0.0/16` | Link-local (includes cloud metadata services) | -| `127.0.0.0/8` | Loopback | -| `224.0.0.0/4` | Multicast | -| `240.0.0.0/4` | Reserved | - -#### Configuration - -```bash -# Enable WAN access (default: false) -ENABLE_WAN_ACCESS=true - -# Custom network name (optional) -WAN_NETWORK_NAME=code-interpreter-wan - -# Custom DNS servers (optional, defaults to Google and Cloudflare DNS) -WAN_DNS_SERVERS=8.8.8.8,1.1.1.1,8.8.4.4 -``` - -#### Security Considerations - -1. **iptables Required**: The API container needs `NET_ADMIN` capability to - manage iptables rules. This is automatically configured in docker-compose.yml. - -2. **Public DNS Only**: Only public DNS servers are used to prevent DNS-based - attacks that could leak internal network information. - -3. **No Inter-Container Communication**: The WAN network has ICC (inter-container - communication) disabled. Containers cannot communicate with each other. - -4. **Cloud Metadata Blocked**: The link-local range (169.254.0.0/16) is blocked, - which includes cloud metadata endpoints (169.254.169.254) used by AWS, GCP, - and Azure. - -5. **IPv4 Only**: The current implementation focuses on IPv4. 
IPv6 would require - separate ip6tables rules. - -6. **Default Off**: WAN access is disabled by default for maximum security. +**Note**: Dangerous patterns generate warnings but don't block execution, as the code runs in isolated nsjail sandboxes. -#### When to Enable WAN Access +#### nsjail Sandbox Isolation -Enable WAN access when: -- Users need to download packages or dependencies (pip, npm, etc.) -- Code needs to fetch data from public APIs -- Web scraping or data collection is required +- **nsjail sandboxes**: All code runs in isolated nsjail sandboxes with namespace separation +- **PID namespace**: Each sandbox has its own PID 1; processes cannot see or signal other sandboxes +- **Mount namespace**: Minimal filesystem with read-only bind mounts for language runtimes +- **Network namespace**: No network access by default +- **Seccomp filtering**: Restricts available system calls +- **Cgroup limits**: Memory, CPU, and PID limits enforced +- **rlimits**: File size, open files, and stack size restricted +- **Non-root execution**: Code runs as uid 1001 (codeuser) -Keep WAN access disabled (default) when: -- Maximum security isolation is required -- All dependencies are pre-installed in container images -- Code should not have any network access +**Note**: The API container requires `SYS_ADMIN` capability for nsjail to create namespaces and cgroups. No Docker socket is mounted. -#### Audit Logging +### Network Isolation -WAN-enabled containers are tracked via labels: -- `com.code-interpreter.wan-access=true` on each container -- Network initialization and iptables rule application are logged at startup +By default, nsjail sandboxes have no network access. Each sandbox runs in its own network namespace with no connectivity. 
### State Persistence Security @@ -224,8 +120,8 @@ Python state persistence introduces additional security considerations: #### Serialization Security -- **Serialization inside containers**: State is serialized within the isolated container, not on the host. The host never unpickles user data. -- **cloudpickle usage**: We use cloudpickle for serialization. While pickle-based formats can execute code during deserialization, this only occurs inside the sandboxed container. +- **Serialization inside sandboxes**: State is serialized within the isolated nsjail sandbox, not on the host. The host never unpickles user data. +- **cloudpickle usage**: We use cloudpickle for serialization. While pickle-based formats can execute code during deserialization, this only occurs inside the sandboxed nsjail environment. - **Compression**: State is compressed with lz4 before storage, providing minor obfuscation and reducing attack surface. - **Base64 encoding**: Final storage uses base64 encoding for safe transport. @@ -339,7 +235,7 @@ If dangerous code patterns are detected: 1. Review the code content in logs 2. Check the session and user context 3. Consider additional code validation rules -4. Monitor container resource usage +4. Monitor sandbox resource usage ### File Upload Issues diff --git a/docs/STATE_PERSISTENCE.md b/docs/STATE_PERSISTENCE.md index 661f36a..d4000d5 100644 --- a/docs/STATE_PERSISTENCE.md +++ b/docs/STATE_PERSISTENCE.md @@ -39,7 +39,7 @@ State persistence uses a hybrid storage architecture: ``` POST /exec {"lang": "py", "code": "x = 42"} - → Container executes code + → Sandbox executes code → REPL server captures namespace: {"x": 42} → Namespace serialized with cloudpickle → Compressed with lz4 (~10x reduction) @@ -372,7 +372,7 @@ During serialization, memory temporarily doubles: - Original object in memory - Serialized copy being created -Ensure containers have sufficient memory for state operations. 
+Ensure sandboxes have sufficient memory for state operations. --- @@ -431,164 +431,6 @@ curl -X GET https://localhost/health/redis \ --- -## Client-Side State API - -Clients can download, cache, and restore state independently. This enables: - -- **Longer state retention**: Cache state client-side beyond 2-hour Redis TTL -- **Reduced server load**: Restore from client cache instead of MinIO archive -- **Offline resilience**: Resume sessions even if server state is lost - -### API Endpoints - -| Endpoint | Method | Description | -| -------------------------- | ------ | --------------------------------- | -| `/state/{session_id}` | GET | Download state as raw lz4 binary | -| `/state/{session_id}` | POST | Upload state as raw lz4 binary | -| `/state/{session_id}/info` | GET | Get state metadata | -| `/state/{session_id}` | DELETE | Delete state (always returns 204) | - -### ExecResponse State Fields - -Python executions return additional state fields: - -```json -{ - "session_id": "abc123...", - "stdout": "...", - "has_state": true, - "state_size": 1234, - "state_hash": "sha256..." -} -``` - -| Field | Type | Description | -| ------------ | ------ | ----------------------------------------------- | -| `has_state` | `bool` | True when execution produced serializable state | -| `state_size` | `int` | Size of compressed state in bytes | -| `state_hash` | `str` | SHA256 hash for change detection | - -### Downloading State - -```bash -# Download state for client-side caching -curl -X GET https://localhost/state/{session_id} \ - -H "x-api-key: $API_KEY" \ - -o state.bin - -# Response: Raw lz4 binary with ETag header -# ETag: "sha256hash..." 
-``` - -### Checking State Existence - -```bash -curl -X GET https://localhost/state/{session_id}/info \ - -H "x-api-key: $API_KEY" - -# Response: -{ - "exists": true, - "session_id": "abc123...", - "size_bytes": 1234, - "hash": "sha256...", - "created_at": "2024-01-01T12:00:00Z", - "expires_at": "2024-01-01T14:00:00Z", - "source": "redis" // or "archive" -} -``` - -### Uploading State (Restore from Client Cache) - -```bash -# Upload cached state before execution -curl -X POST https://localhost/state/{session_id} \ - -H "x-api-key: $API_KEY" \ - -H "Content-Type: application/octet-stream" \ - --data-binary @state.bin - -# Response: 201 Created -{ - "message": "state_uploaded", - "size": 1234 -} -``` - -### ETag Caching - -Use `If-None-Match` to avoid unnecessary downloads: - -```bash -# Check if state changed since last download -curl -X GET https://localhost/state/{session_id} \ - -H "x-api-key: $API_KEY" \ - -H "If-None-Match: \"sha256hash...\"" - -# Response: 304 Not Modified (if unchanged) -``` - -### Client-Side Caching Workflow - -``` -1. Execute code → Response includes has_state, state_hash -2. Download state → GET /state/{session_id} → Cache locally -3. Before next execution: - a. Check server → GET /state/{session_id}/info - b. If exists=false AND client has cached state: - Upload state → POST /state/{session_id} - c. Execute → POST /exec with session_id -4. State is restored, execution continues -``` - -### Upload Priority - -When a client uploads state, it takes priority over server-side state for the next 30 seconds. This ensures the client's cached state is used even if server had stale data. 
- -### Error Responses - -| Status | Error | Description | -| ------ | ----------------- | ----------------------------------------------- | -| 400 | `invalid_state` | State format invalid (wrong version, too short) | -| 404 | `state_not_found` | No state exists for session | -| 413 | `state_too_large` | State exceeds 50MB limit | - -### Example: Full Restore Flow - -```bash -# 1. Execute and create state -RESPONSE=$(curl -sk -X POST https://localhost/exec \ - -H "x-api-key: $API_KEY" \ - -d '{"lang": "py", "code": "secret = 42"}') -SESSION_ID=$(echo $RESPONSE | jq -r '.session_id') - -# 2. Download state for caching -curl -sk -X GET "https://localhost/state/$SESSION_ID" \ - -H "x-api-key: $API_KEY" -o /tmp/state.bin - -# ... time passes, Redis TTL expires ... - -# 3. Check if state exists -INFO=$(curl -sk -X GET "https://localhost/state/$SESSION_ID/info" \ - -H "x-api-key: $API_KEY") -EXISTS=$(echo $INFO | jq -r '.exists') - -# 4. Restore if needed -if [ "$EXISTS" = "false" ]; then - curl -sk -X POST "https://localhost/state/$SESSION_ID" \ - -H "x-api-key: $API_KEY" \ - -H "Content-Type: application/octet-stream" \ - --data-binary @/tmp/state.bin -fi - -# 5. Execute with restored state -curl -sk -X POST https://localhost/exec \ - -H "x-api-key: $API_KEY" \ - -d "{\"lang\": \"py\", \"code\": \"print(secret)\", \"session_id\": \"$SESSION_ID\"}" -# Output: 42 -``` - ---- - ## Related Documentation - [CONFIGURATION.md](CONFIGURATION.md) - All configuration options diff --git a/docs/TESTING.md b/docs/TESTING.md index ecb9eff..f7a02d1 100644 --- a/docs/TESTING.md +++ b/docs/TESTING.md @@ -13,7 +13,7 @@ tests/ │ ├── test_execution_service.py │ ├── test_session_service.py │ └── ... 
-├── integration/ # Integration tests (require Docker, Redis, MinIO) +├── integration/ # Integration tests (require running API, Redis, MinIO) │ ├── test_api_contracts.py │ ├── test_librechat_compat.py │ ├── test_container_behavior.py @@ -28,7 +28,7 @@ tests/ Unit tests validate individual components in isolation: -- Mock external dependencies (Docker, Redis, MinIO) +- Mock external dependencies (Redis, MinIO, sandbox executor) - Fast execution (~seconds) - No infrastructure required @@ -36,10 +36,10 @@ Unit tests validate individual components in isolation: Integration tests validate end-to-end behavior: -- Require running Docker, Redis, MinIO +- Require running API, Redis, MinIO - Test actual API endpoints - Validate LibreChat compatibility -- Test container behavior and cleanup +- Test sandbox behavior and cleanup --- @@ -64,7 +64,7 @@ Before running tests, ensure: 3. **For integration tests, infrastructure running:** ```bash - docker-compose up -d + docker compose up -d ``` ### Running All Tests @@ -144,13 +144,13 @@ Ensures compatibility with LibreChat's Code Interpreter API: - File reference format - Response structure matching LibreChat expectations -### Container Behavior Tests +### Sandbox Behavior Tests **File:** `tests/integration/test_container_behavior.py` -Tests container lifecycle and execution: +Tests sandbox lifecycle and execution: -- Container creation and cleanup +- Sandbox creation and cleanup - Resource limit enforcement - Timeout handling - Output capture @@ -234,10 +234,9 @@ For unit tests, mock external dependencies: from unittest.mock import AsyncMock, patch @pytest.mark.asyncio -async def test_execution_with_mocked_docker(): - with patch("src.services.container.client.docker_client") as mock_docker: - mock_container = AsyncMock() - mock_docker.containers.run.return_value = mock_container +async def test_execution_with_mocked_sandbox(): + with patch("src.services.sandbox.executor.SandboxExecutor") as mock_executor: + 
mock_executor.execute.return_value = ("output", "", 0) # Test code here ``` @@ -372,7 +371,7 @@ pytest --cov=src --cov-report=xml tests/ 1. **Check infrastructure:** ```bash - docker-compose ps # All services should be "Up" + docker compose ps # All services should be "Up" ``` 2. **Check API health:** @@ -383,7 +382,7 @@ pytest --cov=src --cov-report=xml tests/ 3. **Check logs:** ```bash - docker-compose logs api + docker compose logs api ``` ### Async Test Issues @@ -398,7 +397,7 @@ If async tests hang: For tests that occasionally fail: -- Check for race conditions in container cleanup +- Check for race conditions in sandbox cleanup - Ensure proper test isolation - Use explicit waits for async operations diff --git a/mypy.ini b/mypy.ini index b9fd408..cc67dfa 100644 --- a/mypy.ini +++ b/mypy.ini @@ -11,14 +11,6 @@ disallow_untyped_defs = False # Ignore noisy error codes that aren't real bugs disable_error_code = no-any-return, import-untyped -# Per-module overrides for problematic external dependencies -[mypy-docker.*] -ignore_errors = True - -# Modules that require interface updates (tracked for future refactoring) -[mypy-src.services.container.*] -ignore_errors = True - [mypy-src.services.health] ignore_errors = True diff --git a/requirements.txt b/requirements.txt index 920388b..f6d85fa 100644 --- a/requirements.txt +++ b/requirements.txt @@ -19,10 +19,6 @@ aiosqlite>=0.19.0 # MinIO/S3 client minio==7.2.20 -# Docker client for container management -docker==7.1.0 -requests-unixsocket==0.4.1 - # Date/time parsing utilities python-dateutil==2.9.0.post0 diff --git a/scripts/admin_cli.py b/scripts/admin_cli.py index 955fcf5..5cbb127 100644 --- a/scripts/admin_cli.py +++ b/scripts/admin_cli.py @@ -11,7 +11,7 @@ - Real-time metrics dashboard - API key management (create, list, revoke, update) - Per-language and per-API-key usage stats - - Container pool monitoring + - Sandbox pool monitoring """ import argparse @@ -39,7 +39,7 @@ from src.config import settings from 
src.core.pool import redis_pool -from src.services.detailed_metrics import DetailedMetricsService +from src.services.metrics import metrics_service from src.services.api_key_manager import ApiKeyManagerService from src.models.api_key import RateLimits @@ -62,10 +62,10 @@ async def get_redis(): return redis_client -async def get_metrics_service() -> DetailedMetricsService: - """Get metrics service instance.""" - redis_client = await get_redis() - return DetailedMetricsService(redis_client) +async def ensure_metrics_started(): + """Ensure metrics service is running.""" + if not metrics_service._running: + await metrics_service.start() async def get_key_manager() -> ApiKeyManagerService: @@ -119,121 +119,95 @@ def format_limit(value: Optional[int]) -> str: # Metrics Panels # ============================================================================ -async def build_summary_panel(service: DetailedMetricsService) -> Panel: +async def build_summary_panel(hours: int = 24) -> Panel: """Build summary panel.""" - summary = await service.get_summary() + now = datetime.now(timezone.utc) + start = now - timedelta(hours=hours) + summary = await metrics_service.get_summary_stats(start=start, end=now) table = Table(show_header=False, box=None, padding=(0, 2)) table.add_column("Metric", style="cyan") table.add_column("Value", style="white") - table.add_row("Total Executions", str(summary.total_executions)) - table.add_row("Today (24h)", str(summary.total_executions_today)) - table.add_row("This Hour", str(summary.total_executions_hour)) - table.add_row("", "") - table.add_row("Success Rate", format_rate(summary.success_rate)) - table.add_row("Avg Exec Time", format_duration(summary.avg_execution_time_ms)) - table.add_row("Pool Hit Rate", format_rate(summary.pool_hit_rate)) + table.add_row("Total Executions", str(summary.get("total_executions", 0))) + table.add_row("Success Rate", format_rate(summary.get("success_rate", 0))) + table.add_row("Avg Exec Time", 
format_duration(summary.get("avg_execution_time_ms", 0))) + table.add_row("Pool Hit Rate", format_rate(summary.get("pool_hit_rate", 0))) + table.add_row("Active API Keys", str(summary.get("active_api_keys", 0))) - return Panel(table, title="[bold]Metrics Summary[/bold]", border_style="blue") + return Panel(table, title=f"[bold]Metrics Summary[/bold] (last {hours}h)", border_style="blue") -async def build_languages_table(service: DetailedMetricsService, hours: int = 24) -> Table: +async def build_languages_table(hours: int = 24) -> Table: """Build languages table.""" - language_stats = await service.get_language_stats(hours=hours) + now = datetime.now(timezone.utc) + start = now - timedelta(hours=hours) + lang_data = await metrics_service.get_language_usage(start=start, end=now) + by_language = lang_data.get("by_language", {}) table = Table(title=f"Language Metrics (last {hours}h)", box=box.ROUNDED) table.add_column("Language", style="cyan", justify="center") table.add_column("Executions", justify="right") - table.add_column("Success", justify="right", style="green") - table.add_column("Failures", justify="right", style="red") - table.add_column("Avg Time", justify="right") - table.add_column("Error Rate", justify="right") - sorted_languages = sorted( - language_stats.values(), - key=lambda x: x.execution_count, - reverse=True - ) + sorted_langs = sorted(by_language.items(), key=lambda x: x[1], reverse=True) - for lang in sorted_languages: - table.add_row( - lang.language.upper(), - str(lang.execution_count), - str(lang.success_count), - str(lang.failure_count), - format_duration(lang.avg_execution_time_ms), - format_error_rate(lang.error_rate) - ) + for lang, count in sorted_langs: + table.add_row(lang.upper(), str(count)) - if sorted_languages: - total_exec = sum(l.execution_count for l in sorted_languages) - total_success = sum(l.success_count for l in sorted_languages) - total_fail = sum(l.failure_count for l in sorted_languages) - overall_rate = (total_fail / 
total_exec * 100) if total_exec > 0 else 0 + if sorted_langs: + total = sum(count for _, count in sorted_langs) + table.add_row("", "", style="dim") + table.add_row("[bold]TOTAL[/bold]", f"[bold]{total}[/bold]") - table.add_row("", "", "", "", "", "", style="dim") - table.add_row( - "[bold]TOTAL[/bold]", - f"[bold]{total_exec}[/bold]", - f"[bold]{total_success}[/bold]", - f"[bold]{total_fail}[/bold]", - "", - format_error_rate(overall_rate) - ) + if not sorted_langs: + table.add_row("[dim]No data[/dim]", "") return table -async def build_pool_panel(service: DetailedMetricsService) -> Panel: +async def build_pool_panel() -> Panel: """Build pool stats panel.""" - pool_stats = await service.get_pool_stats() + pool_stats = metrics_service.get_pool_stats() table = Table(show_header=False, box=None, padding=(0, 2)) table.add_column("Metric", style="cyan") table.add_column("Value", style="white") - table.add_row("Total Acquisitions", str(pool_stats.total_acquisitions)) - table.add_row("Pool Hits", Text(str(pool_stats.pool_hits), style="green")) - table.add_row("Pool Misses", Text(str(pool_stats.pool_misses), style="yellow")) - table.add_row("Hit Rate", format_rate(pool_stats.hit_rate)) - table.add_row("Avg Acquire Time", format_duration(pool_stats.avg_acquire_time_ms)) - table.add_row("Exhaustion Events", Text(str(pool_stats.exhaustion_events), - style="red" if pool_stats.exhaustion_events > 0 else "green")) + table.add_row("Total Acquisitions", str(pool_stats["total_acquisitions"])) + table.add_row("Pool Hits", Text(str(pool_stats["pool_hits"]), style="green")) + table.add_row("Pool Misses", Text(str(pool_stats["pool_misses"]), style="yellow")) + table.add_row("Hit Rate", format_rate(pool_stats["hit_rate"])) + table.add_row("Avg Acquire Time", format_duration(pool_stats["avg_acquire_time_ms"])) + table.add_row("Exhaustion Events", Text(str(pool_stats["exhaustion_events"]), + style="red" if pool_stats["exhaustion_events"] > 0 else "green")) - return Panel(table, 
title="[bold]Container Pool[/bold]", border_style="magenta") + return Panel(table, title="[bold]Sandbox Pool[/bold]", border_style="magenta") -async def build_hourly_table(service: DetailedMetricsService, hours: int = 12) -> Table: - """Build hourly breakdown table.""" +async def build_hourly_table(hours: int = 12) -> Table: + """Build hourly breakdown table using time series from SQLite.""" + now = datetime.now(timezone.utc) + start = now - timedelta(hours=hours) + data = await metrics_service.get_time_series( + start=start, end=now, granularity="hour" + ) + table = Table(title=f"Hourly Breakdown (last {hours}h)", box=box.ROUNDED) table.add_column("Hour", style="dim") table.add_column("Executions", justify="right") - table.add_column("Success", justify="right", style="green") - table.add_column("Failures", justify="right", style="red") - table.add_column("Timeouts", justify="right", style="yellow") + table.add_column("Success Rate", justify="right") table.add_column("Avg Time", justify="right") - now = datetime.now(timezone.utc) - - for i in range(hours): - hour = now - timedelta(hours=i) - metrics = await service.get_hourly_metrics(hour) + for i, ts in enumerate(data.get("timestamps", [])): + table.add_row( + ts, + str(data["executions"][i]), + format_rate(data["success_rate"][i]), + format_duration(data["avg_duration"][i]), + ) - if metrics: - table.add_row( - hour.strftime('%m-%d %H:00'), - str(metrics.execution_count), - str(metrics.success_count), - str(metrics.failure_count), - str(metrics.timeout_count), - format_duration(metrics.avg_execution_time_ms) - ) - else: - table.add_row( - hour.strftime('%m-%d %H:00'), - "[dim]0[/dim]", "[dim]0[/dim]", "[dim]0[/dim]", "[dim]0[/dim]", "[dim]-[/dim]" - ) + if not data.get("timestamps"): + table.add_row("[dim]No data[/dim]", "", "", "") return table @@ -327,7 +301,7 @@ async def build_key_detail_panel(manager: ApiKeyManagerService, key_hash: str) - async def metrics_menu(): """Metrics sub-menu.""" - service = await 
get_metrics_service() + await ensure_metrics_started() while True: console.clear() @@ -338,17 +312,19 @@ async def metrics_menu(): console.print() # Quick stats - summary = await service.get_summary() - console.print(f" [cyan]Today:[/cyan] {summary.total_executions_today} executions " - f"[cyan]Success:[/cyan] {summary.success_rate:.1f}% " - f"[cyan]Avg:[/cyan] {format_duration(summary.avg_execution_time_ms)}") + now = datetime.now(timezone.utc) + start = now - timedelta(hours=24) + summary = await metrics_service.get_summary_stats(start=start, end=now) + console.print(f" [cyan]Today:[/cyan] {summary.get('total_executions', 0)} executions " + f"[cyan]Success:[/cyan] {summary.get('success_rate', 0):.1f}% " + f"[cyan]Avg:[/cyan] {format_duration(summary.get('avg_execution_time_ms', 0))}") console.print() console.print("[bold]Options:[/bold]") console.print(" [cyan]1[/cyan] Summary") console.print(" [cyan]2[/cyan] Language breakdown") console.print(" [cyan]3[/cyan] Hourly breakdown") - console.print(" [cyan]4[/cyan] Container pool stats") + console.print(" [cyan]4[/cyan] Sandbox pool stats") console.print(" [cyan]5[/cyan] Live dashboard (auto-refresh)") console.print(" [cyan]b[/cyan] Back to main menu") console.print() @@ -358,30 +334,30 @@ async def metrics_menu(): if choice == "b": break elif choice == "1": - panel = await build_summary_panel(service) + panel = await build_summary_panel() console.print() console.print(panel) elif choice == "2": - table = await build_languages_table(service, 24) + table = await build_languages_table(24) console.print() console.print(table) elif choice == "3": - table = await build_hourly_table(service, 12) + table = await build_hourly_table(12) console.print() console.print(table) elif choice == "4": - panel = await build_pool_panel(service) + panel = await build_pool_panel() console.print() console.print(panel) elif choice == "5": - await live_dashboard(service) + await live_dashboard() continue console.print() 
Prompt.ask("[dim]Press Enter to continue[/dim]", default="") -async def live_dashboard(service: DetailedMetricsService): +async def live_dashboard(): """Auto-refresh dashboard.""" console.print("\n[bold cyan]Live Dashboard[/bold cyan] [dim](Ctrl+C to exit)[/dim]\n") @@ -395,12 +371,12 @@ async def live_dashboard(service: DetailedMetricsService): )) console.print() - summary_panel = await build_summary_panel(service) - pool_panel = await build_pool_panel(service) + summary_panel = await build_summary_panel() + pool_panel = await build_pool_panel() console.print(Columns([summary_panel, pool_panel], equal=True, expand=True)) console.print() - lang_table = await build_languages_table(service, 24) + lang_table = await build_languages_table(24) console.print(lang_table) console.print() @@ -633,16 +609,19 @@ async def main_menu(): # Quick stats try: - service = await get_metrics_service() - summary = await service.get_summary() + await ensure_metrics_started() + now = datetime.now(timezone.utc) + start = now - timedelta(hours=24) + summary = await metrics_service.get_summary_stats(start=start, end=now) manager = await get_key_manager() keys = await manager.list_keys() + success_rate = summary.get("success_rate", 0) stats_table = Table(show_header=False, box=None) stats_table.add_column("", style="dim") stats_table.add_column("") - stats_table.add_row("Executions (24h):", f"[cyan]{summary.total_executions_today}[/cyan]") - stats_table.add_row("Success Rate:", f"[{'green' if summary.success_rate >= 80 else 'yellow'}]{summary.success_rate:.1f}%[/]") + stats_table.add_row("Executions (24h):", f"[cyan]{summary.get('total_executions', 0)}[/cyan]") + stats_table.add_row("Success Rate:", f"[{'green' if success_rate >= 80 else 'yellow'}]{success_rate:.1f}%[/]") stats_table.add_row("API Keys:", f"[cyan]{len(keys)}[/cyan] active") console.print(stats_table) @@ -673,8 +652,8 @@ async def main_menu(): console.print() Prompt.ask("[dim]Press Enter to continue[/dim]", default="") elif 
choice == "4": - service = await get_metrics_service() - await live_dashboard(service) + await ensure_metrics_started() + await live_dashboard() # ============================================================================ @@ -749,8 +728,8 @@ async def cmd_revoke(args): async def cmd_summary(args): """Show quick summary.""" - service = await get_metrics_service() - panel = await build_summary_panel(service) + await ensure_metrics_started() + panel = await build_summary_panel() console.print() console.print(panel) console.print() diff --git a/scripts/cleanup-containers.sh b/scripts/cleanup-containers.sh deleted file mode 100755 index 69837a8..0000000 --- a/scripts/cleanup-containers.sh +++ /dev/null @@ -1,28 +0,0 @@ -#!/bin/bash -# Cleanup orphaned pooled containers created by Code Interpreter API -# -# Run this after docker compose down if pooled containers remain: -# ./scripts/cleanup-containers.sh -# -# Or combine with compose down: -# docker compose down && ./scripts/cleanup-containers.sh - -set -e - -LABEL="com.code-interpreter.managed=true" - -# Find containers with our label -CONTAINERS=$(docker ps -aq --filter "label=$LABEL" 2>/dev/null || true) - -if [ -z "$CONTAINERS" ]; then - echo "No orphaned code-interpreter containers found." - exit 0 -fi - -COUNT=$(echo "$CONTAINERS" | wc -l) -echo "Found $COUNT orphaned container(s) with label '$LABEL'" - -# Remove them -echo "$CONTAINERS" | xargs docker rm -f - -echo "Cleanup complete." 
diff --git a/scripts/load_test/client.py b/scripts/load_test/client.py index 479b70a..483ac8d 100644 --- a/scripts/load_test/client.py +++ b/scripts/load_test/client.py @@ -201,7 +201,7 @@ async def get_metrics(self) -> Dict[str, Any]: return {"error": str(e)} async def get_pool_metrics(self) -> Dict[str, Any]: - """Get container pool metrics.""" + """Get sandbox pool metrics.""" if self._session is None: await self.start() diff --git a/scripts/load_test/config.py b/scripts/load_test/config.py index 3f5113b..3e4bca4 100644 --- a/scripts/load_test/config.py +++ b/scripts/load_test/config.py @@ -162,7 +162,7 @@ class LoadTestDefaults: warmup_requests: int = 10 timeout_seconds: int = 60 monitor_interval_seconds: float = 1.0 - enable_docker_stats: bool = True + enable_sandbox_stats: bool = True output_dir: str = "./load_test_results" # Thresholds diff --git a/scripts/load_test/models.py b/scripts/load_test/models.py index 60477f8..a95c736 100644 --- a/scripts/load_test/models.py +++ b/scripts/load_test/models.py @@ -67,8 +67,8 @@ def to_dict(self) -> Dict[str, Any]: @dataclass -class DockerStats: - """Docker container resource statistics.""" +class SandboxStats: + """Sandbox process resource statistics.""" container_count: int = 0 total_cpu_percent: float = 0.0 @@ -97,7 +97,7 @@ class ConcurrencyTestResult: errors: Dict[str, int] = field(default_factory=dict) duration_seconds: float = 0.0 system_metrics: SystemMetrics = field(default_factory=SystemMetrics) - docker_stats: Optional[DockerStats] = None + sandbox_stats: Optional[SandboxStats] = None @property def success_rate(self) -> float: @@ -179,8 +179,8 @@ def to_dict(self) -> Dict[str, Any]: "errors": self.errors, "system_metrics": self.system_metrics.to_dict(), } - if self.docker_stats: - result["docker_stats"] = self.docker_stats.to_dict() + if self.sandbox_stats: + result["sandbox_stats"] = self.sandbox_stats.to_dict() return result @@ -288,7 +288,7 @@ class LoadTestConfig: warmup_requests: int = 10 
timeout_seconds: int = 60 monitor_interval_seconds: float = 1.0 - enable_docker_stats: bool = True + enable_sandbox_stats: bool = True output_dir: str = "./load_test_results" def to_dict(self) -> Dict[str, Any]: @@ -302,7 +302,7 @@ def to_dict(self) -> Dict[str, Any]: "warmup_requests": self.warmup_requests, "timeout_seconds": self.timeout_seconds, "monitor_interval_seconds": self.monitor_interval_seconds, - "enable_docker_stats": self.enable_docker_stats, + "enable_sandbox_stats": self.enable_sandbox_stats, "output_dir": self.output_dir, } diff --git a/scripts/load_test/monitor.py b/scripts/load_test/monitor.py index 6c70cca..8d7e7b3 100644 --- a/scripts/load_test/monitor.py +++ b/scripts/load_test/monitor.py @@ -11,7 +11,7 @@ except ImportError: PSUTIL_AVAILABLE = False -from .models import DockerStats, SystemMetrics +from .models import SandboxStats, SystemMetrics class ResourceMonitor: @@ -20,14 +20,14 @@ class ResourceMonitor: def __init__( self, sample_interval: float = 1.0, - enable_docker_stats: bool = True, + enable_sandbox_stats: bool = True, ): self.sample_interval = sample_interval - self.enable_docker_stats = enable_docker_stats + self.enable_sandbox_stats = enable_sandbox_stats self._running = False self._task: Optional[asyncio.Task] = None self._samples: List[SystemMetrics] = [] - self._docker_samples: List[DockerStats] = [] + self._sandbox_samples: List[SandboxStats] = [] self._initial_disk_io: Optional[tuple] = None self._initial_net_io: Optional[tuple] = None @@ -38,7 +38,7 @@ async def start(self) -> None: self._running = True self._samples = [] - self._docker_samples = [] + self._sandbox_samples = [] # Capture initial I/O counters if PSUTIL_AVAILABLE: @@ -71,10 +71,10 @@ async def _sampling_loop(self) -> None: metrics = self._get_current_system_metrics() self._samples.append(metrics) - if self.enable_docker_stats: - docker_stats = await self._get_docker_stats() - if docker_stats: - self._docker_samples.append(docker_stats) + if 
self.enable_sandbox_stats: + sandbox_stats = self._get_sandbox_stats() + if sandbox_stats: + self._sandbox_samples.append(sandbox_stats) await asyncio.sleep(self.sample_interval) except asyncio.CancelledError: @@ -121,81 +121,44 @@ def _get_current_system_metrics(self) -> SystemMetrics: network_recv_mb=net_recv_mb, ) - async def _get_docker_stats(self) -> Optional[DockerStats]: - """Get Docker container statistics.""" - try: - # Run docker stats --no-stream in subprocess - proc = await asyncio.create_subprocess_exec( - "docker", "stats", "--no-stream", "--format", - '{"name":"{{.Name}}","cpu":"{{.CPUPerc}}","mem":"{{.MemUsage}}","mem_perc":"{{.MemPerc}}"}', - stdout=asyncio.subprocess.PIPE, - stderr=asyncio.subprocess.PIPE, - ) - stdout, _ = await asyncio.wait_for(proc.communicate(), timeout=5.0) - - if proc.returncode != 0: - return None + def _get_sandbox_stats(self) -> Optional[SandboxStats]: + """Get nsjail sandbox process statistics.""" + if not PSUTIL_AVAILABLE: + return None - containers = [] + try: + nsjail_procs = [] total_cpu = 0.0 total_memory_mb = 0.0 - for line in stdout.decode().strip().split("\n"): - if not line: - continue + for proc in psutil.process_iter(["pid", "name", "cpu_percent", "memory_info"]): try: - container = json.loads(line) - # Parse CPU percentage (e.g., "0.50%") - cpu_str = container.get("cpu", "0%").replace("%", "") - cpu = float(cpu_str) if cpu_str else 0.0 - total_cpu += cpu - - # Parse memory usage (e.g., "50MiB / 1GiB") - mem_str = container.get("mem", "0MiB / 0GiB").split("/")[0].strip() - mem_mb = self._parse_memory_string(mem_str) - total_memory_mb += mem_mb - - containers.append({ - "name": container.get("name", "unknown"), - "cpu_percent": cpu, - "memory_mb": mem_mb, - }) - except (json.JSONDecodeError, ValueError): + if proc.info["name"] and "nsjail" in proc.info["name"]: + cpu = proc.info.get("cpu_percent", 0.0) or 0.0 + mem_info = proc.info.get("memory_info") + mem_mb = mem_info.rss / (1024 * 1024) if mem_info else 0.0 
+ + total_cpu += cpu + total_memory_mb += mem_mb + + nsjail_procs.append({ + "name": f"nsjail-{proc.info['pid']}", + "cpu_percent": cpu, + "memory_mb": mem_mb, + }) + except (psutil.NoSuchProcess, psutil.AccessDenied): continue - # Filter for code-interpreter containers only - code_interpreter_containers = [ - c for c in containers - if "code-interpreter" in c["name"].lower() or "executor" in c["name"].lower() - ] - - return DockerStats( - container_count=len(code_interpreter_containers), - total_cpu_percent=sum(c["cpu_percent"] for c in code_interpreter_containers), - total_memory_mb=sum(c["memory_mb"] for c in code_interpreter_containers), - containers=code_interpreter_containers, + return SandboxStats( + container_count=len(nsjail_procs), + total_cpu_percent=total_cpu, + total_memory_mb=total_memory_mb, + containers=nsjail_procs, ) - except (asyncio.TimeoutError, FileNotFoundError): - return None except Exception: return None - def _parse_memory_string(self, mem_str: str) -> float: - """Parse memory string like '50MiB' or '1.5GiB' to MB.""" - mem_str = mem_str.strip().upper() - try: - if "GIB" in mem_str or "GB" in mem_str: - return float(mem_str.replace("GIB", "").replace("GB", "")) * 1024 - elif "MIB" in mem_str or "MB" in mem_str: - return float(mem_str.replace("MIB", "").replace("MB", "")) - elif "KIB" in mem_str or "KB" in mem_str: - return float(mem_str.replace("KIB", "").replace("KB", "")) / 1024 - else: - return float(mem_str) - except ValueError: - return 0.0 - def _aggregate_metrics(self) -> SystemMetrics: """Aggregate all samples into summary metrics.""" if not self._samples: @@ -221,21 +184,21 @@ def _aggregate_metrics(self) -> SystemMetrics: network_recv_mb=last_sample.network_recv_mb, ) - def get_docker_summary(self) -> Optional[DockerStats]: - """Get aggregated Docker statistics.""" - if not self._docker_samples: + def get_sandbox_summary(self) -> Optional[SandboxStats]: + """Get aggregated sandbox statistics.""" + if not self._sandbox_samples: 
return None # Calculate averages - container_counts = [s.container_count for s in self._docker_samples] - cpu_totals = [s.total_cpu_percent for s in self._docker_samples] - memory_totals = [s.total_memory_mb for s in self._docker_samples] + container_counts = [s.container_count for s in self._sandbox_samples] + cpu_totals = [s.total_cpu_percent for s in self._sandbox_samples] + memory_totals = [s.total_memory_mb for s in self._sandbox_samples] - return DockerStats( + return SandboxStats( container_count=max(container_counts) if container_counts else 0, total_cpu_percent=sum(cpu_totals) / len(cpu_totals) if cpu_totals else 0.0, total_memory_mb=max(memory_totals) if memory_totals else 0.0, - containers=[], # Don't include individual container details in summary + containers=[], # Don't include individual details in summary ) def get_current_metrics(self) -> SystemMetrics: @@ -243,19 +206,18 @@ def get_current_metrics(self) -> SystemMetrics: return self._get_current_system_metrics() -async def get_docker_container_count() -> int: - """Get count of running Docker containers.""" - try: - proc = await asyncio.create_subprocess_exec( - "docker", "ps", "-q", - stdout=asyncio.subprocess.PIPE, - stderr=asyncio.subprocess.PIPE, - ) - stdout, _ = await asyncio.wait_for(proc.communicate(), timeout=5.0) - - if proc.returncode == 0: - lines = stdout.decode().strip().split("\n") - return len([l for l in lines if l]) +async def get_sandbox_process_count() -> int: + """Get count of running nsjail sandbox processes.""" + if not PSUTIL_AVAILABLE: return 0 + try: + count = 0 + for proc in psutil.process_iter(["name"]): + try: + if proc.info["name"] and "nsjail" in proc.info["name"]: + count += 1 + except (psutil.NoSuchProcess, psutil.AccessDenied): + continue + return count except Exception: return 0 diff --git a/scripts/load_test/runner.py b/scripts/load_test/runner.py index aed3946..605658f 100644 --- a/scripts/load_test/runner.py +++ b/scripts/load_test/runner.py @@ -37,7 +37,7 @@ 
def __init__( ) self.monitor = ResourceMonitor( sample_interval=config.monitor_interval_seconds, - enable_docker_stats=config.enable_docker_stats, + enable_sandbox_stats=config.enable_sandbox_stats, ) self.progress_callback = progress_callback or (lambda x: None) @@ -95,7 +95,7 @@ async def bounded_request(iteration: int) -> ExecutionResult: # Stop monitoring end_time = time.perf_counter() system_metrics = await self.monitor.stop() - docker_stats = self.monitor.get_docker_summary() + sandbox_stats = self.monitor.get_sandbox_summary() # Process results for exec_result in execution_results: @@ -114,7 +114,7 @@ async def bounded_request(iteration: int) -> ExecutionResult: result.duration_seconds = end_time - start_time result.system_metrics = system_metrics - result.docker_stats = docker_stats + result.sandbox_stats = sandbox_stats return result @@ -294,7 +294,7 @@ async def run_full_suite( # when run_scenario_at_concurrency() calls start()/stop() on self.monitor overall_monitor = ResourceMonitor( sample_interval=self.config.monitor_interval_seconds, - enable_docker_stats=self.config.enable_docker_stats, + enable_sandbox_stats=self.config.enable_sandbox_stats, ) await overall_monitor.start() diff --git a/scripts/metrics_cli.py b/scripts/metrics_cli.py index fc6e954..d2d4caf 100644 --- a/scripts/metrics_cli.py +++ b/scripts/metrics_cli.py @@ -11,8 +11,7 @@ (no args) Interactive menu summary Show metrics summary languages Per-language breakdown - api-keys Per-API-key usage - pool Container pool stats + pool Sandbox pool stats watch Auto-refresh dashboard """ @@ -34,29 +33,19 @@ from rich.console import Console from rich.table import Table from rich.panel import Panel -from rich.layout import Layout -from rich.live import Live from rich.text import Text -from rich.progress import Progress, SpinnerColumn, TextColumn -from rich.prompt import Prompt, IntPrompt +from rich.prompt import Prompt from rich import box -from src.core.pool import redis_pool -from 
src.services.detailed_metrics import DetailedMetricsService +from src.services.metrics import metrics_service console = Console() -async def get_metrics_service() -> DetailedMetricsService: - """Get metrics service instance.""" - redis_client = redis_pool.get_client() - try: - await redis_client.ping() - except Exception as e: - console.print(f"[red]Error:[/red] Cannot connect to Redis: {e}") - console.print("\nEnsure Redis is running and REDIS_URL/REDIS_HOST is configured correctly.") - sys.exit(1) - return DetailedMetricsService(redis_client) +async def ensure_started(): + """Start the metrics service if not already running.""" + if not metrics_service._running: + await metrics_service.start() def format_duration(ms: float) -> str: @@ -80,194 +69,72 @@ def format_rate(rate: float, good_threshold: float = 80, bad_threshold: float = return Text(text, style="red") -def format_error_rate(rate: float) -> Text: - """Format error rate (lower is better).""" - text = f"{rate:.1f}%" - if rate <= 5: - return Text(text, style="green") - elif rate <= 20: - return Text(text, style="yellow") - else: - return Text(text, style="red") - - -async def build_summary_panel(service: DetailedMetricsService) -> Panel: - """Build summary panel.""" - summary = await service.get_summary() +async def build_summary_panel(hours: int = 24) -> Panel: + """Build summary panel from SQLite data.""" + now = datetime.now(timezone.utc) + start = now - timedelta(hours=hours) + summary = await metrics_service.get_summary_stats(start=start, end=now) table = Table(show_header=False, box=None, padding=(0, 2)) table.add_column("Metric", style="cyan") table.add_column("Value", style="white") - table.add_row("Total Executions", str(summary.total_executions)) - table.add_row("Today (24h)", str(summary.total_executions_today)) - table.add_row("This Hour", str(summary.total_executions_hour)) - table.add_row("", "") - table.add_row("Success Rate", format_rate(summary.success_rate)) - table.add_row("Avg Exec 
Time", format_duration(summary.avg_execution_time_ms)) - table.add_row("Active API Keys", str(summary.active_api_keys)) - table.add_row("Pool Hit Rate", format_rate(summary.pool_hit_rate)) + table.add_row("Total Executions", str(summary.get("total_executions", 0))) + table.add_row("Success Rate", format_rate(summary.get("success_rate", 0))) + table.add_row("Avg Exec Time", format_duration(summary.get("avg_execution_time_ms", 0))) + table.add_row("Pool Hit Rate", format_rate(summary.get("pool_hit_rate", 0))) + table.add_row("Active API Keys", str(summary.get("active_api_keys", 0))) - return Panel(table, title="[bold]Summary[/bold] (last 24h)", border_style="blue") + return Panel(table, title=f"[bold]Summary[/bold] (last {hours}h)", border_style="blue") -async def build_languages_table(service: DetailedMetricsService, hours: int = 24) -> Table: - """Build languages table.""" - language_stats = await service.get_language_stats(hours=hours) +async def build_languages_table(hours: int = 24) -> Table: + """Build languages table from SQLite data.""" + now = datetime.now(timezone.utc) + start = now - timedelta(hours=hours) + lang_data = await metrics_service.get_language_usage(start=start, end=now) + by_language = lang_data.get("by_language", {}) table = Table(title=f"Language Metrics (last {hours}h)", box=box.ROUNDED) table.add_column("Language", style="cyan", justify="center") table.add_column("Executions", justify="right") - table.add_column("Success", justify="right", style="green") - table.add_column("Failures", justify="right", style="red") - table.add_column("Avg Time", justify="right") - table.add_column("Error Rate", justify="right") - - # Sort by execution count - sorted_languages = sorted( - language_stats.values(), - key=lambda x: x.execution_count, - reverse=True - ) - for lang in sorted_languages: - table.add_row( - lang.language.upper(), - str(lang.execution_count), - str(lang.success_count), - str(lang.failure_count), - 
format_duration(lang.avg_execution_time_ms), - format_error_rate(lang.error_rate) - ) + sorted_langs = sorted(by_language.items(), key=lambda x: x[1], reverse=True) - if sorted_languages: - total_exec = sum(l.execution_count for l in sorted_languages) - total_success = sum(l.success_count for l in sorted_languages) - total_fail = sum(l.failure_count for l in sorted_languages) - overall_rate = (total_fail / total_exec * 100) if total_exec > 0 else 0 - - table.add_row("", "", "", "", "", "", style="dim") - table.add_row( - "[bold]TOTAL[/bold]", - f"[bold]{total_exec}[/bold]", - f"[bold]{total_success}[/bold]", - f"[bold]{total_fail}[/bold]", - "", - format_error_rate(overall_rate) - ) + for lang, count in sorted_langs: + table.add_row(lang.upper(), str(count)) - return table - - -async def build_api_keys_table(service: DetailedMetricsService, hours: int = 24) -> Table: - """Build API keys usage table.""" - # Get all API key stats by scanning Redis - redis = service.redis - - # Find all API key metric keys - key_stats = {} - cursor = 0 - while True: - cursor, keys = await redis.scan(cursor, match="metrics:api_key:*", count=100) - if keys: - for key in keys: - key_str = key.decode() if isinstance(key, bytes) else key - # Extract API key hash prefix from key - # Format: metrics:api_key:{hash}:hour:{hour_key} - parts = key_str.split(":") - if len(parts) >= 3: - api_hash = parts[2] - if api_hash not in key_stats: - key_stats[api_hash] = await service.get_api_key_stats(api_hash, hours=hours) - if cursor == 0: - break - - table = Table(title=f"API Key Usage (last {hours}h)", box=box.ROUNDED) - table.add_column("Key Hash", style="cyan") - table.add_column("Executions", justify="right") - table.add_column("Success Rate", justify="right") - table.add_column("Avg Time", justify="right") - table.add_column("File Ops", justify="right", style="dim") - - if not key_stats: - table.add_row("[dim]No API key usage data found[/dim]", "", "", "", "") - else: - # Sort by execution count - 
for key_hash, stats in sorted(key_stats.items(), key=lambda x: x[1].execution_count, reverse=True): - avg_time = (stats.total_execution_time_ms / stats.execution_count) if stats.execution_count > 0 else 0 - - table.add_row( - f"{key_hash}...", - str(stats.execution_count), - format_rate(stats.success_rate), - format_duration(avg_time), - str(stats.file_operations) - ) + if not sorted_langs: + table.add_row("[dim]No data[/dim]", "") return table -async def build_pool_panel(service: DetailedMetricsService) -> Panel: - """Build pool stats panel.""" - pool_stats = await service.get_pool_stats() +async def build_pool_panel() -> Panel: + """Build pool stats panel from in-memory data.""" + pool_stats = metrics_service.get_pool_stats() table = Table(show_header=False, box=None, padding=(0, 2)) table.add_column("Metric", style="cyan") table.add_column("Value", style="white") - table.add_row("Total Acquisitions", str(pool_stats.total_acquisitions)) - table.add_row("Pool Hits", Text(str(pool_stats.pool_hits), style="green")) - table.add_row("Pool Misses", Text(str(pool_stats.pool_misses), style="yellow")) - table.add_row("Hit Rate", format_rate(pool_stats.hit_rate)) - table.add_row("Avg Acquire Time", format_duration(pool_stats.avg_acquire_time_ms)) - table.add_row("Exhaustion Events", Text(str(pool_stats.exhaustion_events), - style="red" if pool_stats.exhaustion_events > 0 else "green")) - - return Panel(table, title="[bold]Container Pool[/bold]", border_style="magenta") - + table.add_row("Total Acquisitions", str(pool_stats["total_acquisitions"])) + table.add_row("Pool Hits", Text(str(pool_stats["pool_hits"]), style="green")) + table.add_row("Pool Misses", Text(str(pool_stats["pool_misses"]), style="yellow")) + table.add_row("Hit Rate", format_rate(pool_stats["hit_rate"])) + table.add_row("Avg Acquire Time", format_duration(pool_stats["avg_acquire_time_ms"])) + table.add_row("Exhaustion Events", Text( + str(pool_stats["exhaustion_events"]), + style="red" if 
pool_stats["exhaustion_events"] > 0 else "green" + )) -async def build_hourly_table(service: DetailedMetricsService, hours: int = 12) -> Table: - """Build hourly breakdown table.""" - table = Table(title=f"Hourly Breakdown (last {hours}h)", box=box.ROUNDED) - table.add_column("Hour", style="dim") - table.add_column("Executions", justify="right") - table.add_column("Success", justify="right", style="green") - table.add_column("Failures", justify="right", style="red") - table.add_column("Timeouts", justify="right", style="yellow") - table.add_column("Avg Time", justify="right") - - now = datetime.now(timezone.utc) - - for i in range(hours): - hour = now - timedelta(hours=i) - metrics = await service.get_hourly_metrics(hour) - - if metrics: - table.add_row( - hour.strftime('%m-%d %H:00'), - str(metrics.execution_count), - str(metrics.success_count), - str(metrics.failure_count), - str(metrics.timeout_count), - format_duration(metrics.avg_execution_time_ms) - ) - else: - table.add_row( - hour.strftime('%m-%d %H:00'), - "[dim]0[/dim]", - "[dim]0[/dim]", - "[dim]0[/dim]", - "[dim]0[/dim]", - "[dim]-[/dim]" - ) - - return table + return Panel(table, title="[bold]Sandbox Pool[/bold]", border_style="magenta") async def cmd_summary(args): """Show summary.""" - service = await get_metrics_service() - panel = await build_summary_panel(service) + await ensure_started() + panel = await build_summary_panel(getattr(args, "hours", 24)) console.print() console.print(panel) console.print() @@ -275,17 +142,8 @@ async def cmd_summary(args): async def cmd_languages(args): """Show per-language metrics.""" - service = await get_metrics_service() - table = await build_languages_table(service, args.hours) - console.print() - console.print(table) - console.print() - - -async def cmd_api_keys(args): - """Show per-API-key metrics.""" - service = await get_metrics_service() - table = await build_api_keys_table(service, args.hours) + await ensure_started() + table = await 
build_languages_table(args.hours) console.print() console.print(table) console.print() @@ -293,25 +151,16 @@ async def cmd_api_keys(args): async def cmd_pool(args): """Show pool stats.""" - service = await get_metrics_service() - panel = await build_pool_panel(service) + await ensure_started() + panel = await build_pool_panel() console.print() console.print(panel) console.print() -async def cmd_hourly(args): - """Show hourly breakdown.""" - service = await get_metrics_service() - table = await build_hourly_table(service, args.hours) - console.print() - console.print(table) - console.print() - - async def cmd_watch(args): """Auto-refresh dashboard.""" - service = await get_metrics_service() + await ensure_started() console.print("\n[bold cyan]Live Metrics Dashboard[/bold cyan]") console.print("[dim]Press Ctrl+C to exit[/dim]\n") @@ -326,16 +175,14 @@ async def cmd_watch(args): )) console.print() - # Summary and Pool side by side - summary_panel = await build_summary_panel(service) - pool_panel = await build_pool_panel(service) + summary_panel = await build_summary_panel() + pool_panel = await build_pool_panel() console.print(summary_panel) console.print() console.print(pool_panel) console.print() - # Language breakdown - lang_table = await build_languages_table(service, 24) + lang_table = await build_languages_table(24) console.print(lang_table) console.print() @@ -348,7 +195,7 @@ async def cmd_watch(args): async def cmd_interactive(args): """Interactive menu.""" - service = await get_metrics_service() + await ensure_started() while True: console.clear() @@ -359,49 +206,42 @@ async def cmd_interactive(args): )) console.print() - # Show quick summary - summary = await service.get_summary() - console.print(f" [cyan]Executions today:[/cyan] {summary.total_executions_today} " - f"[cyan]Success rate:[/cyan] {summary.success_rate:.1f}% " - f"[cyan]Avg time:[/cyan] {format_duration(summary.avg_execution_time_ms)}") + now = datetime.now(timezone.utc) + start = now - 
timedelta(hours=24) + summary = await metrics_service.get_summary_stats(start=start, end=now) + console.print( + f" [cyan]Executions (24h):[/cyan] {summary.get('total_executions', 0)} " + f"[cyan]Success rate:[/cyan] {summary.get('success_rate', 0):.1f}% " + f"[cyan]Avg time:[/cyan] {format_duration(summary.get('avg_execution_time_ms', 0))}" + ) console.print() console.print("[bold]Commands:[/bold]") console.print(" [cyan]1[/cyan] Summary") console.print(" [cyan]2[/cyan] Language breakdown") - console.print(" [cyan]3[/cyan] API key usage") - console.print(" [cyan]4[/cyan] Container pool stats") - console.print(" [cyan]5[/cyan] Hourly breakdown") - console.print(" [cyan]6[/cyan] Live dashboard (auto-refresh)") + console.print(" [cyan]3[/cyan] Sandbox pool stats") + console.print(" [cyan]4[/cyan] Live dashboard (auto-refresh)") console.print(" [cyan]q[/cyan] Quit") console.print() - choice = Prompt.ask("Select", choices=["1", "2", "3", "4", "5", "6", "q"], default="1") + choice = Prompt.ask("Select", choices=["1", "2", "3", "4", "q"], default="1") if choice == "q": console.print("[yellow]Goodbye![/yellow]") break elif choice == "1": - panel = await build_summary_panel(service) + panel = await build_summary_panel() console.print() console.print(panel) elif choice == "2": - table = await build_languages_table(service, 24) + table = await build_languages_table(24) console.print() console.print(table) elif choice == "3": - table = await build_api_keys_table(service, 24) - console.print() - console.print(table) - elif choice == "4": - panel = await build_pool_panel(service) + panel = await build_pool_panel() console.print() console.print(panel) - elif choice == "5": - table = await build_hourly_table(service, 12) - console.print() - console.print(table) - elif choice == "6": + elif choice == "4": args.interval = 5 await cmd_watch(args) continue @@ -424,16 +264,8 @@ def main(): lang_p = subparsers.add_parser("languages", help="Per-language metrics") 
lang_p.add_argument("--hours", type=int, default=24) - # api-keys - keys_p = subparsers.add_parser("api-keys", help="Per-API-key usage") - keys_p.add_argument("--hours", type=int, default=24) - # pool - subparsers.add_parser("pool", help="Container pool stats") - - # hourly - hourly_p = subparsers.add_parser("hourly", help="Hourly breakdown") - hourly_p.add_argument("--hours", type=int, default=12) + subparsers.add_parser("pool", help="Sandbox pool stats") # watch watch_p = subparsers.add_parser("watch", help="Auto-refresh dashboard") @@ -444,9 +276,7 @@ def main(): handlers = { "summary": cmd_summary, "languages": cmd_languages, - "api-keys": cmd_api_keys, "pool": cmd_pool, - "hourly": cmd_hourly, "watch": cmd_watch, None: cmd_interactive, } diff --git a/scripts/perf_test.py b/scripts/perf_test.py index b3db056..ab9abe2 100755 --- a/scripts/perf_test.py +++ b/scripts/perf_test.py @@ -534,9 +534,9 @@ async def main(): print("\n\n>>> New version testing complete.") print(">>> Results saved to /tmp/new_results.json") print("\n>>> To test the old version:") - print(">>> 1. docker compose down") - print(">>> 2. Edit docker-compose.yml to use 'codeinterperter-api:pro'") - print(">>> 3. docker compose up -d") + print(">>> 1. Stop the running API server") + print(">>> 2. Switch to the old version branch") + print(">>> 3. Start the API server again") print(">>> 4. 
Run this script again with --old flag") diff --git a/scripts/run_functional_tests.py b/scripts/run_functional_tests.py index 7d9e909..34e3540 100644 --- a/scripts/run_functional_tests.py +++ b/scripts/run_functional_tests.py @@ -178,8 +178,6 @@ async def test_state_persistence(self, client: httpx.AsyncClient): )) return - has_state = r1.json().get("has_state", False) - # Step 2: Use variable r2 = await client.post( "/exec", @@ -196,7 +194,7 @@ async def test_state_persistence(self, client: httpx.AsyncClient): stdout = r2.json().get("stdout", "") if "43" in stdout: passed = True - msg = f"OK - state persisted (has_state={has_state})" + msg = "OK - state persisted" else: passed = False msg = f"Expected '43' in stdout, got: {stdout[:100]}, stderr: {r2.json().get('stderr', '')[:100]}" diff --git a/scripts/setup-wan-network.sh b/scripts/setup-wan-network.sh deleted file mode 100755 index 01588ca..0000000 --- a/scripts/setup-wan-network.sh +++ /dev/null @@ -1,138 +0,0 @@ -#!/bin/bash -# Setup script for WAN-only network with iptables rules -# This script should be run with root/sudo privileges -# -# This script creates a Docker network that allows containers to access -# the public internet while blocking access to: -# - Private IP ranges (10.x, 172.16-31.x, 192.168.x) -# - Link-local addresses (169.254.x.x) - includes cloud metadata services -# - Docker host gateway -# - Other containers on the same network (ICC disabled) - -set -e - -# Configuration (can be overridden by environment variables) -NETWORK_NAME="${WAN_NETWORK_NAME:-code-interpreter-wan}" -SUBNET="172.30.0.0/16" -GATEWAY="172.30.0.1" -CHAIN_NAME="CODE_INTERP_WAN" -DNS_SERVERS="${WAN_DNS_SERVERS:-8.8.8.8,1.1.1.1,8.8.4.4}" - -# Colors for output -RED='\033[0;31m' -GREEN='\033[0;32m' -YELLOW='\033[1;33m' -NC='\033[0m' # No Color - -echo -e "${GREEN}Setting up WAN-only network: $NETWORK_NAME${NC}" -echo "" - -# Check for root/sudo -if [ "$EUID" -ne 0 ]; then - echo -e "${RED}Error: This script must be run with 
root/sudo privileges${NC}" - echo "Please run: sudo $0" - exit 1 -fi - -# Check if Docker is available -if ! command -v docker &> /dev/null; then - echo -e "${RED}Error: Docker is not installed or not in PATH${NC}" - exit 1 -fi - -# Create Docker network if it doesn't exist -if ! docker network inspect "$NETWORK_NAME" >/dev/null 2>&1; then - echo "Creating Docker network..." - docker network create \ - --driver bridge \ - --subnet="$SUBNET" \ - --gateway="$GATEWAY" \ - --opt "com.docker.network.bridge.enable_ip_masquerade=true" \ - --opt "com.docker.network.bridge.enable_icc=false" \ - --label "com.code-interpreter.managed=true" \ - --label "com.code-interpreter.type=wan-access" \ - "$NETWORK_NAME" - echo -e "${GREEN}Network created successfully${NC}" -else - echo -e "${YELLOW}Network $NETWORK_NAME already exists${NC}" -fi - -# Get bridge interface name -NETWORK_ID=$(docker network inspect "$NETWORK_NAME" --format '{{.Id}}' | cut -c1-12) -BRIDGE_NAME="br-$NETWORK_ID" - -echo "" -echo "Network details:" -echo " - Network ID: $NETWORK_ID" -echo " - Bridge interface: $BRIDGE_NAME" -echo " - Subnet: $SUBNET" -echo " - Gateway: $GATEWAY" -echo "" - -# Wait for bridge interface to be available -echo "Waiting for bridge interface..." -for i in {1..10}; do - if ip link show "$BRIDGE_NAME" >/dev/null 2>&1; then - echo -e "${GREEN}Bridge interface $BRIDGE_NAME is ready${NC}" - break - fi - if [ $i -eq 10 ]; then - echo -e "${YELLOW}Warning: Bridge interface not found. iptables rules may fail.${NC}" - echo "This can happen if no containers are connected to the network yet." - fi - sleep 1 -done - -echo "" -echo "Setting up iptables rules..." 
- -# Create chain if it doesn't exist -iptables -N "$CHAIN_NAME" 2>/dev/null || iptables -F "$CHAIN_NAME" - -# Allow established connections (critical for return traffic) -iptables -A "$CHAIN_NAME" -m state --state ESTABLISHED,RELATED -j ACCEPT - -# Allow DNS to public servers -IFS=',' read -ra DNS_ARRAY <<< "$DNS_SERVERS" -for DNS in "${DNS_ARRAY[@]}"; do - echo " Allowing DNS to $DNS" - iptables -A "$CHAIN_NAME" -p udp -d "$DNS" --dport 53 -j ACCEPT - iptables -A "$CHAIN_NAME" -p tcp -d "$DNS" --dport 53 -j ACCEPT -done - -# Block private IP ranges -echo " Blocking private IP ranges..." -iptables -A "$CHAIN_NAME" -d 10.0.0.0/8 -j DROP -iptables -A "$CHAIN_NAME" -d 172.16.0.0/12 -j DROP -iptables -A "$CHAIN_NAME" -d 192.168.0.0/16 -j DROP -iptables -A "$CHAIN_NAME" -d 169.254.0.0/16 -j DROP -iptables -A "$CHAIN_NAME" -d 127.0.0.0/8 -j DROP -iptables -A "$CHAIN_NAME" -d 224.0.0.0/4 -j DROP -iptables -A "$CHAIN_NAME" -d 240.0.0.0/4 -j DROP - -# Block gateway (Docker host) -echo " Blocking Docker host gateway ($GATEWAY)..." -iptables -A "$CHAIN_NAME" -d "$GATEWAY" -j DROP - -# Allow all other traffic (public internet) -iptables -A "$CHAIN_NAME" -j ACCEPT - -# Insert into FORWARD chain (remove existing rule first to avoid duplicates) -iptables -D FORWARD -i "$BRIDGE_NAME" -j "$CHAIN_NAME" 2>/dev/null || true -iptables -I FORWARD 1 -i "$BRIDGE_NAME" -j "$CHAIN_NAME" - -echo "" -echo -e "${GREEN}WAN network setup complete!${NC}" -echo "" -echo "Containers on '$NETWORK_NAME' can now access:" -echo " - Public internet (all ports)" -echo " - Public DNS servers ($DNS_SERVERS)" -echo "" -echo "Blocked:" -echo " - Private IP ranges (10.x, 172.16-31.x, 192.168.x)" -echo " - Link-local addresses (169.254.x.x)" -echo " - Docker host gateway ($GATEWAY)" -echo " - Inter-container communication" -echo "" -echo -e "${YELLOW}Note: These iptables rules are not persistent across reboots.${NC}" -echo "Run this script again after a system restart, or use iptables-persistent." 
diff --git a/src/api/__init__.py b/src/api/__init__.py index 5c301df..1dbd563 100644 --- a/src/api/__init__.py +++ b/src/api/__init__.py @@ -1,5 +1,5 @@ """API endpoints for the Code Interpreter API.""" -from . import files, exec, health, state, admin, dashboard_metrics +from . import files, exec, health, admin, dashboard_metrics -__all__ = ["files", "exec", "health", "state", "admin", "dashboard_metrics"] +__all__ = ["files", "exec", "health", "admin", "dashboard_metrics"] diff --git a/src/api/admin.py b/src/api/admin.py index 77653a9..60fb180 100644 --- a/src/api/admin.py +++ b/src/api/admin.py @@ -1,13 +1,14 @@ """Admin API endpoints for dashboard.""" from typing import List, Optional, Dict, Any -from datetime import datetime, timezone -from fastapi import APIRouter, HTTPException, Depends, Header, Query +from datetime import datetime, timedelta, timezone +from fastapi import APIRouter, HTTPException, Depends, Query from pydantic import BaseModel, Field from ..config import settings +from ..dependencies.auth import verify_master_key from ..services.api_key_manager import get_api_key_manager -from ..services.detailed_metrics import get_detailed_metrics_service +from ..services.metrics import metrics_service as unified_metrics from ..services.health import health_service from ..models.api_key import RateLimits as RateLimitsModel @@ -50,22 +51,6 @@ class ApiKeyResponse(BaseModel): source: str = "managed" # "managed" or "environment" -# --- Dependencies --- - - -async def verify_master_key(x_api_key: str = Header(...)): - """Verify the Master API Key for admin operations.""" - if not settings.master_api_key: - raise HTTPException( - status_code=500, - detail="Admin operations are disabled (no MASTER_API_KEY configured)", - ) - - if x_api_key != settings.master_api_key: - raise HTTPException(status_code=403, detail="Invalid Master API Key") - return x_api_key - - # --- Endpoints --- @@ -189,27 +174,26 @@ async def get_admin_stats( hours: int = Query(24, ge=1, 
le=168), _: str = Depends(verify_master_key) ): """Get aggregated statistics for the admin dashboard.""" - metrics_service = get_detailed_metrics_service() + now = datetime.now(timezone.utc) + start = now - timedelta(hours=hours) - # Get high-level summary - summary = await metrics_service.get_summary() + # Get high-level summary from unified metrics service + summary = await unified_metrics.get_summary_stats(start=start, end=now) # Get language breakdown - language_stats = await metrics_service.get_language_stats(hours=hours) + lang_data = await unified_metrics.get_language_usage(start=start, end=now) - # Get pool stats - pool_stats = await metrics_service.get_pool_stats() + # Get pool stats (in-memory) + pool_stats = unified_metrics.get_pool_stats() # Get health status health_results = await health_service.check_all_services(use_cache=True) overall_health = health_service.get_overall_status(health_results) return { - "summary": summary.to_dict(), - "by_language": { - lang: stats.to_dict() for lang, stats in language_stats.items() - }, - "pool_stats": pool_stats.to_dict(), + "summary": summary, + "by_language": lang_data.get("by_language", {}), + "pool_stats": pool_stats, "health": { "status": overall_health.value, "services": { @@ -217,5 +201,5 @@ async def get_admin_stats( }, }, "period_hours": hours, - "timestamp": datetime.now(timezone.utc).isoformat(), + "timestamp": now.isoformat(), } diff --git a/src/api/dashboard_metrics.py b/src/api/dashboard_metrics.py index 8529a27..bd57039 100644 --- a/src/api/dashboard_metrics.py +++ b/src/api/dashboard_metrics.py @@ -3,32 +3,16 @@ from datetime import datetime, timedelta, timezone from typing import Dict, List, Literal, Optional -from fastapi import APIRouter, Depends, Header, HTTPException, Query +from fastapi import APIRouter, Depends, HTTPException, Query from pydantic import BaseModel -from ..config import settings -from ..services.sqlite_metrics import sqlite_metrics_service +from ..dependencies.auth import 
verify_master_key +from ..services.metrics import metrics_service from ..services.api_key_manager import get_api_key_manager router = APIRouter(prefix="/admin/metrics", tags=["admin-metrics"]) -# --- Dependencies --- - - -async def verify_master_key(x_api_key: str = Header(...)): - """Verify the Master API Key for admin operations.""" - if not settings.master_api_key: - raise HTTPException( - status_code=500, - detail="Admin operations are disabled (no MASTER_API_KEY configured)", - ) - - if x_api_key != settings.master_api_key: - raise HTTPException(status_code=403, detail="Invalid Master API Key") - return x_api_key - - def get_date_range( period: str, start_date: Optional[datetime] = None, @@ -126,7 +110,7 @@ async def get_metrics_summary( """Get summary statistics for the selected period.""" start, end = get_date_range(period, start_date, end_date) - stats = await sqlite_metrics_service.get_summary_stats( + stats = await metrics_service.get_summary_stats( start=start, end=end, api_key_hash=api_key_hash ) @@ -157,7 +141,7 @@ async def get_language_metrics( """Get language usage data for stacked bar chart.""" start, end = get_date_range(period, start_date, end_date) - data = await sqlite_metrics_service.get_language_usage( + data = await metrics_service.get_language_usage( start=start, end=end, api_key_hash=api_key_hash, @@ -183,7 +167,7 @@ async def get_time_series( start, end = get_date_range(period, start_date, end_date) granularity = get_granularity(period) - data = await sqlite_metrics_service.get_time_series( + data = await metrics_service.get_time_series( start=start, end=end, api_key_hash=api_key_hash, @@ -215,7 +199,7 @@ async def get_activity_heatmap( effective_period = period if period in ("week", "month") else "week" start, end = get_date_range(effective_period, start_date, end_date) - data = await sqlite_metrics_service.get_heatmap_data( + data = await metrics_service.get_heatmap_data( start=start, end=end, api_key_hash=api_key_hash ) @@ -238,7 
+222,7 @@ async def get_api_keys_for_filter(_: str = Depends(verify_master_key)): key_lookup = {k.key_hash: k for k in managed_keys} # Get keys from SQLite with usage counts - sqlite_keys = await sqlite_metrics_service.get_api_keys_list() + sqlite_keys = await metrics_service.get_api_keys_list() result = [] seen_hashes = set() @@ -285,7 +269,7 @@ async def get_top_languages( """Get top languages by execution count.""" start, end = get_date_range(period, start_date, end_date) - languages = await sqlite_metrics_service.get_top_languages( + languages = await metrics_service.get_top_languages( start=start, end=end, limit=limit ) diff --git a/src/api/files.py b/src/api/files.py index 88c73c7..9800882 100644 --- a/src/api/files.py +++ b/src/api/files.py @@ -9,11 +9,10 @@ # Third-party imports import structlog from fastapi import APIRouter, HTTPException, UploadFile, File, Form, Query -from fastapi.responses import Response, StreamingResponse +from fastapi.responses import StreamingResponse from unidecode import unidecode # Local application imports -from ..config import settings from ..dependencies import FileServiceDep, SessionServiceDep from ..models import SessionCreate from ..services.execution.output import OutputProcessor @@ -88,29 +87,16 @@ async def upload_file( }, ) - # Check file size limits - for file in upload_files: - if file.size and file.size > settings.max_file_size_mb * 1024 * 1024: - raise HTTPException( - status_code=413, - detail=f"File {file.filename} exceeds maximum size of {settings.max_file_size_mb}MB", - ) - - # Check number of files limit - if len(upload_files) > settings.max_files_per_session: + # Validate uploads via service layer + validation_error = file_service.validate_uploads( + filenames=[f.filename or "" for f in upload_files], + file_sizes=[f.size for f in upload_files], + ) + if validation_error: raise HTTPException( - status_code=413, - detail=f"Too many files. 
Maximum {settings.max_files_per_session} files allowed", + status_code=validation_error[0], detail=validation_error[1] ) - # Check file type restrictions - for file in upload_files: - if not settings.is_file_allowed(file.filename or ""): - raise HTTPException( - status_code=415, - detail=f"File type not allowed: {file.filename}", - ) - uploaded_files = [] # Create a real session for file uploads @@ -314,13 +300,7 @@ async def generate_chunks(): media_type=content_type, headers={ "Content-Disposition": content_disposition, - # DO NOT include Content-Length - this forces chunked transfer encoding "Cache-Control": "private, max-age=3600", - # Add CORS headers for browser compatibility - "Access-Control-Allow-Origin": "*", - "Access-Control-Allow-Methods": "GET, OPTIONS", - "Access-Control-Allow-Headers": "x-api-key, Content-Type", - "Access-Control-Expose-Headers": "Content-Disposition", }, ) @@ -334,48 +314,3 @@ async def generate_chunks(): error=str(e), ) raise HTTPException(status_code=404, detail="File not found") - - -@router.options("/download/{session_id}/{file_id}") -async def download_file_options(session_id: str, file_id: str): - """Handle OPTIONS preflight request for download endpoint.""" - return Response( - status_code=204, # No Content - headers={ - "Access-Control-Allow-Origin": "*", - "Access-Control-Allow-Methods": "GET, OPTIONS", - "Access-Control-Allow-Headers": "x-api-key, Content-Type", - "Access-Control-Max-Age": "3600", - }, - ) - - -@router.delete("/files/{session_id}/{file_id}") -async def delete_file( - session_id: str, file_id: str, file_service: FileServiceDep = None -): - """Delete a file from the session - LibreChat compatible.""" - try: - # Get file info before deletion - file_info = await file_service.get_file_info(session_id, file_id) - if not file_info: - raise HTTPException(status_code=404, detail="File not found") - - success = await file_service.delete_file(session_id, file_id) - - if success: - # Return 200 with empty response 
for LibreChat compatibility - return Response(status_code=200) - else: - raise HTTPException(status_code=500, detail="Failed to delete file") - - except HTTPException: - raise - except Exception as e: - logger.error( - "Failed to delete file", - session_id=session_id, - file_id=file_id, - error=str(e), - ) - raise HTTPException(status_code=500, detail="Failed to delete file") diff --git a/src/api/health.py b/src/api/health.py index 0472e11..b71f06e 100644 --- a/src/api/health.py +++ b/src/api/health.py @@ -1,11 +1,13 @@ """Health check and monitoring endpoints.""" +from datetime import datetime, timedelta, timezone + from fastapi import APIRouter, HTTPException, Depends, Query from fastapi.responses import JSONResponse import structlog from ..services.health import health_service, HealthStatus -from ..services.metrics import metrics_collector +from ..services.metrics import metrics_service from ..dependencies.auth import verify_api_key from ..config import settings @@ -138,11 +140,11 @@ async def minio_health_check(_: str = Depends(verify_api_key)): ) -@router.get("/health/docker", summary="Docker health check") -async def docker_health_check(_: str = Depends(verify_api_key)): - """Check Docker daemon connectivity and performance.""" +@router.get("/health/nsjail", summary="nsjail health check") +async def nsjail_health_check(_: str = Depends(verify_api_key)): + """Check nsjail sandbox availability and configuration.""" try: - result = await health_service.check_docker() + result = await health_service.check_nsjail() if result.status == HealthStatus.UNHEALTHY: return JSONResponse(status_code=503, content=result.to_dict()) @@ -150,13 +152,13 @@ async def docker_health_check(_: str = Depends(verify_api_key)): return JSONResponse(status_code=200, content=result.to_dict()) except Exception as e: - logger.error("Docker health check failed", error=str(e)) + logger.error("nsjail health check failed", error=str(e)) return JSONResponse( status_code=503, content={ - 
"service": "docker", + "service": "nsjail", "status": "unhealthy", - "error": str(e) if settings.api_debug else "Docker check failed", + "error": str(e) if settings.api_debug else "nsjail check failed", }, ) @@ -166,9 +168,9 @@ async def get_metrics(_: str = Depends(verify_api_key)): """Get system metrics and statistics.""" try: return { - "execution_statistics": metrics_collector.get_execution_statistics(), - "api_statistics": metrics_collector.get_api_statistics(), - "system_metrics": metrics_collector.get_system_metrics(), + "execution_statistics": metrics_service.get_execution_statistics(), + "api_statistics": metrics_service.get_api_statistics(), + "system_metrics": metrics_service.get_system_metrics(), } except Exception as e: @@ -180,7 +182,7 @@ async def get_metrics(_: str = Depends(verify_api_key)): async def get_execution_metrics(_: str = Depends(verify_api_key)): """Get code execution metrics and statistics.""" try: - return metrics_collector.get_execution_statistics() + return metrics_service.get_execution_statistics() except Exception as e: logger.error("Failed to get execution metrics", error=str(e)) @@ -193,7 +195,7 @@ async def get_execution_metrics(_: str = Depends(verify_api_key)): async def get_api_metrics(_: str = Depends(verify_api_key)): """Get API request metrics and statistics.""" try: - return metrics_collector.get_api_statistics() + return metrics_service.get_api_statistics() except Exception as e: logger.error("Failed to get API metrics", error=str(e)) @@ -209,7 +211,7 @@ async def get_service_status(_: str = Depends(verify_api_key)): overall_status = health_service.get_overall_status(service_results) # Get basic metrics - system_metrics = metrics_collector.get_system_metrics() + system_metrics = metrics_service.get_system_metrics() return { "overall_status": overall_status.value, @@ -256,25 +258,19 @@ async def get_detailed_metrics( Summary metrics, language breakdown, and pool statistics """ try: - from ..services.detailed_metrics 
import get_detailed_metrics_service - - service = get_detailed_metrics_service() - - # Get summary - summary = await service.get_summary() + now = datetime.now(timezone.utc) + start = now - timedelta(hours=hours) - # Get language stats - language_stats = await service.get_language_stats(hours=hours) - - # Get pool stats - pool_stats = await service.get_pool_stats() + summary = await metrics_service.get_summary_stats(start=start, end=now) + top_langs = await metrics_service.get_top_languages( + start=start, end=now, limit=10 + ) + pool_stats = metrics_service.get_pool_stats() return { - "summary": summary.to_dict(), - "by_language": { - lang: stats.to_dict() for lang, stats in language_stats.items() - }, - "pool_stats": pool_stats.to_dict(), + "summary": summary, + "by_language": {lang["language"]: lang for lang in top_langs}, + "pool_stats": pool_stats, "period_hours": hours, } @@ -296,20 +292,23 @@ async def get_language_metrics( Execution counts, average times, and error rates per language """ try: - from ..services.detailed_metrics import get_detailed_metrics_service + now = datetime.now(timezone.utc) + start = now - timedelta(hours=hours) - service = get_detailed_metrics_service() - language_stats = await service.get_language_stats(hours=hours) + lang_data = await metrics_service.get_language_usage(start=start, end=now) + by_language = lang_data.get("by_language", {}) - # Sort by execution count - sorted_stats = sorted( - language_stats.values(), key=lambda x: x.execution_count, reverse=True - ) + languages = [ + {"language": lang, "execution_count": count} + for lang, count in sorted( + by_language.items(), key=lambda x: x[1], reverse=True + ) + ] return { - "languages": [stats.to_dict() for stats in sorted_stats], + "languages": languages, "period_hours": hours, - "total_languages": len(sorted_stats), + "total_languages": len(languages), } except Exception as e: @@ -334,14 +333,16 @@ async def get_api_key_metrics( Execution counts, success rates, and resource 
usage for the key """ try: - from ..services.detailed_metrics import get_detailed_metrics_service + now = datetime.now(timezone.utc) + start = now - timedelta(hours=hours) - service = get_detailed_metrics_service() - stats = await service.get_api_key_stats(key_hash, hours=hours) + stats = await metrics_service.get_summary_stats( + start=start, end=now, api_key_hash=key_hash + ) return { "api_key_hash": key_hash, - "stats": stats.to_dict(), + "stats": stats, "period_hours": hours, } @@ -352,20 +353,15 @@ async def get_api_key_metrics( ) -@router.get("/metrics/pool", summary="Container pool metrics") +@router.get("/metrics/pool", summary="Sandbox pool metrics") async def get_pool_metrics(_: str = Depends(verify_api_key)): - """Get container pool statistics. + """Get sandbox pool statistics. Returns: Pool hit rates, acquisition times, and exhaustion events """ try: - from ..services.detailed_metrics import get_detailed_metrics_service - - service = get_detailed_metrics_service() - pool_stats = await service.get_pool_stats() - - return pool_stats.to_dict() + return metrics_service.get_pool_stats() except Exception as e: logger.error("Failed to get pool metrics", error=str(e)) diff --git a/src/api/state.py b/src/api/state.py deleted file mode 100644 index b8dd375..0000000 --- a/src/api/state.py +++ /dev/null @@ -1,228 +0,0 @@ -"""State management API endpoints. - -These endpoints allow clients (like LibreChat) to download and upload -Python session state for client-side caching and restoration. - -Wire format: Raw lz4-compressed binary (not base64). 
-""" - -from typing import Optional - -import structlog -from fastapi import APIRouter, HTTPException, Header, Request, Response - -from ..config import settings -from ..dependencies.services import StateServiceDep, StateArchivalServiceDep -from ..models.state import StateInfo, StateUploadResponse - -logger = structlog.get_logger(__name__) -router = APIRouter() - -# Maximum state size (50 MB) -MAX_STATE_SIZE = 50 * 1024 * 1024 # 52428800 bytes - - -@router.get("/state/{session_id}") -async def download_state( - session_id: str, - state_service: StateServiceDep, - state_archival_service: StateArchivalServiceDep, - if_none_match: Optional[str] = Header(None, alias="If-None-Match"), -): - """Download session state as raw lz4 binary. - - Supports ETag-based caching via If-None-Match header. - - Args: - session_id: Session identifier - if_none_match: ETag for conditional request (returns 304 if unchanged) - - Returns: - - 200: Raw lz4 binary state with ETag header - - 304: Not Modified (if ETag matches) - - 404: No state exists for session - """ - # Get hash for ETag check - state_hash = await state_service.get_state_hash(session_id) - - # Try MinIO if not in Redis - if not state_hash and settings.state_archive_enabled: - restored = await state_archival_service.restore_state(session_id) - if restored: - state_hash = await state_service.get_state_hash(session_id) - - if not state_hash: - raise HTTPException( - status_code=404, - detail={"error": "state_not_found", "message": "No state for session"}, - ) - - etag = f'"{state_hash}"' - - # Check If-None-Match for 304 response - if if_none_match: - # Handle both quoted and unquoted ETags - client_etag = if_none_match.strip('"') - if client_etag == state_hash: - return Response(status_code=304, headers={"ETag": etag}) - - # Get raw binary state - raw_bytes = await state_service.get_state_raw(session_id) - if not raw_bytes: - raise HTTPException( - status_code=404, - detail={"error": "state_not_found", "message": "No state 
for session"}, - ) - - logger.info( - "State downloaded", - session_id=session_id[:12], - size=len(raw_bytes), - hash=state_hash[:12], - ) - - return Response( - content=raw_bytes, - media_type="application/octet-stream", - headers={"ETag": etag, "Content-Length": str(len(raw_bytes))}, - ) - - -@router.post("/state/{session_id}", status_code=201, response_model=StateUploadResponse) -async def upload_state( - session_id: str, - request: Request, - state_service: StateServiceDep, -): - """Upload session state as raw lz4 binary. - - Validates state format and stores in Redis with standard TTL. - Sets upload marker for priority loading in next execution. - - Args: - session_id: Session identifier - request: Raw binary body (lz4-compressed cloudpickle) - - Returns: - - 201: State uploaded successfully - - 400: Invalid state format - - 413: State exceeds 50MB limit - """ - raw_bytes = await request.body() - - # Size check - if len(raw_bytes) > MAX_STATE_SIZE: - raise HTTPException( - status_code=413, - detail={ - "error": "state_too_large", - "message": "State exceeds 50MB limit", - "max_bytes": MAX_STATE_SIZE, - }, - ) - - # Validate format: minimum size - if len(raw_bytes) < 2: - raise HTTPException( - status_code=400, - detail={"error": "invalid_state", "message": "State too short"}, - ) - - # Validate version byte (first byte should be protocol version 1 or 2) - version = raw_bytes[0] - if version not in (1, 2): - raise HTTPException( - status_code=400, - detail={ - "error": "invalid_state", - "message": f"Unknown state version: {version}", - }, - ) - - # Save state with upload marker - success = await state_service.save_state_raw( - session_id, raw_bytes, from_upload=True - ) - - if not success: - raise HTTPException( - status_code=500, - detail={"error": "save_failed", "message": "Failed to save state"}, - ) - - logger.info( - "State uploaded", - session_id=session_id[:12], - size=len(raw_bytes), - version=version, - ) - - return 
StateUploadResponse(message="state_uploaded", size=len(raw_bytes)) - - -@router.get("/state/{session_id}/info", response_model=StateInfo) -async def get_state_info( - session_id: str, - state_service: StateServiceDep, - state_archival_service: StateArchivalServiceDep, -): - """Get metadata about stored state without downloading it. - - Checks both Redis (hot storage) and MinIO (cold archive). - - Args: - session_id: Session identifier - - Returns: - StateInfo with exists flag and metadata if available - """ - # Check Redis first - info = await state_service.get_full_state_info(session_id) - - if info: - return StateInfo( - exists=True, - session_id=session_id, - size_bytes=info.get("size_bytes"), - hash=info.get("hash"), - created_at=info.get("created_at"), - expires_at=info.get("expires_at"), - source="redis", - ) - - # Check MinIO archive - if settings.state_archive_enabled: - has_archive = await state_archival_service.has_archived_state(session_id) - if has_archive: - return StateInfo(exists=True, session_id=session_id, source="archive") - - return StateInfo(exists=False, session_id=session_id) - - -@router.delete("/state/{session_id}", status_code=204) -async def delete_state( - session_id: str, - state_service: StateServiceDep, - state_archival_service: StateArchivalServiceDep, -): - """Delete session state. - - Removes state from both Redis and MinIO archive. - Always returns 204 (even if state didn't exist). 
- - Args: - session_id: Session identifier - - Returns: - 204 No Content - """ - # Delete from Redis (includes hash, meta, marker keys) - await state_service.delete_state(session_id) - - # Delete from MinIO archive - if settings.state_archive_enabled: - await state_archival_service.delete_archived_state(session_id) - - logger.info("State deleted", session_id=session_id[:12]) - - return Response(status_code=204) diff --git a/src/config/__init__.py b/src/config/__init__.py index 5e836e5..6f4a8fb 100644 --- a/src/config/__init__.py +++ b/src/config/__init__.py @@ -9,36 +9,37 @@ # Access grouped settings settings.api.host - settings.docker.timeout + settings.sandbox.nsjail_binary settings.redis.get_url() # Or use the backward-compatible flat access settings.api_host - settings.docker_timeout + settings.nsjail_binary settings.get_redis_url() """ +import secrets from pathlib import Path from typing import Any, Dict, List, Optional +import structlog from pydantic import Field, validator from pydantic_settings import BaseSettings, SettingsConfigDict # Import grouped configurations from .api import APIConfig -from .docker import DockerConfig from .redis import RedisConfig from .minio import MinIOConfig from .security import SecurityConfig from .resources import ResourcesConfig from .logging import LoggingConfig +from .sandbox import SandboxConfig from .languages import ( LANGUAGES, LanguageConfig, get_language, get_supported_languages, is_supported_language, - get_image_for_language, get_user_id_for_language, get_execution_command, uses_stdin, @@ -69,18 +70,20 @@ class Settings(BaseSettings): api_reload: bool = Field(default=False) # SSL/HTTPS Configuration - enable_https: bool = Field(default=False) + # HTTPS is auto-enabled when ssl_cert_file and ssl_key_file exist on disk. + # Override with ENABLE_HTTPS=false to force HTTP even if certs are present. 
+ enable_https: Optional[bool] = Field(default=None) https_port: int = Field(default=443, ge=1, le=65535) - ssl_cert_file: Optional[str] = Field(default=None) - ssl_key_file: Optional[str] = Field(default=None) - ssl_redirect: bool = Field(default=False) + ssl_cert_file: str = Field(default="/app/ssl/fullchain.pem") + ssl_key_file: str = Field(default="/app/ssl/privkey.pem") ssl_ca_certs: Optional[str] = Field(default=None) # Authentication Configuration - api_key: str = Field(default="test-api-key", min_length=16) + api_key: str = Field( + default_factory=lambda: secrets.token_urlsafe(24), + min_length=16, + ) api_keys: Optional[str] = Field(default=None) - api_key_header: str = Field(default="x-api-key") - api_key_cache_ttl: int = Field(default=300, ge=60) # API Key Management Configuration master_api_key: Optional[str] = Field( @@ -107,158 +110,85 @@ class Settings(BaseSettings): minio_secret_key: str = Field(default="test-secret-key", min_length=8) minio_secure: bool = Field(default=False) minio_bucket: str = Field(default="code-interpreter-files") - minio_region: str = Field(default="us-east-1") - # Docker Configuration - docker_base_url: Optional[str] = Field(default=None) - docker_image_registry: str = Field( - default="code-interpreter", - description="Registry/namespace prefix for execution environment images", + # Sandbox (nsjail) Configuration + nsjail_binary: str = Field( + default="nsjail", + description="Path to nsjail binary", ) - docker_image_tag: str = Field( - default="latest", - description="Tag for execution environment images (e.g. 
'latest', 'dev')", + sandbox_base_dir: str = Field( + default="/var/lib/code-interpreter/sandboxes", + description="Root directory for all sandbox instances", ) - docker_timeout: int = Field(default=60, ge=10) - docker_network_mode: str = Field(default="none") - docker_security_opt: List[str] = Field( - default_factory=lambda: ["no-new-privileges:true"] + sandbox_tmpfs_size_mb: int = Field( + default=100, + ge=10, + le=1024, + description="Size of tmpfs mount for /tmp inside sandboxes (MB)", ) - docker_cap_drop: List[str] = Field(default_factory=lambda: ["ALL"]) - docker_read_only: bool = Field(default=True) - docker_tmpfs: Dict[str, str] = Field( - default_factory=lambda: {"/tmp": "rw,noexec,nosuid,size=100m"} + sandbox_ttl_minutes: int = Field( + default=5, + ge=1, + le=1440, + description="TTL for sandbox directories before cleanup", ) - docker_seccomp_profile: Optional[str] = Field( - default="docker/seccomp-sandbox.json", - description="Path to seccomp profile JSON file (None to disable)", + sandbox_cleanup_interval_minutes: int = Field( + default=5, + ge=1, + le=60, + description="Interval between sandbox cleanup sweeps", ) # Resource Limits - Execution - max_execution_time: int = Field(default=30, ge=1, le=300) + max_execution_time: int = Field(default=120, ge=1, le=300) max_memory_mb: int = Field(default=512, ge=64, le=4096) - max_cpus: float = Field( - default=4.0, - ge=0.5, - le=16.0, - description="Maximum CPU cores available to execution containers", - ) - max_pids: int = Field( - default=512, - ge=64, - le=4096, - description="Per-container process limit (cgroup pids_limit). 
Prevents fork bombs.", - ) - max_open_files: int = Field(default=1024, ge=64, le=4096) # Resource Limits - Files - max_file_size_mb: int = Field(default=10, ge=1, le=100) - max_total_file_size_mb: int = Field(default=50, ge=10, le=500) + max_file_size_mb: int = Field(default=100, ge=1, le=500) max_files_per_session: int = Field(default=50, ge=1, le=200) max_output_files: int = Field(default=10, ge=1, le=50) max_filename_length: int = Field(default=255, ge=1, le=255) - # Resource Limits - Sessions - max_concurrent_executions: int = Field(default=10, ge=1, le=50) - max_sessions_per_entity: int = Field(default=100, ge=1, le=1000) - # Session Configuration session_ttl_hours: int = Field(default=24, ge=1, le=168) - session_cleanup_interval_minutes: int = Field(default=10, ge=1, le=1440) - session_id_length: int = Field(default=32, ge=16, le=64) - enable_orphan_minio_cleanup: bool = Field(default=False) + session_cleanup_interval_minutes: int = Field(default=60, ge=1, le=1440) + enable_orphan_minio_cleanup: bool = Field(default=True) - # Container Configuration - container_ttl_minutes: int = Field(default=5, ge=1, le=1440) - container_cleanup_interval_minutes: int = Field(default=5, ge=1, le=60) + # Sandbox Pool Configuration + sandbox_pool_enabled: bool = Field(default=True) + sandbox_pool_warmup_on_startup: bool = Field(default=True) - # Container Pool Configuration - container_pool_enabled: bool = Field(default=True) - container_pool_warmup_on_startup: bool = Field(default=True) - - # Per-language pool sizes (0 = on-demand only, no pre-warming) - container_pool_py: int = Field( - default=5, ge=0, le=50, description="Python pool size" - ) - container_pool_js: int = Field( - default=2, ge=0, le=50, description="JavaScript pool size" - ) - container_pool_ts: int = Field( - default=0, ge=0, le=50, description="TypeScript pool size" + # Python REPL pool size (only Python supports REPL pre-warming) + sandbox_pool_py: int = Field( + default=2, ge=0, le=50, 
description="Python REPL pool size" ) - container_pool_go: int = Field(default=0, ge=0, le=50, description="Go pool size") - container_pool_java: int = Field( - default=0, ge=0, le=50, description="Java pool size" - ) - container_pool_c: int = Field(default=0, ge=0, le=50, description="C pool size") - container_pool_cpp: int = Field(default=0, ge=0, le=50, description="C++ pool size") - container_pool_php: int = Field(default=0, ge=0, le=50, description="PHP pool size") - container_pool_rs: int = Field(default=0, ge=0, le=50, description="Rust pool size") - container_pool_r: int = Field(default=0, ge=0, le=50, description="R pool size") - container_pool_f90: int = Field( - default=0, ge=0, le=50, description="Fortran pool size" - ) - container_pool_d: int = Field(default=0, ge=0, le=50, description="D pool size") # Pool Optimization Configuration - container_pool_parallel_batch: int = Field( + sandbox_pool_parallel_batch: int = Field( default=5, ge=1, le=10, - description="Number of containers to start in parallel during warmup", + description="Number of sandboxes to start in parallel during warmup", ) - container_pool_replenish_interval: int = Field( + sandbox_pool_replenish_interval: int = Field( default=2, ge=1, le=30, description="Seconds between pool replenishment checks" ) - container_pool_exhaustion_trigger: bool = Field( + sandbox_pool_exhaustion_trigger: bool = Field( default=True, description="Trigger immediate replenishment when pool is exhausted", ) - # WAN Network Access Configuration - # When enabled, execution containers can access the public internet - # but are blocked from accessing host, other containers, and private networks - enable_wan_access: bool = Field( - default=False, - description="Enable WAN-only network access for execution containers", - ) - wan_network_name: str = Field( - default="code-interpreter-wan", - description="Docker network name for WAN-access containers", - ) - wan_dns_servers: List[str] = Field( - default_factory=lambda: 
["8.8.8.8", "1.1.1.1", "8.8.4.4"], - description="Public DNS servers for WAN-access containers", - ) - - # Container Hardening Configuration - container_mask_host_info: bool = Field( - default=True, - description="Mask sensitive /proc paths to prevent host info leakage", - ) - container_generic_hostname: str = Field( - default="sandbox", - description="Generic hostname for execution containers", - ) - # REPL Configuration - Pre-warmed Python interpreter for sub-100ms execution repl_enabled: bool = Field( default=True, - description="Enable REPL mode for Python containers (pre-warmed interpreter)", + description="Enable REPL mode for Python sandboxes (pre-warmed interpreter)", ) repl_warmup_timeout_seconds: int = Field( default=15, ge=5, le=60, - description="Timeout for REPL server to become ready after container start", - ) - repl_health_check_timeout_seconds: int = Field( - default=5, - ge=1, - le=30, - description="Timeout for REPL health check during warmup", + description="Timeout for REPL server to become ready after sandbox start", ) - # State Persistence Configuration - Python session state across executions state_persistence_enabled: bool = Field( default=True, description="Enable Python session state persistence via Redis" @@ -269,9 +199,6 @@ class Settings(BaseSettings): le=86400, description="TTL for persisted Python session state in Redis (seconds). 
Default: 2 hours", ) - state_max_size_mb: int = Field( - default=50, ge=1, le=200, description="Maximum size for serialized state in MB" - ) state_capture_on_error: bool = Field( default=False, description="Capture and persist state even when execution fails" ) @@ -304,22 +231,6 @@ class Settings(BaseSettings): default=True, description="Enable detailed per-key, per-language metrics tracking", ) - metrics_buffer_size: int = Field( - default=10000, - ge=1000, - le=100000, - description="Maximum number of recent metrics to buffer in memory", - ) - metrics_archive_enabled: bool = Field( - default=True, - description="Enable archiving metrics to MinIO for long-term storage", - ) - metrics_archive_retention_days: int = Field( - default=90, - ge=7, - le=365, - description="Keep archived metrics in MinIO for this many days", - ) # SQLite Metrics Configuration sqlite_metrics_enabled: bool = Field( @@ -439,19 +350,12 @@ class Settings(BaseSettings): @validator("supported_languages", pre=True, always=True) def _set_supported_languages(cls, v, values): - """Initialize supported_languages with registry-prefixed images.""" + """Initialize supported_languages from the LANGUAGES registry.""" if v: return v - registry = values.get("docker_image_registry", "code-interpreter") - tag = values.get("docker_image_tag", "latest") return { code: { - "image": ( - f"{registry}/{lang.image.rsplit(':', 1)[0]}:{tag}" - if registry - else f"{lang.image.rsplit(':', 1)[0]}:{tag}" - ), "timeout_multiplier": lang.timeout_multiplier, "memory_multiplier": lang.memory_multiplier, } @@ -464,13 +368,9 @@ def _set_supported_languages(cls, v, values): log_file: Optional[str] = Field(default=None) log_max_size_mb: int = Field(default=100, ge=1) log_backup_count: int = Field(default=5, ge=1) - enable_access_logs: bool = Field(default=True) + enable_access_logs: bool = Field(default=False) enable_security_logs: bool = Field(default=True) - # Health Check Configuration - health_check_interval: int = 
Field(default=30, ge=10) - health_check_timeout: int = Field(default=5, ge=1) - # Development Configuration enable_cors: bool = Field(default=False) cors_origins: List[str] = Field(default_factory=list) @@ -480,6 +380,20 @@ def _set_supported_languages(cls, v, values): # VALIDATORS (preserved from original) # ======================================================================== + @validator("api_key") + def warn_auto_generated_api_key(cls, v): + """Log a warning if API_KEY was not explicitly set.""" + import os + + if not os.environ.get("API_KEY"): + _config_logger = structlog.get_logger("config") + _config_logger.warning( + "API_KEY not set in environment; using auto-generated key. " + "Set API_KEY explicitly for production use.", + auto_generated_key=v, + ) + return v + @validator("api_keys") def parse_api_keys(cls, v): """Parse comma-separated API keys into a list.""" @@ -510,7 +424,6 @@ def api(self) -> APIConfig: https_port=self.https_port, ssl_cert_file=self.ssl_cert_file, ssl_key_file=self.ssl_key_file, - ssl_redirect=self.ssl_redirect, ssl_ca_certs=self.ssl_ca_certs, enable_cors=self.enable_cors, cors_origins=self.cors_origins, @@ -518,18 +431,14 @@ def api(self) -> APIConfig: ) @property - def docker(self) -> DockerConfig: - """Access Docker configuration group.""" - return DockerConfig( - docker_base_url=self.docker_base_url, - docker_timeout=self.docker_timeout, - docker_network_mode=self.docker_network_mode, - docker_security_opt=self.docker_security_opt, - docker_cap_drop=self.docker_cap_drop, - docker_read_only=self.docker_read_only, - docker_tmpfs=self.docker_tmpfs, - container_ttl_minutes=self.container_ttl_minutes, - container_cleanup_interval_minutes=self.container_cleanup_interval_minutes, + def sandbox(self) -> SandboxConfig: + """Access sandbox (nsjail) configuration group.""" + return SandboxConfig( + nsjail_binary=self.nsjail_binary, + sandbox_base_dir=self.sandbox_base_dir, + sandbox_tmpfs_size_mb=self.sandbox_tmpfs_size_mb, + 
sandbox_ttl_minutes=self.sandbox_ttl_minutes, + sandbox_cleanup_interval_minutes=self.sandbox_cleanup_interval_minutes, ) @property @@ -555,7 +464,6 @@ def minio(self) -> MinIOConfig: minio_secret_key=self.minio_secret_key, minio_secure=self.minio_secure, minio_bucket=self.minio_bucket, - minio_region=self.minio_region, ) @property @@ -564,8 +472,6 @@ def security(self) -> SecurityConfig: return SecurityConfig( api_key=self.api_key, api_keys=self.api_keys if isinstance(self.api_keys, str) else None, - api_key_header=self.api_key_header, - api_key_cache_ttl=self.api_key_cache_ttl, enable_network_isolation=self.enable_network_isolation, enable_filesystem_isolation=self.enable_filesystem_isolation, enable_security_logs=self.enable_security_logs, @@ -577,19 +483,12 @@ def resources(self) -> ResourcesConfig: return ResourcesConfig( max_execution_time=self.max_execution_time, max_memory_mb=self.max_memory_mb, - max_cpus=self.max_cpus, - max_pids=self.max_pids, - max_open_files=self.max_open_files, max_file_size_mb=self.max_file_size_mb, - max_total_file_size_mb=self.max_total_file_size_mb, max_files_per_session=self.max_files_per_session, max_output_files=self.max_output_files, max_filename_length=self.max_filename_length, - max_concurrent_executions=self.max_concurrent_executions, - max_sessions_per_entity=self.max_sessions_per_entity, session_ttl_hours=self.session_ttl_hours, session_cleanup_interval_minutes=self.session_cleanup_interval_minutes, - session_id_length=self.session_id_length, enable_orphan_minio_cleanup=self.enable_orphan_minio_cleanup, ) @@ -603,20 +502,27 @@ def logging(self) -> LoggingConfig: log_max_size_mb=self.log_max_size_mb, log_backup_count=self.log_backup_count, enable_access_logs=self.enable_access_logs, - health_check_interval=self.health_check_interval, - health_check_timeout=self.health_check_timeout, ) # ======================================================================== # HELPER METHODS (preserved from original) # 
======================================================================== + @property + def https_enabled(self) -> bool: + """Check if HTTPS should be enabled. + + Auto-detects: if enable_https is not explicitly set, returns True + when both ssl_cert_file and ssl_key_file exist on disk. + """ + if self.enable_https is not None: + return self.enable_https + return Path(self.ssl_cert_file).exists() and Path(self.ssl_key_file).exists() + def validate_ssl_files(self) -> bool: """Validate that SSL files exist when HTTPS is enabled.""" - if not self.enable_https: + if not self.https_enabled: return True - if not self.ssl_cert_file or not self.ssl_key_file: - return False return Path(self.ssl_cert_file).exists() and Path(self.ssl_key_file).exists() def get_redis_url(self) -> str: @@ -627,41 +533,10 @@ def get_valid_api_keys(self) -> List[str]: """Get all valid API keys including the primary key.""" return self.security.get_valid_api_keys() - def get_language_config(self, language: str) -> Dict[str, Any]: - """Get configuration for a specific language.""" - return self.supported_languages.get(language, {}) - - def get_image_for_language(self, code: str) -> str: - """Get Docker image for a language.""" - config = self.get_language_config(code) - if config and "image" in config: - return config["image"] - - # Fallback to languages.py logic if not in settings - from .languages import get_image_for_language as get_img - - return get_img( - code, registry=self.docker_image_registry, tag=self.docker_image_tag - ) - - def get_execution_timeout(self, language: str) -> int: - """Get execution timeout for a specific language.""" - multiplier = self.get_language_config(language).get("timeout_multiplier", 1.0) - return int(self.max_execution_time * multiplier) - - def get_memory_limit(self, language: str) -> int: - """Get memory limit for a specific language in MB.""" - multiplier = self.get_language_config(language).get("memory_multiplier", 1.0) - return int(self.max_memory_mb * 
multiplier) - def get_session_ttl_minutes(self) -> int: """Get session TTL in minutes for backward compatibility.""" return self.session_ttl_hours * 60 - def get_container_ttl_minutes(self) -> int: - """Get container TTL in minutes.""" - return self.container_ttl_minutes - def is_file_allowed(self, filename: str) -> bool: """Check if a file is allowed based on extension and patterns.""" extension = Path(filename).suffix.lower() @@ -686,19 +561,18 @@ def is_file_allowed(self, filename: str) -> bool: "settings", # Grouped configs "APIConfig", - "DockerConfig", "RedisConfig", "MinIOConfig", "SecurityConfig", "ResourcesConfig", "LoggingConfig", + "SandboxConfig", # Language configuration "LANGUAGES", "LanguageConfig", "get_language", "get_supported_languages", "is_supported_language", - "get_image_for_language", "get_user_id_for_language", "get_execution_command", "uses_stdin", diff --git a/src/config/api.py b/src/config/api.py index 7a240c0..cbbf07b 100644 --- a/src/config/api.py +++ b/src/config/api.py @@ -13,11 +13,10 @@ class APIConfig(BaseSettings): reload: bool = Field(default=False, alias="api_reload") # SSL/HTTPS Configuration - enable_https: bool = Field(default=False) + enable_https: bool | None = Field(default=None) https_port: int = Field(default=443, ge=1, le=65535) - ssl_cert_file: str | None = Field(default=None) - ssl_key_file: str | None = Field(default=None) - ssl_redirect: bool = Field(default=False) + ssl_cert_file: str = Field(default="/app/ssl/fullchain.pem") + ssl_key_file: str = Field(default="/app/ssl/privkey.pem") ssl_ca_certs: str | None = Field(default=None) # CORS Configuration diff --git a/src/config/docker.py b/src/config/docker.py deleted file mode 100644 index c4c4a5f..0000000 --- a/src/config/docker.py +++ /dev/null @@ -1,40 +0,0 @@ -"""Docker configuration.""" - -from typing import Dict, List, Optional -from pydantic import Field -from pydantic_settings import BaseSettings - - -class DockerConfig(BaseSettings): - """Docker execution 
settings.""" - - base_url: str | None = Field(default=None, alias="docker_base_url") - timeout: int = Field(default=60, ge=10, alias="docker_timeout") - network_mode: str = Field(default="none", alias="docker_network_mode") - security_opt: List[str] = Field( - default_factory=lambda: ["no-new-privileges:true"], alias="docker_security_opt" - ) - cap_drop: List[str] = Field( - default_factory=lambda: ["ALL"], alias="docker_cap_drop" - ) - read_only: bool = Field(default=True, alias="docker_read_only") - tmpfs: Dict[str, str] = Field( - default_factory=lambda: {"/tmp": "rw,noexec,nosuid,size=100m"}, - alias="docker_tmpfs", - ) - seccomp_profile: Optional[str] = Field( - default="docker/seccomp-sandbox.json", - alias="docker_seccomp_profile", - description="Path to seccomp profile JSON file (None to disable)", - ) - - # Container lifecycle - container_ttl_minutes: int = Field(default=5, ge=1, le=1440) - container_cleanup_interval_minutes: int = Field(default=5, ge=1, le=60) - - # Container labeling for isolation - container_label_prefix: str = Field(default="com.code-interpreter") - - class Config: - env_prefix = "" - extra = "ignore" diff --git a/src/config/languages.py b/src/config/languages.py index 68bc671..a6310f9 100644 --- a/src/config/languages.py +++ b/src/config/languages.py @@ -1,9 +1,7 @@ """Unified language configuration - single source of truth. -This module replaces the scattered language configuration across: -- config.py: supported_languages dict -- execution.py: execution_commands, stdin_languages, file_extensions -- containers.py: LANGUAGE_IMAGES, LANGUAGE_USER_IDS +This module defines all supported programming languages and their +execution settings (commands, resource multipliers, user IDs, etc.). """ from dataclasses import dataclass, field @@ -19,8 +17,7 @@ class LanguageConfig: code: str # Short code: "py", "js", "go", etc. name: str # Full name: "Python", "JavaScript", etc. 
- image: str # Docker image to use - user_id: int # Container user ID + user_id: int # Sandbox user ID file_extension: str # File extension without dot: "py", "js", etc. execution_command: str # Command to execute code uses_stdin: bool = False # Whether code is passed via stdin @@ -34,7 +31,6 @@ class LanguageConfig: "py": LanguageConfig( code="py", name="Python", - image="python:latest", user_id=999, file_extension="py", execution_command="python3 -", @@ -45,7 +41,6 @@ class LanguageConfig: "js": LanguageConfig( code="js", name="JavaScript", - image="nodejs:latest", user_id=1001, file_extension="js", execution_command="node", @@ -56,11 +51,10 @@ class LanguageConfig: "ts": LanguageConfig( code="ts", name="TypeScript", - image="nodejs:latest", user_id=1001, file_extension="ts", - execution_command="tsc /mnt/data/code.ts --outDir /mnt/data --module commonjs " - "--target ES2019 && node /mnt/data/code.js", + execution_command="tsc code.ts --outDir . --module commonjs " + "--target ES2019 && node code.js", uses_stdin=False, timeout_multiplier=1.2, memory_multiplier=1.0, @@ -68,7 +62,6 @@ class LanguageConfig: "go": LanguageConfig( code="go", name="Go", - image="go:latest", user_id=1001, file_extension="go", execution_command="go build -o code code.go && ./code", @@ -79,7 +72,6 @@ class LanguageConfig: "java": LanguageConfig( code="java", name="Java", - image="java:latest", user_id=999, file_extension="java", execution_command="javac Code.java && java Code", @@ -90,7 +82,6 @@ class LanguageConfig: "c": LanguageConfig( code="c", name="C", - image="c-cpp:latest", user_id=1001, file_extension="c", execution_command="gcc -o code code.c && ./code", @@ -101,7 +92,6 @@ class LanguageConfig: "cpp": LanguageConfig( code="cpp", name="C++", - image="c-cpp:latest", user_id=1001, file_extension="cpp", execution_command="g++ -o code code.cpp && ./code", @@ -112,7 +102,6 @@ class LanguageConfig: "php": LanguageConfig( code="php", name="PHP", - image="php:latest", user_id=1001, 
file_extension="php", execution_command="php", @@ -123,7 +112,6 @@ class LanguageConfig: "rs": LanguageConfig( code="rs", name="Rust", - image="rust:latest", user_id=1001, file_extension="rs", execution_command="rustc code.rs -o code && ./code", @@ -134,18 +122,16 @@ class LanguageConfig: "r": LanguageConfig( code="r", name="R", - image="r:latest", user_id=1001, file_extension="r", - execution_command="Rscript /dev/stdin", - uses_stdin=True, + execution_command="Rscript code.r", + uses_stdin=False, timeout_multiplier=1.5, memory_multiplier=1.2, ), "f90": LanguageConfig( code="f90", name="Fortran", - image="fortran:latest", user_id=1001, file_extension="f90", execution_command="gfortran -o code code.f90 && ./code", @@ -156,7 +142,6 @@ class LanguageConfig: "d": LanguageConfig( code="d", name="D", - image="d:latest", user_id=0, file_extension="d", execution_command="ldc2 code.d -of=code && ./code", @@ -182,23 +167,8 @@ def is_supported_language(code: str) -> bool: return code.lower() in LANGUAGES -# Convenience lookups for backward compatibility during transition -def get_image_for_language( - code: str, registry: Optional[str] = None, tag: str = "latest" -) -> str: - """Get Docker image for a language.""" - lang = get_language(code) - if lang: - # Extract base image name without the default :latest tag - base_image = lang.image.rsplit(":", 1)[0] - if registry: - return f"{registry}/{base_image}:{tag}" - return f"{base_image}:{tag}" - raise ValueError(f"Unsupported language: {code}") - - def get_user_id_for_language(code: str) -> int: - """Get container user ID for a language.""" + """Get sandbox user ID for a language.""" lang = get_language(code) if lang: return lang.user_id diff --git a/src/config/logging.py b/src/config/logging.py index 5353361..fb14d99 100644 --- a/src/config/logging.py +++ b/src/config/logging.py @@ -12,11 +12,7 @@ class LoggingConfig(BaseSettings): file: str | None = Field(default=None, alias="log_file") max_size_mb: int = Field(default=100, 
ge=1, alias="log_max_size_mb") backup_count: int = Field(default=5, ge=1, alias="log_backup_count") - enable_access_logs: bool = Field(default=True) - - # Health Check - health_check_interval: int = Field(default=30, ge=10) - health_check_timeout: int = Field(default=5, ge=1) + enable_access_logs: bool = Field(default=False) class Config: env_prefix = "" diff --git a/src/config/minio.py b/src/config/minio.py index bd7109c..11a8494 100644 --- a/src/config/minio.py +++ b/src/config/minio.py @@ -16,7 +16,6 @@ class MinIOConfig(BaseSettings): ) secure: bool = Field(default=False, alias="minio_secure") bucket: str = Field(default="code-interpreter-files", alias="minio_bucket") - region: str = Field(default="us-east-1", alias="minio_region") @validator("endpoint") def validate_endpoint(cls, v): diff --git a/src/config/resources.py b/src/config/resources.py index 8fd2380..b4c5c44 100644 --- a/src/config/resources.py +++ b/src/config/resources.py @@ -8,38 +8,19 @@ class ResourcesConfig(BaseSettings): """Resource limits for execution and files.""" # Execution Limits - max_execution_time: int = Field(default=30, ge=1, le=300) + max_execution_time: int = Field(default=120, ge=1, le=300) max_memory_mb: int = Field(default=512, ge=64, le=4096) - max_cpus: float = Field( - default=4.0, - ge=0.5, - le=16.0, - description="Maximum CPU cores available to execution containers", - ) - max_pids: int = Field( - default=512, - ge=64, - le=4096, - description="Per-container process limit (cgroup pids_limit). 
Prevents fork bombs.", - ) - max_open_files: int = Field(default=1024, ge=64, le=4096) # File Limits - max_file_size_mb: int = Field(default=10, ge=1, le=100) - max_total_file_size_mb: int = Field(default=50, ge=10, le=500) + max_file_size_mb: int = Field(default=100, ge=1, le=500) max_files_per_session: int = Field(default=50, ge=1, le=200) max_output_files: int = Field(default=10, ge=1, le=50) max_filename_length: int = Field(default=255, ge=1, le=255) - # Session Limits - max_concurrent_executions: int = Field(default=10, ge=1, le=50) - max_sessions_per_entity: int = Field(default=100, ge=1, le=1000) - # Session Lifecycle session_ttl_hours: int = Field(default=24, ge=1, le=168) - session_cleanup_interval_minutes: int = Field(default=10, ge=1, le=1440) - session_id_length: int = Field(default=32, ge=16, le=64) - enable_orphan_minio_cleanup: bool = Field(default=False) + session_cleanup_interval_minutes: int = Field(default=60, ge=1, le=1440) + enable_orphan_minio_cleanup: bool = Field(default=True) def get_session_ttl_minutes(self) -> int: """Get session TTL in minutes.""" diff --git a/src/config/sandbox.py b/src/config/sandbox.py new file mode 100644 index 0000000..f11b713 --- /dev/null +++ b/src/config/sandbox.py @@ -0,0 +1,27 @@ +"""Sandbox (nsjail) configuration.""" + +from typing import Optional +from pydantic import Field +from pydantic_settings import BaseSettings + + +class SandboxConfig(BaseSettings): + """nsjail sandbox execution settings.""" + + nsjail_binary: str = Field(default="nsjail", alias="nsjail_binary") + sandbox_base_dir: str = Field( + default="/var/lib/code-interpreter/sandboxes", alias="sandbox_base_dir" + ) + sandbox_tmpfs_size_mb: int = Field( + default=100, ge=10, le=1024, alias="sandbox_tmpfs_size_mb" + ) + sandbox_ttl_minutes: int = Field( + default=5, ge=1, le=1440, alias="sandbox_ttl_minutes" + ) + sandbox_cleanup_interval_minutes: int = Field( + default=5, ge=1, le=60, alias="sandbox_cleanup_interval_minutes" + ) + + class Config: 
+ env_prefix = "" + extra = "ignore" diff --git a/src/config/security.py b/src/config/security.py index 64f3805..f61fbea 100644 --- a/src/config/security.py +++ b/src/config/security.py @@ -11,8 +11,6 @@ class SecurityConfig(BaseSettings): # API Key Authentication api_key: str = Field(default="test-api-key", min_length=16) api_keys: str | None = Field(default=None) # Comma-separated additional keys - api_key_header: str = Field(default="x-api-key") - api_key_cache_ttl: int = Field(default=300, ge=60) # Container Isolation enable_network_isolation: bool = Field(default=True) diff --git a/src/dependencies/__init__.py b/src/dependencies/__init__.py index 40794f5..519858b 100644 --- a/src/dependencies/__init__.py +++ b/src/dependencies/__init__.py @@ -11,8 +11,6 @@ get_state_archival_service, FileServiceDep, SessionServiceDep, - StateServiceDep, - StateArchivalServiceDep, ) __all__ = [ @@ -24,6 +22,4 @@ "get_state_archival_service", "FileServiceDep", "SessionServiceDep", - "StateServiceDep", - "StateArchivalServiceDep", ] diff --git a/src/dependencies/auth.py b/src/dependencies/auth.py index 2c32156..e0042e6 100644 --- a/src/dependencies/auth.py +++ b/src/dependencies/auth.py @@ -5,20 +5,18 @@ # Third-party imports import structlog -from fastapi import Request, HTTPException, Depends -from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials +from fastapi import Request, HTTPException, Header # Local application imports +from ..config import settings from ..services.auth import get_auth_service from ..utils.request_helpers import extract_api_key logger = structlog.get_logger(__name__) -security = HTTPBearer(auto_error=False) async def verify_api_key( request: Request, - credentials: Optional[HTTPAuthorizationCredentials] = Depends(security), ) -> str: """ Verify API key authentication. @@ -35,7 +33,7 @@ async def verify_api_key( logger.warning("No API key provided in request") raise HTTPException( status_code=401, - detail="API key required. 
Provide it in x-api-key header or Authorization header.", + detail="API key required. Provide it in the x-api-key header.", ) # Validate API key @@ -47,16 +45,28 @@ async def verify_api_key( return api_key +async def verify_master_key(x_api_key: str = Header(...)): + """Verify the Master API Key for admin operations.""" + if not settings.master_api_key: + raise HTTPException( + status_code=500, + detail="Admin operations are disabled (no MASTER_API_KEY configured)", + ) + + if x_api_key != settings.master_api_key: + raise HTTPException(status_code=403, detail="Invalid Master API Key") + return x_api_key + + async def verify_api_key_optional( request: Request, - credentials: Optional[HTTPAuthorizationCredentials] = Depends(security), ) -> Optional[str]: """ Optional API key verification for endpoints that may not require authentication. Returns None if no API key is provided, raises exception if invalid key is provided. """ try: - return await verify_api_key(request, credentials) + return await verify_api_key(request) except HTTPException as e: if "required" in e.detail: return None # No API key provided, which is OK for optional endpoints diff --git a/src/dependencies/services.py b/src/dependencies/services.py index 3532134..5de3703 100644 --- a/src/dependencies/services.py +++ b/src/dependencies/services.py @@ -20,23 +20,23 @@ logger = structlog.get_logger(__name__) -# Global reference to container pool (set by main.py lifespan) -_container_pool = None +# Global reference to sandbox pool (set by main.py lifespan) +_sandbox_pool = None -def set_container_pool(pool) -> None: - """Set the global container pool reference. +def set_sandbox_pool(pool) -> None: + """Set the global sandbox pool reference. Called by main.py after the pool is initialized in lifespan. 
""" - global _container_pool - _container_pool = pool - logger.info("Container pool registered with dependency injection") + global _sandbox_pool + _sandbox_pool = pool + logger.info("Sandbox pool registered with dependency injection") -def get_container_pool(): - """Get the container pool instance (may be None if disabled).""" - return _container_pool +def get_sandbox_pool(): + """Get the sandbox pool instance (may be None if disabled).""" + return _sandbox_pool @lru_cache() @@ -62,20 +62,20 @@ def get_state_archival_service() -> StateArchivalService: def get_execution_service() -> ExecutionServiceInterface: """Get execution service instance. - Note: Container pool is injected separately after creation via set_container_pool. + Note: Sandbox pool is injected separately after creation via set_sandbox_pool. """ return CodeExecutionService() -def inject_container_pool_to_execution_service(): - """Inject container pool into the execution service. +def inject_sandbox_pool_to_execution_service(): + """Inject sandbox pool into the execution service. Called after pool is initialized to wire it into the cached execution service. 
""" - if _container_pool: + if _sandbox_pool: execution_service = get_execution_service() - execution_service.container_pool = _container_pool - logger.info("Container pool injected into execution service") + execution_service.set_sandbox_pool(_sandbox_pool) + logger.info("Sandbox pool injected into execution service") @lru_cache() @@ -91,8 +91,8 @@ def get_session_service() -> SessionServiceInterface: file_service = get_file_service() # Wire up the dependencies - session_service._execution_service = execution_service - session_service._file_service = file_service + session_service.set_execution_service(execution_service) + session_service.set_file_service(file_service) logger.info("Session service initialized with dependencies") return session_service diff --git a/src/main.py b/src/main.py index 58e95a6..dfef0ab 100644 --- a/src/main.py +++ b/src/main.py @@ -15,13 +15,13 @@ from pydantic import ValidationError # Local application imports -from .api import files, exec, health, state, admin, dashboard_metrics +from .api import files, exec, health, admin, dashboard_metrics from .config import settings from .middleware.security import SecurityMiddleware, RequestLoggingMiddleware from .middleware.metrics import MetricsMiddleware from .models.errors import CodeInterpreterException from .services.health import health_service -from .services.metrics import metrics_collector +from .services.metrics import metrics_service from .utils.config_validator import validate_configuration, get_configuration_summary from .utils.error_handlers import ( code_interpreter_exception_handler, @@ -37,77 +37,28 @@ logger = structlog.get_logger() -@asynccontextmanager -async def lifespan(app: FastAPI): - """Application lifespan manager.""" - # Startup - logger.info("Starting Code Interpreter API", version="1.0.0") - - # Setup graceful shutdown callbacks (uvicorn handles signals) - setup_graceful_shutdown() - - # Validate configuration on startup - if not validate_configuration(): - 
logger.error("Configuration validation failed - shutting down") - sys.exit(1) - - # Log security warnings if applicable - if settings.api_key == "test-api-key": - logger.warning("Using default API key - CHANGE THIS IN PRODUCTION!") - - if settings.api_debug: - logger.warning("Debug mode is enabled - disable in production") - - # Log API key management status - if settings.master_api_key: - logger.info("API key management enabled (MASTER_API_KEY configured)") - else: - logger.info("API key management: CLI disabled (no MASTER_API_KEY set)") - - logger.info( - "Rate limiting configuration", rate_limit_enabled=settings.rate_limit_enabled - ) - - # Start monitoring services +async def _startup_monitoring(app: FastAPI) -> None: + """Start metrics and monitoring services.""" try: - logger.info("Starting metrics collector...") - await metrics_collector.start() - logger.info("Metrics collector started successfully") + await metrics_service.start() + metrics_service.register_event_handlers() + logger.info("Metrics service started") except Exception as e: - logger.error("Failed to start metrics collector", error=str(e)) - # Don't fail startup if metrics collector fails + logger.error("Failed to start metrics service", error=str(e)) - # Start SQLite metrics service for long-term analytics - if settings.sqlite_metrics_enabled: - try: - logger.info("Starting SQLite metrics service...") - from .services.sqlite_metrics import sqlite_metrics_service - - await sqlite_metrics_service.start() - app.state.sqlite_metrics_service = sqlite_metrics_service - logger.info( - "SQLite metrics service started successfully", - db_path=settings.sqlite_metrics_db_path, - ) - except Exception as e: - logger.error("Failed to start SQLite metrics service", error=str(e)) - # Don't fail startup if SQLite metrics fails - # Start session cleanup task +async def _startup_cleanup_tasks() -> None: + """Start session cleanup and event-driven cleanup scheduler.""" try: - logger.info("Starting session cleanup 
task...") from .dependencies.services import get_session_service session_service = get_session_service() await session_service.start_cleanup_task() - logger.info("Session cleanup task started successfully") + logger.info("Session cleanup task started") except Exception as e: logger.error("Failed to start session cleanup task", error=str(e)) - # Don't fail startup if cleanup task fails - # Start event-driven cleanup scheduler try: - logger.info("Starting cleanup scheduler...") from .services.cleanup import cleanup_scheduler from .dependencies.services import ( get_execution_service, @@ -123,93 +74,56 @@ async def lifespan(app: FastAPI): ), ) cleanup_scheduler.start() - logger.info( - "Cleanup scheduler started successfully", - state_archival_enabled=settings.state_archive_enabled, - ) + logger.info("Cleanup scheduler started") except Exception as e: logger.error("Failed to start cleanup scheduler", error=str(e)) - # Don't fail startup if cleanup scheduler fails - # Initialize WAN network for container internet access if enabled - # IMPORTANT: This must happen BEFORE the container pool starts - if settings.enable_wan_access: - try: - logger.info("Initializing WAN network for container internet access...") - from .services.container.network import WANNetworkManager - from .services.container.manager import ContainerManager - - temp_manager = ContainerManager() - if temp_manager.is_available(): - wan_network_manager = WANNetworkManager(temp_manager.client) - if await wan_network_manager.initialize(): - app.state.wan_network_manager = wan_network_manager - logger.info( - "WAN network initialized successfully", - network_name=settings.wan_network_name, - dns_servers=settings.wan_dns_servers, - ) - else: - logger.error("Failed to initialize WAN network") - else: - logger.warning("Docker not available, skipping WAN network setup") - except Exception as e: - logger.error("Error initializing WAN network", error=str(e)) - # Don't fail startup if WAN network fails - else: - 
logger.info("WAN network access disabled (containers have no network access)") - # Start container pool if enabled - container_pool = None - if settings.container_pool_enabled: +async def _startup_sandbox_pool(app: FastAPI) -> None: + """Start the sandbox pool if enabled.""" + if settings.sandbox_pool_enabled: try: - logger.info("Starting container pool...") - from .services.container.pool import ContainerPool - from .services.container.manager import ContainerManager + from .services.sandbox.pool import SandboxPool + from .services.sandbox.manager import SandboxManager from .services.cleanup import cleanup_scheduler from .dependencies.services import ( - set_container_pool, - inject_container_pool_to_execution_service, + set_sandbox_pool, + inject_sandbox_pool_to_execution_service, ) - container_manager = ContainerManager() - container_pool = ContainerPool(container_manager) - await container_pool.start() + sandbox_manager = SandboxManager() + sandbox_pool = SandboxPool(sandbox_manager) + await sandbox_pool.start() # Connect pool to cleanup scheduler - cleanup_scheduler.set_container_pool(container_pool) + cleanup_scheduler.set_sandbox_pool(sandbox_pool) # Register pool with dependency injection system - set_container_pool(container_pool) - inject_container_pool_to_execution_service() + set_sandbox_pool(sandbox_pool) + inject_sandbox_pool_to_execution_service() # Register pool with health service for monitoring - health_service.set_container_pool(container_pool) + health_service.set_sandbox_pool(sandbox_pool) # Store pool reference in app state - app.state.container_pool = container_pool + app.state.sandbox_pool = sandbox_pool - logger.info( - "Container pool started successfully", - warmup_languages=["py", "js", "ts", "go", "java"], - ) + logger.info("Sandbox pool started") except Exception as e: - logger.error("Failed to start container pool", error=str(e)) - # Don't fail startup if container pool fails - container_pool = None + logger.error("Failed to start 
sandbox pool", error=str(e)) else: - logger.info("Container pool disabled by configuration") + logger.info("Sandbox pool disabled") + - # Perform initial health checks +async def _perform_health_checks() -> None: + """Perform initial health checks on all services.""" try: - logger.info("Performing initial health checks...") health_results = await health_service.check_all_services(use_cache=False) - # Log health check results for service_name, result in health_results.items(): if result.status.value == "healthy": - logger.info( - f"{service_name} health check passed", + logger.debug( + f"{service_name} healthy", response_time_ms=result.response_time_ms, ) else: @@ -220,49 +134,26 @@ async def lifespan(app: FastAPI): ) overall_status = health_service.get_overall_status(health_results) - logger.info( - "Initial health checks completed", overall_status=overall_status.value - ) - + logger.info("Health checks completed", overall_status=overall_status.value) except Exception as e: logger.error("Initial health checks failed", error=str(e)) - # Don't fail startup if health checks fail - - logger.info("Code Interpreter API startup completed") - yield - # Shutdown - logger.info("Shutting down Code Interpreter API") - - # Cleanup WAN network iptables rules - if hasattr(app.state, "wan_network_manager") and app.state.wan_network_manager: - try: - await app.state.wan_network_manager.cleanup() - logger.info("WAN network iptables rules cleaned up") - except Exception as e: - logger.error("Error cleaning up WAN network", error=str(e)) - - # Stop SQLite metrics service (flush pending writes) - if ( - hasattr(app.state, "sqlite_metrics_service") - and app.state.sqlite_metrics_service - ): - try: - await app.state.sqlite_metrics_service.stop() - logger.info("SQLite metrics service stopped") - except Exception as e: - logger.error("Error stopping SQLite metrics service", error=str(e)) +async def _shutdown_services(app: FastAPI) -> None: + """Stop monitoring services, sandbox pool, and 
cleanup scheduler.""" + try: + await metrics_service.stop() + logger.info("Metrics service stopped") + except Exception as e: + logger.error("Error stopping metrics service", error=str(e)) - # Stop container pool first (it manages active containers) - if hasattr(app.state, "container_pool") and app.state.container_pool: + if hasattr(app.state, "sandbox_pool") and app.state.sandbox_pool: try: - await app.state.container_pool.stop() - logger.info("Container pool stopped") + await app.state.sandbox_pool.stop() + logger.info("Sandbox pool stopped") except Exception as e: - logger.error("Error stopping container pool", error=str(e)) + logger.error("Error stopping sandbox pool", error=str(e)) - # Stop cleanup scheduler try: from .services.cleanup import cleanup_scheduler @@ -271,7 +162,39 @@ async def lifespan(app: FastAPI): except Exception as e: logger.error("Error stopping cleanup scheduler", error=str(e)) - # Perform graceful shutdown + +@asynccontextmanager +async def lifespan(app: FastAPI): + """Application lifespan manager.""" + logger.info("Starting Code Interpreter API", version="1.0.0") + + setup_graceful_shutdown() + + if not validate_configuration(): + logger.error("Configuration validation failed - shutting down") + sys.exit(1) + + if settings.api_key == "test-api-key": + logger.warning("Using default API key - CHANGE THIS IN PRODUCTION!") + if settings.api_debug: + logger.warning("Debug mode is enabled - disable in production") + if settings.master_api_key: + logger.info("API key management enabled") + logger.debug("Rate limiting", enabled=settings.rate_limit_enabled) + + await _startup_monitoring(app) + await _startup_cleanup_tasks() + await _startup_sandbox_pool(app) + await _perform_health_checks() + + logger.info("Code Interpreter API startup completed") + + yield + + logger.info("Shutting down Code Interpreter API") + + await _shutdown_services(app) + try: await shutdown_handler.shutdown() except Exception as e: @@ -319,20 +242,6 @@ async def 
lifespan(app: FastAPI): app.add_exception_handler(Exception, general_exception_handler) -@app.get("/health") -async def health_check(): - """Health check endpoint.""" - return { - "status": "healthy", - "version": "1.0.0", - "config": { - "debug": settings.api_debug, - "docs_enabled": settings.enable_docs, - "cors_enabled": settings.enable_cors, - }, - } - - @app.get("/config") async def config_info(): """Configuration information endpoint (non-sensitive data only).""" @@ -350,8 +259,6 @@ async def config_info(): app.include_router(health.router, tags=["health", "monitoring"]) -app.include_router(state.router, tags=["state"]) - app.include_router(admin.router, prefix="/api/v1", tags=["admin"]) app.include_router(dashboard_metrics.router, prefix="/api/v1", tags=["admin-metrics"]) @@ -377,7 +284,7 @@ async def get_admin_dashboard_deep_link(rest_of_path: str): def run_server(): - if settings.enable_https: + if settings.https_enabled: # Validate SSL files exist if not settings.validate_ssl_files(): logger.error("SSL configuration invalid - missing certificate files") @@ -400,6 +307,8 @@ def run_server(): port=settings.https_port, reload=settings.api_reload, log_level=settings.log_level.lower(), + access_log=settings.enable_access_logs, + timeout_keep_alive=120, **ssl_config, ) else: @@ -410,6 +319,8 @@ def run_server(): port=settings.api_port, reload=settings.api_reload, log_level=settings.log_level.lower(), + access_log=settings.enable_access_logs, + timeout_keep_alive=120, ) diff --git a/src/middleware/metrics.py b/src/middleware/metrics.py index e853d61..a1c8520 100644 --- a/src/middleware/metrics.py +++ b/src/middleware/metrics.py @@ -2,58 +2,76 @@ # Standard library imports import time -from typing import Callable # Third-party imports import structlog -from fastapi import Request, Response -from starlette.middleware.base import BaseHTTPMiddleware # Local application imports -from ..services.metrics import metrics_collector, APIMetrics +from ..services.metrics 
import metrics_service, APIRequestMetrics from ..config import settings logger = structlog.get_logger(__name__) -class MetricsMiddleware(BaseHTTPMiddleware): - """Optimized middleware to collect essential API request metrics.""" +class MetricsMiddleware: + """Optimized ASGI middleware to collect essential API request metrics.""" - async def dispatch(self, request: Request, call_next: Callable) -> Response: - """Process request and collect essential metrics.""" - start_time = time.time() - - # Process request - response = await call_next(request) - - # Calculate response time - response_time_ms = (time.time() - start_time) * 1000 + def __init__(self, app): + self.app = app - # Normalize endpoint path for metrics - normalized_endpoint = self._normalize_endpoint(request.url.path) + async def __call__(self, scope, receive, send): + """Process request and collect essential metrics.""" + if scope["type"] != "http": + await self.app(scope, receive, send) + return - # Create simplified metrics record - api_metrics = APIMetrics( - endpoint=normalized_endpoint, - method=request.method, - status_code=response.status_code, - response_time_ms=response_time_ms, - request_size_bytes=0, # Simplified - not essential for monitoring - response_size_bytes=0, # Simplified - not essential for monitoring - user_agent=None, # Simplified - not essential for core metrics - ) + start_time = time.time() + status_code = 500 # Default in case of error + + async def send_wrapper(message): + nonlocal status_code + if message["type"] == "http.response.start": + status_code = message.get("status", 500) + + # Only add debug headers in debug mode + if settings.api_debug: + response_time_ms = (time.time() - start_time) * 1000 + headers = list(message.get("headers", [])) + headers.append( + ( + b"x-response-time-ms", + str(round(response_time_ms, 2)).encode(), + ) + ) + message = {**message, "headers": headers} + + await send(message) - # Record metrics (fail silently to avoid impacting performance) 
try: - metrics_collector.record_api_metrics(api_metrics) - except Exception as e: - logger.error("Failed to record API metrics", error=str(e)) - - # Only add debug headers in debug mode - if settings.api_debug: - response.headers["X-Response-Time-Ms"] = str(round(response_time_ms, 2)) - - return response + await self.app(scope, receive, send_wrapper) + finally: + # Calculate response time + response_time_ms = (time.time() - start_time) * 1000 + + # Normalize endpoint path for metrics + path = scope.get("path", "") + normalized_endpoint = self._normalize_endpoint(path) + + method = scope.get("method", "GET") + + # Create simplified metrics record + api_metrics = APIRequestMetrics( + endpoint=normalized_endpoint, + method=method, + status_code=status_code, + response_time_ms=response_time_ms, + ) + + # Record metrics (fail silently to avoid impacting performance) + try: + metrics_service.record_api_request(api_metrics) + except Exception as e: + logger.error("Failed to record API metrics", error=str(e)) def _normalize_endpoint(self, path: str) -> str: """Simplified endpoint path normalization.""" diff --git a/src/middleware/security.py b/src/middleware/security.py index 68f434b..5772706 100644 --- a/src/middleware/security.py +++ b/src/middleware/security.py @@ -1,7 +1,9 @@ """Consolidated security middleware for the Code Interpreter API.""" # Standard library imports +import hmac import time +from datetime import datetime, timezone from typing import Callable, Optional # Third-party imports @@ -27,8 +29,6 @@ def __init__(self, app: Callable): "/docs", "/redoc", "/openapi.json", - "/api/v1/admin", - "/admin-dashboard", } async def __call__(self, scope: dict, receive: Callable, send: Callable): @@ -142,12 +142,14 @@ async def _validate_request(self, request: Request): def _should_skip_auth(self, request: Request) -> bool: """Check if authentication should be skipped.""" path = request.url.path - return ( - path in self.excluded_paths - or 
path.startswith("/api/v1/admin") - or path.startswith("/admin-dashboard") - or request.method == "OPTIONS" - ) + if path in self.excluded_paths or request.method == "OPTIONS": + return True + # Allow the admin dashboard UI (HTML/static assets) to load without auth. + # The dashboard itself has a login form where users enter the master key, + # which is then sent as a header with API requests. + if path.startswith("/admin-dashboard"): + return True + return False async def _authenticate_request(self, request: Request, scope: dict): """Handle API key authentication with rate limiting.""" @@ -165,6 +167,20 @@ async def _authenticate_request(self, request: Request, scope: dict): detail="Too many authentication failures. Please try again later.", ) + # For admin endpoints, accept the master API key directly + path = request.url.path + is_admin_path = path.startswith("/api/v1/admin") or path.startswith( + "/admin-dashboard" + ) + if is_admin_path and api_key and settings.master_api_key: + if hmac.compare_digest(api_key, settings.master_api_key): + scope["state"] = scope.get("state", {}) + scope["state"]["authenticated"] = True + scope["state"]["api_key"] = api_key + scope["state"]["api_key_hash"] = "master" + scope["state"]["is_env_key"] = True + return + # Validate API key with full details result = await auth_service.validate_api_key_full(api_key) @@ -185,18 +201,14 @@ async def _authenticate_request(self, request: Request, scope: dict): "X-RateLimit-Reset": exceeded.resets_at.isoformat(), "X-RateLimit-Period": exceeded.period, "Retry-After": str( - int( - ( - exceeded.resets_at - - exceeded.resets_at.replace( - hour=exceeded.resets_at.hour, - minute=0, - second=0, - microsecond=0, - ) - ).total_seconds() + max( + int( + ( + exceeded.resets_at - datetime.now(timezone.utc) + ).total_seconds() + ), + 60, ) - or 60 ), } raise HTTPException( @@ -264,10 +276,15 @@ async def send_wrapper(message): finally: if not skip_logging: duration = time.time() - start_time - 
logger.info( - "Request processed", + log_kwargs = dict( method=request.method, path=request.url.path, status=response_status, duration_ms=round(duration * 1000, 2), ) + if response_status and response_status >= 500: + logger.error("Request failed", **log_kwargs) + elif response_status and response_status >= 400: + logger.warning("Request error", **log_kwargs) + else: + logger.debug("Request processed", **log_kwargs) diff --git a/src/models/__init__.py b/src/models/__init__.py index b8eb0c9..5cbb100 100644 --- a/src/models/__init__.py +++ b/src/models/__init__.py @@ -5,7 +5,6 @@ SessionStatus, SessionCreate, SessionResponse, - FileInfo as SessionFileInfo, ) from .execution import ( CodeExecution, @@ -29,20 +28,10 @@ ErrorDetail, ErrorResponse, CodeInterpreterException, - AuthenticationError, - AuthorizationError, ValidationError, - ResourceNotFoundError, - ResourceConflictError, - ResourceExhaustedError, - ExecutionError, - TimeoutError, - RateLimitError, ServiceUnavailableError, - ExternalServiceError, ) from .pool import PooledContainer, PoolStats, PoolConfig -from .state import StateInfo, StateUploadResponse __all__ = [ # Session models @@ -50,7 +39,6 @@ "SessionStatus", "SessionCreate", "SessionResponse", - "SessionFileInfo", # Execution models "CodeExecution", "ExecutionStatus", @@ -75,22 +63,10 @@ "ErrorDetail", "ErrorResponse", "CodeInterpreterException", - "AuthenticationError", - "AuthorizationError", "ValidationError", - "ResourceNotFoundError", - "ResourceConflictError", - "ResourceExhaustedError", - "ExecutionError", - "TimeoutError", - "RateLimitError", "ServiceUnavailableError", - "ExternalServiceError", # Pool models "PooledContainer", "PoolStats", "PoolConfig", - # State models - "StateInfo", - "StateUploadResponse", ] diff --git a/src/models/errors.py b/src/models/errors.py index b64f2b9..bd1571b 100644 --- a/src/models/errors.py +++ b/src/models/errors.py @@ -79,30 +79,6 @@ def to_response(self) -> ErrorResponse: ) -class 
AuthenticationError(CodeInterpreterException): - """Authentication related errors.""" - - def __init__(self, message: str = "Authentication failed", **kwargs): - super().__init__( - message=message, - error_type=ErrorType.AUTHENTICATION, - status_code=401, - **kwargs, - ) - - -class AuthorizationError(CodeInterpreterException): - """Authorization related errors.""" - - def __init__(self, message: str = "Access denied", **kwargs): - super().__init__( - message=message, - error_type=ErrorType.AUTHORIZATION, - status_code=403, - **kwargs, - ) - - class ValidationError(CodeInterpreterException): """Request validation errors.""" @@ -112,81 +88,6 @@ def __init__(self, message: str = "Validation failed", **kwargs): ) -class ResourceNotFoundError(CodeInterpreterException): - """Resource not found errors.""" - - def __init__(self, resource: str, resource_id: str = None, **kwargs): - message = f"{resource} not found" - if resource_id: - message += f": {resource_id}" - super().__init__( - message=message, - error_type=ErrorType.RESOURCE_NOT_FOUND, - status_code=404, - **kwargs, - ) - - -class ResourceConflictError(CodeInterpreterException): - """Resource conflict errors.""" - - def __init__(self, message: str = "Resource conflict", **kwargs): - super().__init__( - message=message, - error_type=ErrorType.RESOURCE_CONFLICT, - status_code=409, - **kwargs, - ) - - -class ResourceExhaustedError(CodeInterpreterException): - """Resource exhaustion errors.""" - - def __init__(self, resource: str, **kwargs): - super().__init__( - message=f"{resource} limit exceeded", - error_type=ErrorType.RESOURCE_EXHAUSTED, - status_code=429, - **kwargs, - ) - - -class ExecutionError(CodeInterpreterException): - """Code execution related errors.""" - - def __init__(self, message: str = "Code execution failed", **kwargs): - super().__init__( - message=message, - error_type=ErrorType.EXECUTION_FAILED, - status_code=422, - **kwargs, - ) - - -class TimeoutError(CodeInterpreterException): - """Timeout 
related errors.""" - - def __init__(self, operation: str, timeout: int, **kwargs): - super().__init__( - message=f"{operation} timed out after {timeout} seconds", - error_type=ErrorType.TIMEOUT, - status_code=408, - **kwargs, - ) - - -class RateLimitError(CodeInterpreterException): - """Rate limiting errors.""" - - def __init__(self, message: str = "Rate limit exceeded", **kwargs): - super().__init__( - message=message, - error_type=ErrorType.RATE_LIMITED, - status_code=429, - **kwargs, - ) - - class ServiceUnavailableError(CodeInterpreterException): """Service unavailable errors.""" @@ -198,16 +99,3 @@ def __init__(self, service: str, message: str = None, **kwargs): status_code=503, **kwargs, ) - - -class ExternalServiceError(CodeInterpreterException): - """External service integration errors.""" - - def __init__(self, service: str, message: str = None, **kwargs): - error_message = message or f"External service error: {service}" - super().__init__( - message=error_message, - error_type=ErrorType.EXTERNAL_SERVICE, - status_code=502, - **kwargs, - ) diff --git a/src/models/exec.py b/src/models/exec.py index 734f765..a7cfb63 100644 --- a/src/models/exec.py +++ b/src/models/exec.py @@ -23,10 +23,6 @@ class RequestFile(BaseModel): id: str session_id: str name: str - restore_state: bool = Field( - default=False, - description="If true, restore Python state from when this file was last used", - ) class ExecRequest(BaseModel): @@ -62,17 +58,6 @@ class ExecResponse(BaseModel): files: List[FileRef] = Field(default_factory=list) stdout: str = "" stderr: str = "" - # State persistence fields (Python only) - has_state: bool = Field( - default=False, - description="Whether Python state was captured (Python executions only)", - ) - state_size: Optional[int] = Field( - default=None, description="Compressed state size in bytes" - ) - state_hash: Optional[str] = Field( - default=None, description="SHA256 hash for ETag/change detection" - ) class Config: json_encoders = {datetime: 
lambda v: v.isoformat()} diff --git a/src/models/metrics.py b/src/models/metrics.py index 4a0ef70..00dd97e 100644 --- a/src/models/metrics.py +++ b/src/models/metrics.py @@ -1,55 +1,23 @@ -"""Extended metrics data models for detailed usage tracking. +"""Metrics data models for execution tracking and analytics.""" -These models extend the basic metrics with additional dimensions: -- Per-API-key tracking -- Per-language breakdown -- Container pool metrics -- Detailed resource consumption -""" - -from dataclasses import dataclass, field +from dataclasses import dataclass, asdict, field from datetime import datetime, timezone -from typing import Dict, Any, Optional, List -from enum import Enum - - -class AggregationPeriod(str, Enum): - """Time period for metrics aggregation.""" - - HOURLY = "hourly" - DAILY = "daily" - MONTHLY = "monthly" - - -class ContainerSource(str, Enum): - """Source of container for execution.""" - - POOL_HIT = "pool_hit" # Container from warm pool - POOL_MISS = "pool_miss" # Created fresh (pool exhausted or disabled) - POOL_DISABLED = "pool_disabled" # Pool is disabled - - -class ExecutionStatus(str, Enum): - """Execution result status.""" - - COMPLETED = "completed" - FAILED = "failed" - TIMEOUT = "timeout" +from typing import Any, Dict, Optional @dataclass class DetailedExecutionMetrics: """Per-execution metrics with all dimensions for tracking. - This extends the basic ExecutionMetrics with additional fields - for per-key and per-language analytics. + Used as the single metrics record type throughout the system. + Written to SQLite for long-term storage and dashboard queries. 
""" execution_id: str session_id: str api_key_hash: str # SHA256 hash (first 16 chars) for grouping - user_id: Optional[str] # From request - entity_id: Optional[str] # From request + user_id: Optional[str] + entity_id: Optional[str] language: str status: str # completed, failed, timeout execution_time_ms: float @@ -65,25 +33,9 @@ class DetailedExecutionMetrics: def to_dict(self) -> Dict[str, Any]: """Convert to dictionary for JSON serialization.""" - return { - "execution_id": self.execution_id, - "session_id": self.session_id, - "api_key_hash": self.api_key_hash, - "user_id": self.user_id, - "entity_id": self.entity_id, - "language": self.language, - "status": self.status, - "execution_time_ms": self.execution_time_ms, - "memory_peak_mb": self.memory_peak_mb, - "cpu_time_ms": self.cpu_time_ms, - "container_source": self.container_source, - "repl_mode": self.repl_mode, - "files_uploaded": self.files_uploaded, - "files_generated": self.files_generated, - "output_size_bytes": self.output_size_bytes, - "state_size_bytes": self.state_size_bytes, - "timestamp": self.timestamp.isoformat(), - } + d = asdict(self) + d["timestamp"] = self.timestamp.isoformat() + return d @classmethod def from_dict(cls, data: Dict[str, Any]) -> "DetailedExecutionMetrics": @@ -113,156 +65,3 @@ def from_dict(cls, data: Dict[str, Any]) -> "DetailedExecutionMetrics": state_size_bytes=data.get("state_size_bytes"), timestamp=timestamp, ) - - -@dataclass -class LanguageMetrics: - """Per-language aggregated metrics.""" - - language: str - execution_count: int = 0 - success_count: int = 0 - failure_count: int = 0 - timeout_count: int = 0 - total_execution_time_ms: float = 0 - total_memory_mb: float = 0 - avg_execution_time_ms: float = 0 - avg_memory_mb: float = 0 - error_rate: float = 0.0 # Percentage (0-100) - repl_mode_count: int = 0 # Executions using REPL - - def to_dict(self) -> Dict[str, Any]: - """Convert to dictionary.""" - return { - "language": self.language, - "execution_count": 
self.execution_count, - "success_count": self.success_count, - "failure_count": self.failure_count, - "timeout_count": self.timeout_count, - "total_execution_time_ms": self.total_execution_time_ms, - "total_memory_mb": self.total_memory_mb, - "avg_execution_time_ms": self.avg_execution_time_ms, - "avg_memory_mb": self.avg_memory_mb, - "error_rate": self.error_rate, - "repl_mode_count": self.repl_mode_count, - } - - -@dataclass -class ApiKeyUsageMetrics: - """Per-API-key aggregated metrics.""" - - api_key_hash: str - execution_count: int = 0 - success_count: int = 0 - failure_count: int = 0 - total_execution_time_ms: float = 0 - total_memory_mb: float = 0 - file_operations: int = 0 - success_rate: float = 100.0 # Percentage (0-100) - - def to_dict(self) -> Dict[str, Any]: - """Convert to dictionary.""" - return { - "api_key_hash": self.api_key_hash, - "execution_count": self.execution_count, - "success_count": self.success_count, - "failure_count": self.failure_count, - "total_execution_time_ms": self.total_execution_time_ms, - "total_memory_mb": self.total_memory_mb, - "file_operations": self.file_operations, - "success_rate": self.success_rate, - } - - -@dataclass -class PoolMetricsSummary: - """Container pool metrics.""" - - total_acquisitions: int = 0 - pool_hits: int = 0 - pool_misses: int = 0 - hit_rate: float = 0.0 # Percentage (0-100) - avg_acquire_time_ms: float = 0 - exhaustion_events: int = 0 # Times pool was empty when needed - - def to_dict(self) -> Dict[str, Any]: - """Convert to dictionary.""" - return { - "total_acquisitions": self.total_acquisitions, - "pool_hits": self.pool_hits, - "pool_misses": self.pool_misses, - "hit_rate": self.hit_rate, - "avg_acquire_time_ms": self.avg_acquire_time_ms, - "exhaustion_events": self.exhaustion_events, - } - - -@dataclass -class AggregatedMetrics: - """Aggregated metrics for a time period.""" - - period: str # ISO format: "2025-12-20T14:00:00Z" - period_type: str # hourly, daily, monthly - execution_count: int = 
0 - success_count: int = 0 - failure_count: int = 0 - timeout_count: int = 0 - total_execution_time_ms: float = 0 - avg_execution_time_ms: float = 0 - p50_execution_time_ms: float = 0 - p95_execution_time_ms: float = 0 - p99_execution_time_ms: float = 0 - total_memory_mb: float = 0 - avg_memory_mb: float = 0 - by_language: Dict[str, LanguageMetrics] = field(default_factory=dict) - by_api_key: Dict[str, ApiKeyUsageMetrics] = field(default_factory=dict) - pool_stats: Optional[PoolMetricsSummary] = None - - def to_dict(self) -> Dict[str, Any]: - """Convert to dictionary for JSON serialization.""" - return { - "period": self.period, - "period_type": self.period_type, - "execution_count": self.execution_count, - "success_count": self.success_count, - "failure_count": self.failure_count, - "timeout_count": self.timeout_count, - "total_execution_time_ms": self.total_execution_time_ms, - "avg_execution_time_ms": self.avg_execution_time_ms, - "p50_execution_time_ms": self.p50_execution_time_ms, - "p95_execution_time_ms": self.p95_execution_time_ms, - "p99_execution_time_ms": self.p99_execution_time_ms, - "total_memory_mb": self.total_memory_mb, - "avg_memory_mb": self.avg_memory_mb, - "by_language": {k: v.to_dict() for k, v in self.by_language.items()}, - "by_api_key": {k: v.to_dict() for k, v in self.by_api_key.items()}, - "pool_stats": self.pool_stats.to_dict() if self.pool_stats else None, - } - - -@dataclass -class MetricsSummary: - """High-level metrics summary for dashboard/status.""" - - total_executions: int = 0 - total_executions_today: int = 0 - total_executions_hour: int = 0 - success_rate: float = 100.0 - avg_execution_time_ms: float = 0 - active_api_keys: int = 0 - top_languages: List[Dict[str, Any]] = field(default_factory=list) - pool_hit_rate: float = 0.0 - - def to_dict(self) -> Dict[str, Any]: - """Convert to dictionary.""" - return { - "total_executions": self.total_executions, - "total_executions_today": self.total_executions_today, - 
"total_executions_hour": self.total_executions_hour, - "success_rate": self.success_rate, - "avg_execution_time_ms": self.avg_execution_time_ms, - "active_api_keys": self.active_api_keys, - "top_languages": self.top_languages, - "pool_hit_rate": self.pool_hit_rate, - } diff --git a/src/models/pool.py b/src/models/pool.py index 19fbc2b..d5930eb 100644 --- a/src/models/pool.py +++ b/src/models/pool.py @@ -1,7 +1,7 @@ -"""Container pool data models. +"""Sandbox pool data models. -These models track containers in the pool. The pool is stateless with respect -to sessions - containers are provided fresh and destroyed after each execution. +These models track sandboxes in the pool. The pool is stateless with respect +to sessions - sandboxes are provided fresh and destroyed after each execution. """ from dataclasses import dataclass, field @@ -10,37 +10,39 @@ @dataclass -class PooledContainer: - """Represents a container available in the pool. +class PooledSandbox: + """Represents a sandbox available in the pool. - Containers in the pool are pre-warmed and ready to be used. - After use, containers are destroyed (not returned to pool). + Sandboxes in the pool are pre-warmed and ready to be used. + After use, sandboxes are destroyed (not returned to pool). 
""" - container_id: str + sandbox_id: str language: str - image: str created_at: datetime - status: Literal["available", "starting", "unhealthy"] = "available" - repl_enabled: bool = False # Whether REPL mode is enabled for this container - repl_ready: bool = False # Whether REPL server is ready and responsive + status: Literal["available"] = "available" + repl_enabled: bool = False + repl_ready: bool = False def __hash__(self): - return hash(self.container_id) + return hash(self.sandbox_id) def __eq__(self, other): - if not isinstance(other, PooledContainer): + if not isinstance(other, PooledSandbox): return False - return self.container_id == other.container_id + return self.sandbox_id == other.sandbox_id + + +# Backward compatibility alias +PooledContainer = PooledSandbox @dataclass class PoolStats: - """Container pool statistics for monitoring.""" + """Sandbox pool statistics for monitoring.""" language: str - available_count: int - assigned_count: int # Kept for backward compatibility (always 0 now) + available_count: int = 0 total_acquisitions: int = 0 pool_hits: int = 0 # Acquired from pool pool_misses: int = 0 # Created fresh (pool empty) @@ -52,7 +54,7 @@ class PoolStats: @dataclass class PoolConfig: - """Configuration for a language-specific container pool.""" + """Configuration for a language-specific sandbox pool.""" language: str size: int # Single pool size (0 = on-demand only) @@ -60,28 +62,17 @@ class PoolConfig: @classmethod def from_settings(cls, language: str) -> "PoolConfig": - """Create pool config from settings for a specific language.""" + """Create pool config from settings for a specific language. + + Only Python supports REPL pool pre-warming. All other languages + use one-shot nsjail execution with no pooling. 
+ """ from ..config import settings - # Map language to its pool size setting - pool_sizes = { - "py": settings.container_pool_py, - "js": settings.container_pool_js, - "ts": settings.container_pool_ts, - "go": settings.container_pool_go, - "java": settings.container_pool_java, - "c": settings.container_pool_c, - "cpp": settings.container_pool_cpp, - "php": settings.container_pool_php, - "rs": settings.container_pool_rs, - "r": settings.container_pool_r, - "f90": settings.container_pool_f90, - "d": settings.container_pool_d, - } - - size = pool_sizes.get(language, 0) + # Only Python has a configurable pool size + size = settings.sandbox_pool_py if language == "py" else 0 return cls( language=language, size=size, - warmup_on_startup=size > 0 and settings.container_pool_warmup_on_startup, + warmup_on_startup=size > 0 and settings.sandbox_pool_warmup_on_startup, ) diff --git a/src/models/session.py b/src/models/session.py index 58e9000..115419a 100644 --- a/src/models/session.py +++ b/src/models/session.py @@ -43,11 +43,9 @@ class Session(BaseModel): ) expires_at: datetime = Field(..., description="Session expiration timestamp") - # Container information - container_id: Optional[str] = Field(default=None, description="Docker container ID") - container_status: Optional[str] = Field( - default=None, description="Container status" - ) + # Sandbox information + container_id: Optional[str] = Field(default=None, description="Sandbox ID") + container_status: Optional[str] = Field(default=None, description="Sandbox status") # File management files: Dict[str, FileInfo] = Field( diff --git a/src/models/state.py b/src/models/state.py deleted file mode 100644 index 4f3f47e..0000000 --- a/src/models/state.py +++ /dev/null @@ -1,42 +0,0 @@ -"""Models for state management API endpoints.""" - -from datetime import datetime -from typing import Optional - -from pydantic import BaseModel, Field - - -class StateInfo(BaseModel): - """Metadata about stored session state. 
- - Returned by GET /state/{session_id}/info endpoint. - """ - - exists: bool = Field(..., description="Whether state exists for this session") - session_id: Optional[str] = Field(None, description="Session identifier") - size_bytes: Optional[int] = Field( - None, description="Compressed state size in bytes" - ) - hash: Optional[str] = Field( - None, description="SHA256 hash for ETag/change detection" - ) - created_at: Optional[datetime] = Field( - None, description="When state was created/updated" - ) - expires_at: Optional[datetime] = Field(None, description="When state will expire") - source: Optional[str] = Field( - None, description="Storage source: 'redis' or 'archive'" - ) - - class Config: - json_encoders = {datetime: lambda v: v.isoformat() if v else None} - - -class StateUploadResponse(BaseModel): - """Response for state upload endpoint. - - Returned by POST /state/{session_id} endpoint. - """ - - message: str = Field(default="state_uploaded", description="Status message") - size: int = Field(..., description="Uploaded state size in bytes") diff --git a/src/services/__init__.py b/src/services/__init__.py index 164ba21..0b751dc 100644 --- a/src/services/__init__.py +++ b/src/services/__init__.py @@ -7,7 +7,6 @@ SessionServiceInterface, ExecutionServiceInterface, FileServiceInterface, - ContainerServiceInterface, ) __all__ = [ @@ -17,5 +16,4 @@ "SessionServiceInterface", "ExecutionServiceInterface", "FileServiceInterface", - "ContainerServiceInterface", ] diff --git a/src/services/cleanup.py b/src/services/cleanup.py index 2444a69..6d43183 100644 --- a/src/services/cleanup.py +++ b/src/services/cleanup.py @@ -54,14 +54,14 @@ def set_services( self._file_service = file_service self._state_archival_service = state_archival_service - def set_container_pool(self, pool): - """Set container pool reference (kept for backward compatibility). + def set_sandbox_pool(self, pool): + """Set sandbox pool reference. 
- Note: With simplified pool, containers are destroyed immediately + Note: With simplified pool, sandboxes are destroyed immediately after execution. Pool reference is no longer used for cleanup. """ - logger.info( - "Cleanup scheduler initialized (containers destroyed after each execution)" + logger.debug( + "Cleanup scheduler initialized (sandboxes destroyed after each execution)" ) def start(self): diff --git a/src/services/container/__init__.py b/src/services/container/__init__.py deleted file mode 100644 index d306154..0000000 --- a/src/services/container/__init__.py +++ /dev/null @@ -1,22 +0,0 @@ -"""Container management services. - -This package provides Docker container management functionality split into: -- client.py: Docker client factory and initialization -- executor.py: Command execution in containers -- manager.py: Container lifecycle management -- utils.py: Shared utilities for container operations -""" - -from .manager import ContainerManager -from .client import DockerClientFactory -from .executor import ContainerExecutor -from .utils import wait_for_container_ready, receive_socket_output, run_in_executor - -__all__ = [ - "ContainerManager", - "DockerClientFactory", - "ContainerExecutor", - "wait_for_container_ready", - "receive_socket_output", - "run_in_executor", -] diff --git a/src/services/container/client.py b/src/services/container/client.py deleted file mode 100644 index c90f070..0000000 --- a/src/services/container/client.py +++ /dev/null @@ -1,177 +0,0 @@ -"""Docker client factory and initialization.""" - -import os -from typing import Optional - -import docker -import structlog -from docker.errors import DockerException - -from ...config import settings - -logger = structlog.get_logger(__name__) - - -class DockerClientFactory: - """Factory for creating Docker clients with proper initialization.""" - - def __init__(self): - """Initialize Docker client manager without blocking operations.""" - self.client: Optional[docker.DockerClient] = 
None - self._initialization_error: Optional[str] = None - self._initialization_attempted: bool = False - logger.info( - "DockerClientFactory initialized (client will be created on first use)" - ) - - def _ensure_client(self) -> bool: - """Ensure Docker client is initialized. Returns True if successful.""" - if self.client is not None: - return True - - if self._initialization_attempted and self._initialization_error: - return False - - try: - logger.info("Initializing Docker client on first use") - self._initialization_attempted = True - - socket_path = "/var/run/docker.sock" - if not os.path.exists(socket_path): - raise DockerException(f"Docker socket not found at {socket_path}") - - if not os.access(socket_path, os.R_OK | os.W_OK): - raise DockerException( - f"No permission to access Docker socket at {socket_path}" - ) - - client_created = False - last_error = None - - # Approach 1: Try with requests-unixsocket session - try: - logger.info( - "Attempting Docker client creation with requests-unixsocket" - ) - import requests_unixsocket - - session = requests_unixsocket.Session() - self.client = docker.DockerClient( - base_url="unix://var/run/docker.sock", - timeout=settings.docker_timeout, - ) - self.client.api._session = session - - version_info = self.client.version() - logger.info( - f"Docker connection successful. Server version: {version_info.get('ServerVersion', 'unknown')}" - ) - client_created = True - - except Exception as e: - logger.warning(f"requests-unixsocket approach failed: {e}") - last_error = e - - # Approach 2: Try with environment variables - if not client_created: - try: - logger.info( - "Attempting Docker client creation with environment override" - ) - old_docker_host = os.environ.get("DOCKER_HOST") - os.environ["DOCKER_HOST"] = "unix:///var/run/docker.sock" - - try: - self.client = docker.from_env(timeout=settings.docker_timeout) - version_info = self.client.version() - logger.info( - f"Docker connection successful. 
Server version: {version_info.get('ServerVersion', 'unknown')}" - ) - client_created = True - finally: - if old_docker_host is not None: - os.environ["DOCKER_HOST"] = old_docker_host - elif "DOCKER_HOST" in os.environ: - del os.environ["DOCKER_HOST"] - - except Exception as e: - logger.warning(f"Environment override approach failed: {e}") - last_error = e - - # Approach 3: Direct socket connection - if not client_created: - try: - logger.info("Attempting Docker client creation with direct socket") - self.client = docker.DockerClient( - base_url="unix:///var/run/docker.sock" - ) - self.client.ping() - logger.info("Docker connection successful with direct socket") - client_created = True - except Exception as e: - logger.warning(f"Direct socket approach failed: {e}") - last_error = e - - if not client_created: - error_msg = f"All Docker client initialization approaches failed. Last error: {last_error}" - logger.error(error_msg) - raise DockerException(error_msg) - - # Test connection - logger.info("Testing Docker connection...") - try: - self.client.ping() - logger.info("Docker connection test successful") - except Exception as ping_error: - logger.error(f"Docker ping failed: {ping_error}") - try: - info = self.client.info() - logger.info( - f"Docker info retrieved: {info.get('ServerVersion', 'unknown')}" - ) - except Exception as info_error: - logger.error(f"Docker info failed: {info_error}") - raise ping_error - - logger.info("Docker client initialized and tested successfully") - return True - - except Exception as e: - logger.error(f"Failed to create Docker client: {e}") - self._initialization_error = str(e) - self.client = None - return False - - def is_available(self) -> bool: - """Check if Docker is available.""" - return self._ensure_client() - - def get_initialization_error(self) -> Optional[str]: - """Get Docker initialization error if any.""" - return self._initialization_error - - def reset_initialization(self) -> None: - """Reset initialization state to 
allow retry.""" - self._initialization_attempted = False - self._initialization_error = None - if self.client: - try: - self.client.close() - except Exception: - pass - self.client = None - logger.info("Docker client initialization state reset") - - def get_client(self) -> Optional[docker.DockerClient]: - """Get the Docker client, ensuring it's initialized.""" - if self._ensure_client(): - return self.client - return None - - def close(self): - """Close Docker client connection.""" - try: - if self.client is not None: - self.client.close() - except Exception as e: - logger.error(f"Error closing Docker client: {e}") diff --git a/src/services/container/executor.py b/src/services/container/executor.py deleted file mode 100644 index 47024d2..0000000 --- a/src/services/container/executor.py +++ /dev/null @@ -1,281 +0,0 @@ -"""Command execution in Docker containers.""" - -import asyncio -import re -import shlex -from typing import Any, Dict, Optional, Tuple - -import structlog -from docker.errors import DockerException -from docker.models.containers import Container - -from ...config import settings -from .utils import wait_for_container_ready, receive_socket_output, run_in_executor - -logger = structlog.get_logger(__name__) - - -class ContainerExecutor: - """Handles command execution inside Docker containers.""" - - def __init__(self, docker_client): - """Initialize executor with Docker client.""" - self.client = docker_client - - async def execute_command( - self, - container: Container, - command: str, - timeout: int = None, - working_dir: Optional[str] = None, - language: Optional[str] = None, - stdin_payload: Optional[str] = None, - ) -> Tuple[int, str, str]: - """Execute a command in the container with enhanced security.""" - if timeout is None: - timeout = settings.max_execution_time - - # Ensure container is running - try: - container.reload() - if getattr(container, "status", "") != "running": - await self._start_container(container) - except Exception: - pass - 
- # Build sanitized environment - sanitized_env = self._build_sanitized_env(language) - env_assignments = " ".join( - [ - f"{key}={self._escape_env_value(value)}" - for key, value in sanitized_env.items() - ] - ) - - # Preamble commands - preamble = "mkdir -p /tmp || true" - - # Build sanitized command - inner_shell_cmd = shlex.quote(f"{preamble} && {command}") - if env_assignments: - sanitized_command = f"env -i {env_assignments} sh -c {inner_shell_cmd}" - else: - sanitized_command = f"env -i sh -c {inner_shell_cmd}" - - exec_config = { - "cmd": ["sh", "-c", sanitized_command], - "stdout": True, - "stderr": True, - "stdin": stdin_payload is not None, - "tty": False, - "privileged": False, - } - - if working_dir: - exec_config["workdir"] = working_dir - - try: - return self._execute_via_socket( - container, exec_config, stdin_payload, timeout - ) - - except DockerException as e: - error_text = str(e) - logger.error(f"Failed to execute command in container: {error_text}") - - if "is not running" in error_text.lower(): - try: - await self._start_container(container) - return self._execute_via_socket( - container, exec_config, stdin_payload, timeout - ) - except Exception as retry_err: - logger.error(f"Retry failed: {retry_err}") - - return 1, "", f"Execution failed: {error_text}" - except Exception as e: - logger.error(f"Unexpected error during command execution: {e}") - return 1, "", f"Unexpected execution error: {str(e)}" - - def _execute_via_socket( - self, - container: Container, - exec_config: Dict[str, Any], - stdin_payload: Optional[str], - timeout: int, - ) -> Tuple[int, str, str]: - """Execute command and collect output via socket.""" - exec_instance = self.client.api.exec_create(container.id, **exec_config) - exec_id = exec_instance["Id"] - - sock = self.client.api.exec_start(exec_id, socket=True) - raw_sock = sock._sock - raw_sock.settimeout(timeout) - - if stdin_payload: - raw_sock.sendall(stdin_payload.encode("utf-8")) - raw_sock.shutdown(1) - - output = 
receive_socket_output(raw_sock) - exec_info = self.client.api.exec_inspect(exec_id) - exit_code = exec_info["ExitCode"] - - output_str = self._sanitize_output(output) if output else "" - stdout, stderr = self._separate_output_streams(output_str, exit_code) - - return exit_code, stdout, stderr - - async def _start_container(self, container: Container) -> bool: - """Start a container and wait for running state.""" - await run_in_executor(container.start) - return await wait_for_container_ready(container) - - def _build_sanitized_env(self, language: Optional[str]) -> Dict[str, str]: - """Build environment whitelist for execution.""" - normalized_lang = (language or "").lower().strip() - - env_whitelist: Dict[str, str] = { - "PATH": "/usr/local/bin:/usr/bin:/bin", - "HOME": "/tmp", - "TMPDIR": "/tmp", - } - - if normalized_lang in {"py", "python"}: - env_whitelist.update( - { - "PYTHONUNBUFFERED": "1", - "PYTHONDONTWRITEBYTECODE": "1", - "PYTHONPATH": "/mnt/data", - "MPLCONFIGDIR": "/tmp/mplconfig", - "XDG_CACHE_HOME": "/tmp/.cache", - "MPLBACKEND": "Agg", - } - ) - elif normalized_lang in {"js", "ts"}: - env_whitelist.update( - { - "NODE_PATH": "/usr/local/lib/node_modules", - } - ) - elif normalized_lang == "java": - env_whitelist.update( - { - "CLASSPATH": "/mnt/data:/opt/java/lib/*", - "JAVA_OPTS": "-Xmx512m -Xms128m", - "PATH": "/opt/java/openjdk/bin:/usr/local/bin:/usr/bin:/bin", - } - ) - elif normalized_lang == "go": - env_whitelist.update( - { - "GO111MODULE": "on", - "GOPROXY": "https://proxy.golang.org,direct", - "GOSUMDB": "sum.golang.org", - "GOCACHE": "/mnt/data/go-build", - "PATH": "/usr/local/go/bin:/usr/local/bin:/usr/bin:/bin", - } - ) - elif normalized_lang in {"c", "cpp"}: - env_whitelist.update( - { - "CC": "gcc", - "CXX": "g++", - "PKG_CONFIG_PATH": "/usr/lib/x86_64-linux-gnu/pkgconfig", - } - ) - elif normalized_lang == "php": - env_whitelist.update( - { - "PHP_INI_SCAN_DIR": "/usr/local/etc/php/conf.d", - "COMPOSER_HOME": "/opt/composer/global", 
- "PATH": "/opt/composer/global/vendor/bin:/usr/local/bin:/usr/bin:/bin", - } - ) - elif normalized_lang == "rs": - env_whitelist.update( - { - "CARGO_HOME": "/usr/local/cargo", - "RUSTUP_HOME": "/usr/local/rustup", - "PATH": "/usr/local/cargo/bin:/usr/local/rustup/toolchains/stable-x86_64-unknown-linux-gnu/bin:/usr/local/bin:/usr/bin:/bin", - } - ) - elif normalized_lang == "r": - env_whitelist.update( - { - "R_LIBS_USER": "/usr/local/lib/R/site-library", - } - ) - elif normalized_lang == "f90": - env_whitelist.update( - { - "FORTRAN_COMPILER": "gfortran", - "FC": "gfortran", - "F77": "gfortran", - "F90": "gfortran", - "F95": "gfortran", - } - ) - - return env_whitelist - - def _escape_env_value(self, value: str) -> str: - """Escape env var values for shell.""" - try: - safe = str(value).replace("'", "'\\''") - return f"'{safe}'" - except Exception: - return "''" - - def _sanitize_output(self, output: bytes) -> str: - """Sanitize command output for security.""" - try: - output_str = output.decode("utf-8", errors="replace") - - max_output_size = 1024 * 1024 # 1MB limit - if len(output_str) > max_output_size: - output_str = ( - output_str[:max_output_size] - + "\n[Output truncated - size limit exceeded]" - ) - - output_str = re.sub(r"[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]", "", output_str) - return output_str - - except Exception as e: - logger.error(f"Failed to sanitize output: {e}") - return "[Output sanitization failed]" - - def _separate_output_streams(self, output: str, exit_code: int) -> Tuple[str, str]: - """Separate stdout and stderr from combined output.""" - if exit_code != 0: - error_patterns = [ - "error:", - "Error:", - "ERROR:", - "exception:", - "Exception:", - "EXCEPTION:", - "traceback", - "Traceback", - "TRACEBACK", - "failed", - "Failed", - "FAILED", - ] - - lines = output.split("\n") - stdout_lines = [] - stderr_lines = [] - - for line in lines: - is_error = any( - pattern.lower() in line.lower() for pattern in error_patterns - ) - if is_error: - 
stderr_lines.append(line) - else: - stdout_lines.append(line) - - return "\n".join(stdout_lines), "\n".join(stderr_lines) - else: - return output, "" diff --git a/src/services/container/manager.py b/src/services/container/manager.py deleted file mode 100644 index 32aa1ba..0000000 --- a/src/services/container/manager.py +++ /dev/null @@ -1,664 +0,0 @@ -"""Container lifecycle management.""" - -import asyncio -import io -import json -import tarfile -import uuid -from datetime import datetime -from pathlib import Path -from typing import Any, Dict, List, Optional, Tuple - -import structlog -import docker.types -from docker.errors import DockerException, ImageNotFound -from docker.models.containers import Container - -from ...config import settings -from ...config.languages import ( - get_user_id_for_language, -) -from .client import DockerClientFactory -from .executor import ContainerExecutor -from .utils import wait_for_container_ready, run_in_executor - -logger = structlog.get_logger(__name__) - - -class ContainerManager: - """Manages Docker container lifecycle operations.""" - - def __init__(self): - """Initialize the container manager.""" - self._client_factory = DockerClientFactory() - self._executor: Optional[ContainerExecutor] = None - - @property - def client(self): - """Get the Docker client.""" - return self._client_factory.get_client() - - @property - def executor(self) -> ContainerExecutor: - """Get the container executor.""" - if self._executor is None and self.client: - self._executor = ContainerExecutor(self.client) - return self._executor - - def is_available(self) -> bool: - """Check if Docker is available.""" - return self._client_factory.is_available() - - def get_initialization_error(self) -> Optional[str]: - """Get Docker initialization error if any.""" - return self._client_factory.get_initialization_error() - - def reset_initialization(self) -> None: - """Reset initialization state.""" - self._client_factory.reset_initialization() - 
self._executor = None - - def get_image_for_language(self, language: str) -> str: - """Get Docker image for a programming language. - - Uses fallback logic to find available images: - 1. Configured image from settings/env (e.g., DOCKER_IMAGE_REGISTRY) - 2. Local build prefix: code-interpreter/:latest - 3. GHCR prefix: ghcr.io/usnavy13/librecodeinterpreter/:latest - """ - lang = language.lower().strip() - - # Get the configured image name - configured_image = settings.get_image_for_language(lang) - - # Build list of fallback images to try - # Extract the language-specific part (e.g., "python" from "registry/python:tag") - lang_part = configured_image.split("/")[-1] # e.g., "python:latest" - - fallback_images = [ - configured_image, # First: configured image - f"code-interpreter/{lang_part}", # Second: local build - f"ghcr.io/usnavy13/librecodeinterpreter/{lang_part}", # Third: GHCR - ] - - # Remove duplicates while preserving order - seen = set() - unique_images = [] - for img in fallback_images: - if img not in seen: - seen.add(img) - unique_images.append(img) - - # Check which image exists locally - if self.is_available(): - for image in unique_images: - try: - self.client.images.get(image) - if image != configured_image: - logger.info(f"Using fallback image {image} for language {lang}") - return image - except ImageNotFound: - continue - except Exception: - continue - - # No local image found - fail fast with clear error - tried_images = ", ".join(unique_images) - error_msg = ( - f"No Docker image found for language '{lang}'. " - f"Tried: {tried_images}. " - f"Please build images with 'docker compose build' or pull from GHCR." 
- ) - logger.error(error_msg) - raise ImageNotFound(error_msg) - - # Docker not available, return configured (will fail later with better error) - return configured_image - - def get_user_id_for_language(self, language: str) -> int: - """Get the user ID for a language container.""" - return get_user_id_for_language(language.lower().strip()) - - async def pull_image_if_needed(self, image: str) -> bool: - """Pull Docker image if not available locally.""" - if not self.is_available(): - logger.error(f"Cannot pull image {image}: Docker not available") - return False - - try: - self.client.images.get(image) - return True - except ImageNotFound: - logger.info(f"Pulling Docker image: {image}") - try: - loop = asyncio.get_event_loop() - await loop.run_in_executor(None, self.client.images.pull, image) - logger.info(f"Successfully pulled image: {image}") - return True - except DockerException as e: - logger.error(f"Failed to pull image {image}: {e}") - return False - except Exception as e: - logger.error(f"Unexpected error checking/pulling image {image}: {e}") - return False - - def create_container( - self, - image: str, - session_id: str, - command: Optional[str] = None, - working_dir: str = "/mnt/data", - environment: Optional[Dict[str, str]] = None, - language: Optional[str] = None, - repl_mode: bool = False, - ) -> Container: - """Create a new Docker container. 
- - Args: - image: Docker image to use - session_id: Session identifier for the container - command: Optional command to run (overrides default) - working_dir: Working directory inside container - environment: Optional environment variables - language: Programming language for this container - repl_mode: If True, start container with REPL server for fast execution - """ - if not self.is_available(): - error_msg = f"Cannot create container: Docker not available" - if self.get_initialization_error(): - error_msg += f" - {self.get_initialization_error()}" - raise DockerException(error_msg) - - container_name = f"ci-exec-{session_id[:12]}-{uuid.uuid4().hex[:8]}" - - # Build environment variables - env = environment.copy() if environment else {} - if repl_mode: - env["REPL_MODE"] = "true" - - # Determine network configuration - use_wan_access = settings.enable_wan_access - - # Security hardening: paths to mask to prevent host info leakage - # Note: MaskedPaths/ReadonlyPaths are not supported by docker-py 7.1.0. - # Instead, we use bind mounts to /dev/null for critical paths like - # /proc/kallsyms and /proc/modules (see "mounts" in container_config). - # The list below is kept for documentation purposes. 
- hardening_config: Dict[str, Any] = {} - if settings.container_mask_host_info: - hardening_config["masked_paths"] = [ - "/proc/version", # Kernel version (reveals Azure hosting) - "/proc/version_signature", - "/proc/cpuinfo", # CPU count and model - "/proc/meminfo", # Total RAM - "/proc/kcore", - "/proc/keys", - "/proc/timer_list", - "/proc/sched_debug", - "/proc/kallsyms", # Kernel symbol addresses (KASLR bypass) - masked via bind mount - "/proc/modules", # Loaded kernel modules - masked via bind mount - "/sys/firmware", - "/sys/kernel/security", - "/etc/machine-id", # Unique machine identifier - "/var/lib/dbus/machine-id", - ] - hardening_config["readonly_paths"] = [ - "/proc/bus", - "/proc/fs", - "/proc/irq", - "/proc/sys", - "/proc/sysrq-trigger", - ] - - # Build labels - labels = { - "com.code-interpreter.managed": "true", - "com.code-interpreter.type": "execution", - "com.code-interpreter.session-id": session_id, - "com.code-interpreter.language": language or "unknown", - "com.code-interpreter.created-at": datetime.utcnow().isoformat(), - "com.code-interpreter.repl-mode": "true" if repl_mode else "false", - "com.code-interpreter.wan-access": "true" if use_wan_access else "false", - } - - # Determine command and entrypoint - container_command: Any = command - entrypoint_override = None - if not command: - container_command = ["tail", "-f", "/dev/null"] - try: - image_lower = (image or "").lower() - if "dlang2/dmd-ubuntu" in image_lower or image_lower.startswith( - "dlang2/" - ): - entrypoint_override = ["/bin/sh", "-c"] - container_command = "while true; do sleep 3600; done" - except Exception: - pass - - # Build security options with seccomp profile - security_opts = list(settings.docker_security_opt) - if settings.docker_seccomp_profile: - # Resolve profile path (relative to project root or absolute) - profile_path = Path(settings.docker_seccomp_profile) - if not profile_path.is_absolute(): - # Relative to project root (4 levels up from this file) - 
project_root = Path(__file__).parent.parent.parent.parent - profile_path = project_root / profile_path - if profile_path.exists(): - try: - with open(profile_path) as f: - seccomp_data = json.load(f) - # docker-py accepts inline JSON via seccomp= - security_opts.append(f"seccomp={json.dumps(seccomp_data)}") - logger.debug( - "Loaded seccomp profile", - profile=str(profile_path), - blocked_syscalls=len(seccomp_data.get("syscalls", [])), - ) - except Exception as e: - logger.warning( - "Failed to load seccomp profile, using default", - profile=str(profile_path), - error=str(e), - ) - else: - logger.warning( - "Seccomp profile not found, using default", - profile=str(profile_path), - ) - - # Build container config - # Security hardening applied: - # - seccomp profile: blocks dangerous syscalls (ptrace, etc.) - # - ulimits: nofile limits to prevent FD exhaustion - # - pids_limit: prevents fork bombs - container_config: Dict[str, Any] = { - "image": image, - "name": container_name, - "working_dir": working_dir, - "detach": True, - "stdin_open": True, - "tty": False if repl_mode else True, - "mem_limit": f"{settings.max_memory_mb}m", - "memswap_limit": f"{settings.max_memory_mb}m", - "nano_cpus": int(settings.max_cpus * 1e9), - "security_opt": security_opts, - "cap_drop": ["ALL"], - "cap_add": ["CHOWN", "DAC_OVERRIDE", "FOWNER", "SETGID", "SETUID"], - # read_only must be False to allow file uploads to /mnt/data - "read_only": False, - "tmpfs": settings.docker_tmpfs, - # pids_limit: cgroup-based per-container process limit (prevents fork bombs) - "pids_limit": settings.max_pids, - "ulimits": [ - docker.types.Ulimit( - name="nofile", - soft=settings.max_open_files, - hard=settings.max_open_files, - ), - ], - # Note: /proc/kallsyms and /proc/modules masking requires MaskedPaths - # which docker-py doesn't support. These paths are read-only by default. 
- "environment": env, - "labels": labels, - "hostname": settings.container_generic_hostname, - "domainname": "", - "command": container_command, - } - - if entrypoint_override: - container_config["entrypoint"] = entrypoint_override - - # Configure network access - if use_wan_access: - container_config["network"] = settings.wan_network_name - container_config["dns"] = settings.wan_dns_servers - container_config["dns_search"] = [] - container_config["dns_opt"] = ["ndots:1"] - else: - container_config["network_mode"] = "none" - - try: - container = self.client.containers.create(**container_config) - logger.info( - f"Created container {container.id[:12]} for session {session_id}" - ) - return container - except DockerException as e: - logger.error(f"Failed to create container for session {session_id}: {e}") - raise - - async def start_container(self, container: Container) -> bool: - """Start a Docker container.""" - try: - await run_in_executor(container.start) - return await wait_for_container_ready(container) - except DockerException as e: - logger.error(f"Failed to start container {container.id[:12]}: {e}") - return False - - async def execute_command( - self, - container: Container, - command: str, - timeout: int = None, - working_dir: Optional[str] = None, - language: Optional[str] = None, - stdin_payload: Optional[str] = None, - ) -> Tuple[int, str, str]: - """Execute a command in the container.""" - return await self.executor.execute_command( - container, command, timeout, working_dir, language, stdin_payload - ) - - async def copy_to_container( - self, container: Container, source_path: str, dest_path: str - ) -> bool: - """Copy file to container from disk path.""" - try: - with open(source_path, "rb") as f: - data = f.read() - return await self.copy_content_to_container(container, data, dest_path) - except Exception as e: - logger.error(f"Failed to copy file to container: {e}") - return False - - async def copy_content_to_container( - self, container: 
Container, content: bytes, dest_path: str, language: str = "py" - ) -> bool: - """Copy content directly to container without tempfiles. - - This is the optimized path that avoids disk I/O by streaming - content directly to the container via in-memory tar archive. - - Args: - container: Target container - content: File content as bytes - dest_path: Destination path in container (e.g., /mnt/data/file.py) - language: Programming language (used to set correct file ownership) - - Returns: - True if successful, False otherwise - """ - try: - loop = asyncio.get_event_loop() - - # Get user ID for this language's container - user_id = self.get_user_id_for_language(language) - - # Build in-memory tar archive - tar_buffer = io.BytesIO() - with tarfile.open(fileobj=tar_buffer, mode="w") as tar: - tarinfo = tarfile.TarInfo(name=dest_path.split("/")[-1]) - tarinfo.size = len(content) - tarinfo.mode = 0o644 - tarinfo.uid = user_id - tarinfo.gid = user_id - tar.addfile(tarinfo, io.BytesIO(content)) - - tar_buffer.seek(0) - - # Stream directly to container - dest_dir = "/".join(dest_path.split("/")[:-1]) or "/" - await loop.run_in_executor( - None, - lambda: container.put_archive( - path=dest_dir, data=tar_buffer.getvalue() - ), - ) - - return True - except Exception as e: - logger.error(f"Failed to copy content to container: {e}") - return False - - async def copy_from_container( - self, container: Container, source_path: str, dest_path: str - ) -> bool: - """Copy file from container to disk.""" - try: - content = await self.get_file_content_from_container(container, source_path) - if content is not None: - with open(dest_path, "wb") as f: - f.write(content) - return True - return False - except Exception as e: - logger.error(f"Failed to copy file from container: {e}") - return False - - async def get_file_content_from_container( - self, container: Container, source_path: str - ) -> Optional[bytes]: - """Get file content directly from container without tempfiles. 
- - This is the optimized path that avoids disk I/O by extracting - content directly from the container's tar archive to memory. - - Args: - container: Source container - source_path: Path to file in container - - Returns: - File content as bytes, or None if failed - """ - try: - loop = asyncio.get_event_loop() - - archive_data, _ = await loop.run_in_executor( - None, lambda: container.get_archive(source_path) - ) - - archive_bytes = b"".join(archive_data) - tar_buffer = io.BytesIO(archive_bytes) - - with tarfile.open(fileobj=tar_buffer, mode="r") as tar: - member = tar.next() - if member: - file_data = tar.extractfile(member) - if file_data: - return file_data.read() - - return None - except Exception as e: - logger.error(f"Failed to get file content from container: {e}") - return None - - async def get_container_stats( - self, container: Container - ) -> Optional[Dict[str, Any]]: - """Get container resource usage statistics.""" - try: - loop = asyncio.get_event_loop() - stats = await loop.run_in_executor( - None, lambda: container.stats(stream=False) - ) - - memory_stats = stats.get("memory_stats", {}) - cpu_stats = stats.get("cpu_stats", {}) - - return { - "memory_usage_mb": memory_stats.get("usage", 0) / (1024 * 1024), - "memory_limit_mb": memory_stats.get("limit", 0) / (1024 * 1024), - "cpu_usage_percent": self._calculate_cpu_percent( - cpu_stats, stats.get("precpu_stats", {}) - ), - "timestamp": datetime.utcnow().isoformat(), - } - except Exception as e: - logger.error(f"Failed to get container stats: {e}") - return None - - def _calculate_cpu_percent(self, cpu_stats: Dict, precpu_stats: Dict) -> float: - """Calculate CPU usage percentage.""" - try: - cpu_delta = cpu_stats.get("cpu_usage", {}).get( - "total_usage", 0 - ) - precpu_stats.get("cpu_usage", {}).get("total_usage", 0) - - system_delta = cpu_stats.get("system_cpu_usage", 0) - precpu_stats.get( - "system_cpu_usage", 0 - ) - - if system_delta > 0 and cpu_delta > 0: - cpu_count = 
len(cpu_stats.get("cpu_usage", {}).get("percpu_usage", [1])) - return (cpu_delta / system_delta) * cpu_count * 100.0 - - return 0.0 - except (KeyError, ZeroDivisionError): - return 0.0 - - async def stop_container(self, container: Container, timeout: int = 2) -> bool: - """Stop a container.""" - try: - loop = asyncio.get_event_loop() - await loop.run_in_executor(None, lambda: container.stop(timeout=timeout)) - return True - except DockerException as e: - logger.error(f"Failed to stop container {container.id[:12]}: {e}") - return False - - async def remove_container(self, container: Container, force: bool = True) -> bool: - """Remove a container.""" - try: - loop = asyncio.get_event_loop() - await loop.run_in_executor(None, lambda: container.remove(force=force)) - return True - except DockerException as e: - logger.error(f"Failed to remove container {container.id[:12]}: {e}") - return False - - async def force_kill_container(self, container: Container) -> bool: - """Force kill and remove a container.""" - try: - loop = asyncio.get_event_loop() - await loop.run_in_executor(None, lambda: container.remove(force=True)) - return True - except DockerException as e: - logger.error(f"Failed to force kill container {container.id[:12]}: {e}") - return False - - async def force_kill_containers_batch( - self, containers: List[Container], chunk_size: int = 50 - ) -> int: - """Force kill containers in batch.""" - if not containers or not self.is_available(): - return 0 - - logger.info(f"Batch force kill of {len(containers)} containers") - start_time = datetime.utcnow() - total_success = 0 - - async def kill_single(c: Container) -> bool: - try: - loop = asyncio.get_event_loop() - await loop.run_in_executor(None, lambda: c.remove(force=True)) - return True - except Exception: - return False - - for i in range(0, len(containers), chunk_size): - chunk = containers[i : i + chunk_size] - try: - results = await asyncio.wait_for( - asyncio.gather( - *[kill_single(c) for c in chunk], 
return_exceptions=True - ), - timeout=30, - ) - total_success += sum(1 for r in results if r is True) - except asyncio.TimeoutError: - logger.error(f"Batch kill timed out for chunk") - - duration = (datetime.utcnow() - start_time).total_seconds() - logger.info( - f"Batch kill completed: {total_success}/{len(containers)} in {duration:.2f}s" - ) - return total_success - - async def cleanup_session_containers(self, session_id: str) -> int: - """Clean up all containers for a session.""" - if not self.is_available(): - return 0 - - try: - containers = self.client.containers.list( - all=True, - filters={"label": f"com.code-interpreter.session-id={session_id}"}, - ) - - if not containers: - return 0 - - return await self.force_kill_containers_batch(containers) - except DockerException as e: - logger.error(f"Failed to list containers for cleanup: {e}") - return 0 - - async def cleanup_all_code_execution_containers( - self, max_age_minutes: int = None - ) -> int: - """Clean up old code execution containers.""" - if not self.is_available(): - return 0 - - if max_age_minutes is None: - max_age_minutes = settings.get_container_ttl_minutes() - - try: - all_containers = self.client.containers.list(all=True) - code_exec_containers = [ - c - for c in all_containers - if c.name.startswith("ci-exec-") - or (c.labels and c.labels.get("com.code-interpreter.managed") == "true") - ] - - if not code_exec_containers: - return 0 - - aged_containers = [] - for container in code_exec_containers: - age = self._get_container_age(container) - if age is not None and age > max_age_minutes: - aged_containers.append(container) - - if not aged_containers: - return 0 - - return await self.force_kill_containers_batch(aged_containers) - except DockerException as e: - logger.error(f"Failed to cleanup containers: {e}") - return 0 - - def _get_container_age(self, container) -> Optional[float]: - """Get container age in minutes.""" - try: - created_at_str = ( - 
container.labels.get("com.code-interpreter.created-at") - if container.labels - else None - ) - if created_at_str: - created_at = datetime.fromisoformat(created_at_str) - age = datetime.utcnow() - created_at - return age.total_seconds() / 60 - - container.reload() - created_str = container.attrs.get("Created") - if created_str: - import dateutil.parser - - created_at = dateutil.parser.parse(created_str).replace(tzinfo=None) - age = datetime.utcnow() - created_at - return age.total_seconds() / 60 - - return None - except Exception as e: - logger.error(f"Failed to get container age: {e}") - return None - - def close(self): - """Close Docker client connection.""" - self._client_factory.close() diff --git a/src/services/container/network.py b/src/services/container/network.py deleted file mode 100644 index 4f4e265..0000000 --- a/src/services/container/network.py +++ /dev/null @@ -1,323 +0,0 @@ -"""Docker network management for WAN-only container access. - -This module provides functionality to create and manage a Docker network that -allows execution containers to access the public internet while blocking: -- Private IP ranges (10.0.0.0/8, 172.16.0.0/12, 192.168.0.0/16) -- Link-local addresses (169.254.0.0/16) - includes cloud metadata services -- Loopback addresses (127.0.0.0/8) -- Docker host gateway -- Inter-container communication -""" - -import asyncio -import subprocess -from typing import List, Optional - -import structlog -from docker import DockerClient -from docker.errors import APIError, NotFound -from docker.models.networks import Network - -from ...config import settings - -logger = structlog.get_logger(__name__) - -# IP ranges to block (private networks + special ranges) -BLOCKED_IP_RANGES: List[str] = [ - "10.0.0.0/8", # Class A private - "172.16.0.0/12", # Class B private (includes Docker default bridge) - "192.168.0.0/16", # Class C private - "169.254.0.0/16", # Link-local (includes cloud metadata 169.254.169.254) - "127.0.0.0/8", # Loopback - 
"224.0.0.0/4", # Multicast - "240.0.0.0/4", # Reserved -] - -# WAN network subnet (separate from main code-interpreter-network 172.20.0.0/16) -WAN_NETWORK_SUBNET = "172.30.0.0/16" -WAN_NETWORK_GATEWAY = "172.30.0.1" - -# iptables chain name for our rules -IPTABLES_CHAIN_NAME = "CODE_INTERP_WAN" - - -class WANNetworkManager: - """Manages the WAN-only Docker network for execution containers. - - This class handles: - - Creating/getting the Docker bridge network with ICC disabled - - Applying iptables rules to block private IP ranges - - Cleaning up iptables rules on shutdown - """ - - def __init__(self, docker_client: DockerClient): - """Initialize the WAN network manager. - - Args: - docker_client: Docker client instance - """ - self._client = docker_client - self._network: Optional[Network] = None - self._initialized = False - self._bridge_name: Optional[str] = None - - @property - def network_name(self) -> str: - """Get the WAN network name from settings.""" - return settings.wan_network_name - - @property - def dns_servers(self) -> List[str]: - """Get DNS servers from settings.""" - return settings.wan_dns_servers - - async def initialize(self) -> bool: - """Initialize the WAN network with iptables rules. 
- - Returns: - True if network is ready, False otherwise - """ - if self._initialized: - return True - - try: - # Get or create the Docker network - self._network = await self._get_or_create_network() - - if self._network: - # Get the bridge interface name - self._bridge_name = await self._get_bridge_name() - - if self._bridge_name: - # Apply iptables rules to block private IPs - await self._apply_iptables_rules() - - self._initialized = True - logger.info( - "WAN network initialized", - network_name=self.network_name, - network_id=self._network.id[:12] if self._network.id else "unknown", - bridge_name=self._bridge_name, - ) - return True - - return False - - except Exception as e: - logger.error("Failed to initialize WAN network", error=str(e)) - return False - - async def _get_or_create_network(self) -> Optional[Network]: - """Get existing network or create new one. - - Returns: - Docker Network object or None if creation failed - """ - loop = asyncio.get_event_loop() - - # Try to get existing network - try: - networks = await loop.run_in_executor( - None, lambda: self._client.networks.list(names=[self.network_name]) - ) - if networks: - logger.info( - "Found existing WAN network", - network_name=self.network_name, - network_id=networks[0].id[:12], - ) - return networks[0] - except Exception as e: - logger.warning("Error checking for existing network", error=str(e)) - - # Create new network with specific subnet - logger.info("Creating WAN network", network_name=self.network_name) - - ipam_config = { - "Driver": "default", - "Config": [{"Subnet": WAN_NETWORK_SUBNET, "Gateway": WAN_NETWORK_GATEWAY}], - } - - try: - network = await loop.run_in_executor( - None, - lambda: self._client.networks.create( - name=self.network_name, - driver="bridge", - ipam=ipam_config, - options={ - # Enable masquerading for outbound internet access - "com.docker.network.bridge.enable_ip_masquerade": "true", - # Disable inter-container communication - 
"com.docker.network.bridge.enable_icc": "false", - }, - labels={ - "com.code-interpreter.managed": "true", - "com.code-interpreter.type": "wan-access", - }, - ), - ) - logger.info( - "Created WAN network", - network_name=self.network_name, - network_id=network.id[:12], - subnet=WAN_NETWORK_SUBNET, - ) - return network - except APIError as e: - logger.error("Failed to create WAN network", error=str(e)) - raise - - async def _get_bridge_name(self) -> Optional[str]: - """Get the Linux bridge interface name for the network. - - Returns: - Bridge interface name (e.g., 'br-abc123def456') or None - """ - if not self._network: - return None - - try: - loop = asyncio.get_event_loop() - # Reload network to get fresh info - await loop.run_in_executor(None, self._network.reload) - - # Get network ID prefix (first 12 chars) - network_id = self._network.id[:12] - bridge_name = f"br-{network_id}" - - logger.debug( - "Determined bridge name", - network_id=network_id, - bridge_name=bridge_name, - ) - return bridge_name - except Exception as e: - logger.warning("Could not determine bridge name", error=str(e)) - return None - - async def _apply_iptables_rules(self) -> None: - """Apply iptables rules to block private IP ranges. - - This creates rules that: - 1. Allow established connections - 2. Allow DNS (UDP/TCP 53) to public DNS servers - 3. Block all traffic to private IP ranges - 4. Block access to Docker host gateway - 5. 
Allow all other outbound traffic to public IPs - """ - if not self._bridge_name: - logger.warning("No bridge name available, skipping iptables rules") - return - - rules: List[str] = [] - - # Create custom chain if it doesn't exist (ignore error if exists) - rules.append(f"iptables -N {IPTABLES_CHAIN_NAME} 2>/dev/null || true") - - # Flush existing rules in our chain - rules.append(f"iptables -F {IPTABLES_CHAIN_NAME}") - - # Allow established/related connections (critical for return traffic) - rules.append( - f"iptables -A {IPTABLES_CHAIN_NAME} -m state --state ESTABLISHED,RELATED -j ACCEPT" - ) - - # Allow DNS to public DNS servers - for dns in self.dns_servers: - rules.append( - f"iptables -A {IPTABLES_CHAIN_NAME} -p udp -d {dns} --dport 53 -j ACCEPT" - ) - rules.append( - f"iptables -A {IPTABLES_CHAIN_NAME} -p tcp -d {dns} --dport 53 -j ACCEPT" - ) - - # Block all private IP ranges - for ip_range in BLOCKED_IP_RANGES: - rules.append(f"iptables -A {IPTABLES_CHAIN_NAME} -d {ip_range} -j DROP") - - # Block Docker host gateway explicitly - rules.append( - f"iptables -A {IPTABLES_CHAIN_NAME} -d {WAN_NETWORK_GATEWAY} -j DROP" - ) - - # Allow all other traffic (public internet) - rules.append(f"iptables -A {IPTABLES_CHAIN_NAME} -j ACCEPT") - - # Remove any existing rule in FORWARD chain (ignore error if not exists) - rules.append( - f"iptables -D FORWARD -i {self._bridge_name} -j {IPTABLES_CHAIN_NAME} 2>/dev/null || true" - ) - - # Insert our chain at the beginning of FORWARD chain - rules.append( - f"iptables -I FORWARD 1 -i {self._bridge_name} -j {IPTABLES_CHAIN_NAME}" - ) - - # Execute rules - loop = asyncio.get_event_loop() - failed_rules = [] - - for rule in rules: - try: - result = await loop.run_in_executor( - None, - lambda r=rule: subprocess.run( # nosec B602 - iptables rules built from constants - r, shell=True, check=False, capture_output=True, text=True - ), - ) - if result.returncode != 0 and "already exists" not in result.stderr: - # Only log as warning 
if it's not an expected "already exists" error - if result.stderr.strip(): - failed_rules.append((rule, result.stderr.strip())) - except Exception as e: - failed_rules.append((rule, str(e))) - - if failed_rules: - logger.warning( - "Some iptables rules failed", - failed_count=len(failed_rules), - failed_rules=failed_rules[:3], # Log first 3 failures - ) - - logger.info( - "Applied iptables rules for WAN network", - chain_name=IPTABLES_CHAIN_NAME, - bridge_name=self._bridge_name, - blocked_ranges=len(BLOCKED_IP_RANGES), - dns_servers=self.dns_servers, - ) - - async def cleanup(self) -> None: - """Clean up iptables rules. - - Called on application shutdown to remove the iptables rules. - The Docker network itself is left intact for reuse. - """ - if not self._bridge_name: - return - - logger.info("Cleaning up WAN network iptables rules") - - rules = [ - # Remove from FORWARD chain - f"iptables -D FORWARD -i {self._bridge_name} -j {IPTABLES_CHAIN_NAME} 2>/dev/null || true", - # Flush our chain - f"iptables -F {IPTABLES_CHAIN_NAME} 2>/dev/null || true", - # Delete our chain - f"iptables -X {IPTABLES_CHAIN_NAME} 2>/dev/null || true", - ] - - loop = asyncio.get_event_loop() - for rule in rules: - try: - await loop.run_in_executor( - None, - lambda r=rule: subprocess.run( - r, shell=True, check=False - ), # nosec B602 - ) - except Exception: - pass # Ignore cleanup errors - - logger.info("Cleaned up WAN network iptables rules") diff --git a/src/services/container/pool.py b/src/services/container/pool.py deleted file mode 100644 index 79d4bfd..0000000 --- a/src/services/container/pool.py +++ /dev/null @@ -1,552 +0,0 @@ -"""Container pool service for pre-warming containers. - -This module provides a container pooling mechanism that: -1. Pre-warms containers per language for fast acquisition (~3ms vs 500-2000ms) -2. Provides fresh containers from the pool on demand -3. 
Does NOT track session-to-container mapping (stateless) - -After execution, containers should be destroyed by the caller. -The pool continuously replenishes to maintain warm containers. -""" - -import asyncio -from datetime import datetime -from typing import Dict, Optional, Set -import structlog - -from docker.models.containers import Container - -from ...config import settings -from ...models.pool import PooledContainer, PoolConfig, PoolStats -from ...core.events import ( - event_bus, - ContainerAcquiredFromPool, - ContainerCreatedFresh, - PoolWarmedUp, - PoolExhausted, -) -from .manager import ContainerManager -from .repl_executor import REPLExecutor - -logger = structlog.get_logger(__name__) - - -class ContainerPool: - """Container pool for fast container acquisition. - - Key behaviors: - - Pre-warms containers per language based on configuration - - Provides fresh containers from pool (O(1) acquisition) - - Stateless: no session tracking (caller manages container lifecycle) - - Continuously replenishes pool in background - """ - - def __init__(self, container_manager: ContainerManager): - """Initialize the container pool. 
- - Args: - container_manager: Manager for container lifecycle operations - """ - self._container_manager = container_manager - self._lock = asyncio.Lock() - - # Available containers per language (ready to be used) - self._available: Dict[str, asyncio.Queue[PooledContainer]] = {} - - # Pool statistics per language - self._stats: Dict[str, PoolStats] = {} - - # Background tasks - self._warmup_task: Optional[asyncio.Task] = None - self._running = False - - # Languages to warm up on startup - self._warmup_languages: Set[str] = set() - - # Event for exhaustion-triggered replenishment - self._replenish_event = asyncio.Event() - - async def start(self) -> None: - """Start the container pool and warmup background task.""" - if self._running: - return - - self._running = True - logger.info("Starting container pool (simplified, no session tracking)") - - # Initialize queues for all supported languages and track those needing warmup - all_languages = [ - "py", - "js", - "ts", - "go", - "java", - "c", - "cpp", - "php", - "rs", - "r", - "f90", - "d", - ] - for lang in all_languages: - self._available[lang] = asyncio.Queue() - config = PoolConfig.from_settings(lang) - if config.warmup_on_startup and config.size > 0: - self._warmup_languages.add(lang) - - # Subscribe to exhaustion events for immediate replenishment - if settings.container_pool_exhaustion_trigger: - event_bus.register_handler(PoolExhausted, self._on_pool_exhausted) - - # Start warmup background task - self._warmup_task = asyncio.create_task(self._warmup_loop()) - - logger.info( - "Container pool started", - warmup_languages=list(self._warmup_languages), - parallel_batch=settings.container_pool_parallel_batch, - replenish_interval=settings.container_pool_replenish_interval, - exhaustion_trigger=settings.container_pool_exhaustion_trigger, - ) - - async def stop(self) -> None: - """Stop the container pool and cleanup all containers.""" - if not self._running: - return - - self._running = False - 
logger.info("Stopping container pool") - - # Cancel background task - if self._warmup_task: - self._warmup_task.cancel() - try: - await self._warmup_task - except asyncio.CancelledError: - pass - - # Destroy all pooled containers - for lang, queue in self._available.items(): - count = 0 - while not queue.empty(): - try: - pooled = queue.get_nowait() - await self._destroy_container(pooled.container_id) - count += 1 - except asyncio.QueueEmpty: - break - if count > 0: - logger.info(f"Destroyed {count} pooled {lang} containers") - - logger.info("Container pool stopped") - - async def acquire(self, language: str, session_id: str = "") -> Container: - """Acquire a container from the pool. - - This method: - 1. Gets a container from the pool if available - 2. Creates a new container if pool is empty - - Args: - language: Programming language code - session_id: Session identifier (for logging only, not tracked) - - Returns: - Docker Container object ready for execution - """ - start_time = datetime.utcnow() - - # Try to get from pool - if settings.container_pool_enabled: - queue = self._available.get(language) - if queue and not queue.empty(): - try: - pooled = queue.get_nowait() - container = await self._get_docker_container(pooled.container_id) - if container and await self._is_container_healthy(container): - acquire_time = ( - datetime.utcnow() - start_time - ).total_seconds() * 1000 - await event_bus.publish( - ContainerAcquiredFromPool( - container_id=pooled.container_id, - session_id=session_id, - language=language, - acquire_time_ms=acquire_time, - ) - ) - self._record_stats( - language, pool_hit=True, acquire_time_ms=acquire_time - ) - logger.info( - "Acquired container from pool", - session_id=session_id[:12] if session_id else "none", - container_id=pooled.container_id[:12], - language=language, - acquire_time_ms=f"{acquire_time:.1f}", - ) - return container - except asyncio.QueueEmpty: - pass - - # Pool empty - await event_bus.publish( - 
PoolExhausted(language=language, session_id=session_id) - ) - - # Create fresh container (fallback) - container = await self._create_fresh_container(session_id, language) - reason = "pool_empty" if settings.container_pool_enabled else "pool_disabled" - await event_bus.publish( - ContainerCreatedFresh( - container_id=container.id, - session_id=session_id, - language=language, - reason=reason, - ) - ) - self._record_stats(language, pool_miss=True) - - return container - - async def destroy_container(self, container: Container) -> None: - """Destroy a container after use. - - Call this after execution to clean up the container. - Containers are never returned to the pool for security. - """ - if container: - await self._destroy_container(container.id) - - def get_stats(self, language: str = None) -> Dict[str, PoolStats]: - """Get pool statistics.""" - if language: - return { - language: self._stats.get( - language, - PoolStats(language=language, available_count=0, assigned_count=0), - ) - } - - # Build stats for all languages - stats = {} - for lang in set(list(self._available.keys()) + list(self._stats.keys())): - queue = self._available.get(lang) - available = queue.qsize() if queue else 0 - if lang in self._stats: - self._stats[lang].available_count = available - self._stats[lang].assigned_count = 0 # No longer tracking - stats[lang] = self._stats[lang] - else: - stats[lang] = PoolStats( - language=lang, available_count=available, assigned_count=0 - ) - return stats - - # ========================================================================= - # Private methods - # ========================================================================= - - async def _create_fresh_container( - self, session_id: str, language: str - ) -> Container: - """Create a new container when pool is exhausted.""" - image = self._container_manager.get_image_for_language(language) - - # Ensure image is available - await self._container_manager.pull_image_if_needed(image) - - # Enable REPL 
mode for Python if configured (same as pooled containers) - use_repl_mode = language == "py" and settings.repl_enabled - - # Create and start container - container = self._container_manager.create_container( - image=image, - session_id=session_id, - language=language, - repl_mode=use_repl_mode, - ) - - started = await self._container_manager.start_container(container) - if not started: - try: - container.remove(force=True) - except Exception: - pass - raise RuntimeError(f"Failed to start container for {language}") - - # For REPL containers, wait for REPL to be ready before returning - if use_repl_mode: - repl_ready = await self._wait_for_repl_ready(container) - if not repl_ready: - logger.warning( - "REPL not ready in fresh container", - container_id=container.id[:12], - language=language, - ) - - logger.info( - "Created fresh container", - session_id=session_id[:12] if session_id else "none", - container_id=container.id[:12], - language=language, - repl_mode=use_repl_mode, - ) - - return container - - async def _get_docker_container(self, container_id: str) -> Optional[Container]: - """Get Docker container by ID.""" - try: - return self._container_manager.client.containers.get(container_id) - except Exception: - return None - - async def _is_container_healthy(self, container: Container) -> bool: - """Check if container is running and healthy.""" - try: - container.reload() - return container.status == "running" - except Exception: - return False - - async def _destroy_container(self, container_id: str) -> None: - """Force remove a container.""" - try: - container = await self._get_docker_container(container_id) - if container: - container.remove(force=True) - logger.debug("Destroyed container", container_id=container_id[:12]) - except Exception as e: - logger.warning( - "Failed to destroy container", - container_id=container_id[:12], - error=str(e), - ) - - async def _warmup_loop(self) -> None: - """Background task to maintain warm containers in the pool.""" - # 
Initial warmup - await asyncio.sleep(2) # Let the app start - - replenish_interval = settings.container_pool_replenish_interval - - while self._running: - try: - for language in self._warmup_languages: - await self._warmup_language(language) - - # Wait for either timeout OR exhaustion event (if enabled) - if settings.container_pool_exhaustion_trigger: - try: - await asyncio.wait_for( - self._replenish_event.wait(), - timeout=float(replenish_interval), - ) - # Event was triggered - immediate replenishment - self._replenish_event.clear() - logger.debug("Exhaustion-triggered replenishment") - except asyncio.TimeoutError: - pass # Normal timeout, continue loop - else: - await asyncio.sleep(replenish_interval) - - except asyncio.CancelledError: - break - except Exception as e: - logger.error("Warmup loop error", error=str(e)) - await asyncio.sleep(replenish_interval) - - async def _on_pool_exhausted(self, event: PoolExhausted) -> None: - """Handle pool exhaustion event by triggering immediate replenishment.""" - logger.info( - "Pool exhaustion detected, triggering replenishment", - language=event.language, - session_id=event.session_id[:12] if event.session_id else "none", - ) - self._replenish_event.set() - - async def _warmup_language(self, language: str) -> None: - """Warm up containers for a specific language using parallel creation.""" - config = PoolConfig.from_settings(language) - queue = self._available.setdefault(language, asyncio.Queue()) - - current_size = queue.qsize() - if current_size >= config.size: - return - - needed = config.size - current_size - created = 0 - - # Enable REPL mode for Python if configured - use_repl_mode = language == "py" and settings.repl_enabled - - # Parallel container creation in batches - batch_size = settings.container_pool_parallel_batch - - for batch_start in range(0, needed, batch_size): - batch_end = min(batch_start + batch_size, needed) - batch_count = batch_end - batch_start - - # Launch container creations in parallel - 
tasks = [ - self._create_pooled_container(language, use_repl_mode) - for _ in range(batch_count) - ] - - results = await asyncio.gather(*tasks, return_exceptions=True) - - for result in results: - if isinstance(result, PooledContainer): - await queue.put(result) - created += 1 - elif isinstance(result, Exception): - logger.warning( - "Failed to create pooled container", - language=language, - error=str(result), - ) - - if created > 0: - await event_bus.publish( - PoolWarmedUp(language=language, container_count=created) - ) - logger.info( - "Warmed up containers (parallel)", - language=language, - created=created, - total=queue.qsize(), - repl_mode=use_repl_mode, - batch_size=batch_size, - ) - - async def _create_pooled_container( - self, language: str, use_repl_mode: bool - ) -> Optional[PooledContainer]: - """Create a single pooled container (for parallel execution). - - Args: - language: Programming language code - use_repl_mode: Whether to enable REPL mode (Python only) - - Returns: - PooledContainer if successful, None if failed - """ - import uuid - - try: - image = self._container_manager.get_image_for_language(language) - await self._container_manager.pull_image_if_needed(image) - - # Create container with a unique pool-specific session ID - pool_session_id = f"pool-{language}-{uuid.uuid4().hex[:12]}" - container = self._container_manager.create_container( - image=image, - session_id=pool_session_id, - language=language, - repl_mode=use_repl_mode, - ) - - started = await self._container_manager.start_container(container) - if not started: - try: - container.remove(force=True) - except Exception: - pass - return None - - # For REPL containers, wait for REPL to be ready - repl_ready = True - if use_repl_mode: - repl_ready = await self._wait_for_repl_ready(container) - if not repl_ready: - logger.warning( - "REPL not ready, removing container", - container_id=container.id[:12], - language=language, - ) - try: - container.remove(force=True) - except Exception: - 
pass - return None - - pooled = PooledContainer( - container_id=container.id, - language=language, - image=image, - created_at=datetime.utcnow(), - status="available", - repl_enabled=use_repl_mode, - repl_ready=repl_ready if use_repl_mode else False, - ) - - if use_repl_mode: - logger.debug( - "REPL container ready", - container_id=container.id[:12], - language=language, - ) - - return pooled - - except Exception as e: - logger.warning( - "Failed to create pooled container", language=language, error=str(e) - ) - return None - - async def _wait_for_repl_ready( - self, container: Container, timeout: float = 15.0 - ) -> bool: - """Wait for REPL server to be ready in container. - - Args: - container: Container with REPL server - timeout: Maximum time to wait in seconds - - Returns: - True if REPL is ready, False if timeout - """ - try: - repl_executor = REPLExecutor(self._container_manager.client) - return await repl_executor.wait_for_ready(container, timeout=timeout) - except Exception as e: - logger.warning( - "Error waiting for REPL ready", - container_id=container.id[:12], - error=str(e), - ) - return False - - def _record_stats( - self, - language: str, - pool_hit: bool = False, - pool_miss: bool = False, - acquire_time_ms: float = 0.0, - ) -> None: - """Record pool statistics.""" - if language not in self._stats: - self._stats[language] = PoolStats( - language=language, available_count=0, assigned_count=0 - ) - - stats = self._stats[language] - stats.total_acquisitions += 1 - - if pool_hit: - stats.pool_hits += 1 - if pool_miss: - stats.pool_misses += 1 - if acquire_time_ms > 0: - # Running average - n = stats.total_acquisitions - stats.avg_acquire_time_ms = ( - stats.avg_acquire_time_ms * (n - 1) + acquire_time_ms - ) / n - - -# Backward compatibility aliases -acquire_for_session = ContainerPool.acquire diff --git a/src/services/container/repl_executor.py b/src/services/container/repl_executor.py deleted file mode 100644 index a7217fc..0000000 --- 
a/src/services/container/repl_executor.py +++ /dev/null @@ -1,426 +0,0 @@ -"""REPL-based code execution for pre-warmed Python containers. - -This module provides fast code execution by communicating with a -running Python REPL inside the container, eliminating interpreter startup. - -The REPL server runs as PID 1 in the container and communicates via -stdin/stdout using a JSON-based protocol with delimiters. -""" - -import asyncio -import json -import time -import structlog -from typing import Tuple, Optional, Dict, Any, List -from docker.models.containers import Container - -from ...config import settings - -logger = structlog.get_logger(__name__) - -# Protocol delimiter (must match repl_server.py) -DELIMITER = b"\n---END---\n" - - -class REPLExecutor: - """Executes code via running REPL in container. - - Uses Docker's attach socket to communicate with the REPL server - that's running as PID 1 in the container. - """ - - def __init__(self, docker_client): - """Initialize REPL executor. - - Args: - docker_client: Docker client instance - """ - self.client = docker_client - - async def execute( - self, - container: Container, - code: str, - timeout: int = None, - working_dir: str = "/mnt/data", - args: Optional[List[str]] = None, - ) -> Tuple[int, str, str]: - """Execute code in running REPL. 
- - Args: - container: Docker container with REPL server running - code: Python code to execute - timeout: Maximum execution time in seconds - working_dir: Working directory for code execution - args: Optional list of command line arguments - - Returns: - Tuple of (exit_code, stdout, stderr) - """ - if timeout is None: - timeout = settings.max_execution_time - - start_time = time.perf_counter() - - # Build request - request = {"code": code, "timeout": timeout, "working_dir": working_dir} - if args: - request["args"] = args - request_json = json.dumps(request) - request_bytes = request_json.encode("utf-8") + DELIMITER - - try: - # Execute via Docker attach - response = await self._send_and_receive( - container, request_bytes, timeout + 5 - ) - - elapsed_ms = (time.perf_counter() - start_time) * 1000 - logger.debug( - "REPL execution completed", - container_id=container.id[:12], - elapsed_ms=f"{elapsed_ms:.1f}", - exit_code=response.get("exit_code", -1), - ) - - return self._parse_response(response) - - except asyncio.TimeoutError: - elapsed_ms = (time.perf_counter() - start_time) * 1000 - logger.warning( - "REPL execution timed out", - container_id=container.id[:12], - timeout=timeout, - elapsed_ms=f"{elapsed_ms:.1f}", - ) - return 124, "", f"Execution timed out after {timeout} seconds" - - except Exception as e: - elapsed_ms = (time.perf_counter() - start_time) * 1000 - logger.error( - "REPL execution failed", - container_id=container.id[:12], - error=str(e), - elapsed_ms=f"{elapsed_ms:.1f}", - ) - return 1, "", f"REPL execution error: {str(e)}" - - async def execute_with_state( - self, - container: Container, - code: str, - timeout: int = None, - working_dir: str = "/mnt/data", - initial_state: Optional[str] = None, - capture_state: bool = False, - args: Optional[List[str]] = None, - ) -> Tuple[int, str, str, Optional[str], List[str]]: - """Execute code in running REPL with optional state persistence. 
- - Args: - container: Docker container with REPL server running - code: Python code to execute - timeout: Maximum execution time in seconds - working_dir: Working directory for code execution - initial_state: Base64-encoded state to restore before execution - capture_state: Whether to capture state after execution - args: Optional list of command line arguments - - Returns: - Tuple of (exit_code, stdout, stderr, new_state, state_errors) - new_state is base64-encoded cloudpickle, or None if not captured - """ - if timeout is None: - timeout = settings.max_execution_time - - start_time = time.perf_counter() - - # Build request with state options - request = {"code": code, "timeout": timeout, "working_dir": working_dir} - - if initial_state: - request["initial_state"] = initial_state - - if capture_state: - request["capture_state"] = True - - if args: - request["args"] = args - - request_json = json.dumps(request) - request_bytes = request_json.encode("utf-8") + DELIMITER - - try: - # Execute via Docker attach - response = await self._send_and_receive( - container, request_bytes, timeout + 10 - ) - - elapsed_ms = (time.perf_counter() - start_time) * 1000 - logger.debug( - "REPL execution with state completed", - container_id=container.id[:12], - elapsed_ms=f"{elapsed_ms:.1f}", - exit_code=response.get("exit_code", -1), - has_state="state" in response, - ) - - return self._parse_response_with_state(response) - - except asyncio.TimeoutError: - elapsed_ms = (time.perf_counter() - start_time) * 1000 - logger.warning( - "REPL execution timed out", - container_id=container.id[:12], - timeout=timeout, - elapsed_ms=f"{elapsed_ms:.1f}", - ) - return 124, "", f"Execution timed out after {timeout} seconds", None, [] - - except Exception as e: - elapsed_ms = (time.perf_counter() - start_time) * 1000 - logger.error( - "REPL execution failed", - container_id=container.id[:12], - error=str(e), - elapsed_ms=f"{elapsed_ms:.1f}", - ) - return 1, "", f"REPL execution error: {str(e)}", 
None, [] - - async def _send_and_receive( - self, container: Container, request: bytes, timeout: int - ) -> Dict[str, Any]: - """Send request to REPL and receive response. - - Uses Docker attach socket for bidirectional communication - with the REPL server running in the container. - - Args: - container: Docker container - request: Request bytes to send - timeout: Timeout in seconds - - Returns: - Parsed JSON response dict - """ - loop = asyncio.get_event_loop() - - def _sync_communicate(): - """Synchronous communication with container (runs in executor).""" - import time as sync_time - - t0 = sync_time.perf_counter() - - # Attach to container's stdin/stdout - sock = self.client.api.attach_socket( - container.id, - params={"stdin": True, "stdout": True, "stderr": True, "stream": True}, - ) - t1 = sync_time.perf_counter() - - try: - # Get the raw socket - raw_sock = sock._sock - raw_sock.settimeout(timeout) - - # Send request - raw_sock.sendall(request) - t2 = sync_time.perf_counter() - - # Read response until we get the delimiter - response_bytes = b"" - while DELIMITER not in response_bytes: - try: - chunk = raw_sock.recv(4096) - if not chunk: - break - response_bytes += chunk - except Exception as e: - if "timed out" in str(e).lower(): - raise asyncio.TimeoutError() - raise - - t3 = sync_time.perf_counter() - - # Log timing breakdown - logger.debug( - "REPL socket timing", - attach_ms=f"{(t1-t0)*1000:.1f}", - send_ms=f"{(t2-t1)*1000:.1f}", - recv_ms=f"{(t3-t2)*1000:.1f}", - total_ms=f"{(t3-t0)*1000:.1f}", - ) - - # Parse response - if DELIMITER in response_bytes: - json_part = response_bytes.split(DELIMITER)[0] - - # Strip Docker stream headers (multiplexed format) - # Format: [type:1][0:3][size:4][payload] - json_part = self._strip_docker_headers(json_part) - - # Decode with error handling for any remaining binary data - try: - json_str = json_part.decode("utf-8") - except UnicodeDecodeError: - # Try to find JSON in the data by looking for { and } - json_str = 
json_part.decode("utf-8", errors="replace") - # Extract the JSON object - start = json_str.find("{") - end = json_str.rfind("}") - if start >= 0 and end > start: - json_str = json_str[start : end + 1] - - return json.loads(json_str) - else: - return { - "exit_code": 1, - "stdout": "", - "stderr": f"Invalid response from REPL: delimiter not found", - } - - finally: - try: - sock.close() - except: - pass - - # Run sync communication in executor - return await loop.run_in_executor(None, _sync_communicate) - - def _strip_docker_headers(self, data: bytes) -> bytes: - """Strip Docker multiplexed stream headers from data. - - Docker attach socket uses multiplexed format where each chunk - is prefixed with 8 bytes: [type:1][0:3][size:4] - - Args: - data: Raw bytes from Docker socket - - Returns: - Data with stream headers stripped - """ - result = bytearray() - pos = 0 - - while pos < len(data): - # Check for Docker stream header - if pos + 8 <= len(data) and data[pos : pos + 1] in ( - b"\x01", - b"\x02", - b"\x00", - ): - # This looks like a Docker header - # Read the payload size from bytes 4-7 (big-endian) - size = int.from_bytes(data[pos + 4 : pos + 8], byteorder="big") - if size > 0 and pos + 8 + size <= len(data) + 100: # Allow some slack - # Extract payload - payload_start = pos + 8 - payload_end = min(pos + 8 + size, len(data)) - result.extend(data[payload_start:payload_end]) - pos = payload_end - continue - - # Not a header or invalid, try to find JSON start - if data[pos : pos + 1] == b"{": - result.extend(data[pos:]) - break - pos += 1 - - return bytes(result) if result else data - - def _parse_response(self, response: Dict[str, Any]) -> Tuple[int, str, str]: - """Parse REPL response into (exit_code, stdout, stderr). 
- - Args: - response: JSON response from REPL - - Returns: - Tuple of (exit_code, stdout, stderr) - """ - return ( - response.get("exit_code", 1), - response.get("stdout", ""), - response.get("stderr", ""), - ) - - def _parse_response_with_state( - self, response: Dict[str, Any] - ) -> Tuple[int, str, str, Optional[str], List[str]]: - """Parse REPL response including state data. - - Args: - response: JSON response from REPL - - Returns: - Tuple of (exit_code, stdout, stderr, state, state_errors) - """ - return ( - response.get("exit_code", 1), - response.get("stdout", ""), - response.get("stderr", ""), - response.get("state"), # May be None - response.get("state_errors", []), - ) - - async def check_health(self, container: Container, timeout: float = 5.0) -> bool: - """Check if REPL is responsive. - - Sends a simple health check code and verifies response. - - Args: - container: Docker container to check - timeout: Maximum time to wait for response - - Returns: - True if REPL is healthy, False otherwise - """ - try: - exit_code, stdout, stderr = await self.execute( - container, "print('health_check_ok')", timeout=int(timeout) - ) - return exit_code == 0 and "health_check_ok" in stdout - - except Exception as e: - logger.debug( - "REPL health check failed", container_id=container.id[:12], error=str(e) - ) - return False - - async def wait_for_ready( - self, container: Container, timeout: float = 10.0, poll_interval: float = 0.1 - ) -> bool: - """Wait for REPL to be ready. - - The REPL server sends a ready signal when it has finished - pre-loading libraries. This method waits for that signal - or falls back to health check. 
- - Args: - container: Docker container - timeout: Maximum time to wait - poll_interval: Time between checks - - Returns: - True if REPL is ready, False if timeout - """ - start_time = time.perf_counter() - - while (time.perf_counter() - start_time) < timeout: - # Try health check - if await self.check_health(container, timeout=2.0): - elapsed = time.perf_counter() - start_time - logger.info( - "REPL ready", - container_id=container.id[:12], - elapsed_ms=f"{elapsed * 1000:.1f}", - ) - return True - - # Wait before next check - await asyncio.sleep(poll_interval) - - logger.warning( - "REPL ready timeout", container_id=container.id[:12], timeout=timeout - ) - return False diff --git a/src/services/container/utils.py b/src/services/container/utils.py deleted file mode 100644 index 1bbd607..0000000 --- a/src/services/container/utils.py +++ /dev/null @@ -1,102 +0,0 @@ -"""Shared utilities for container operations. - -This module contains common patterns extracted from container services -to reduce code duplication. -""" - -import asyncio -from typing import List, Optional - -import structlog -from docker.models.containers import Container - -logger = structlog.get_logger(__name__) - - -async def wait_for_container_ready( - container: Container, - max_wait: float = 2.0, - interval: float = 0.05, - stable_checks_required: int = 3, -) -> bool: - """ - Wait for a container to reach a stable running state. - - Uses polling with stability checks to ensure the container - is truly running before returning. 
- - Args: - container: Docker container to wait for - max_wait: Maximum time to wait in seconds - interval: Polling interval in seconds - stable_checks_required: Number of consecutive running checks required - - Returns: - True if container is running, False otherwise - """ - stable_checks = 0 - total_wait = 0.0 - - while total_wait < max_wait: - try: - container.reload() - if getattr(container, "status", "") == "running": - stable_checks += 1 - if stable_checks >= stable_checks_required: - return True - else: - stable_checks = 0 - except Exception: - stable_checks = 0 - await asyncio.sleep(interval) - total_wait += interval - - # Final check - try: - container.reload() - return getattr(container, "status", "") == "running" - except Exception: - return False - - -def receive_socket_output( - sock, - chunk_size: int = 4096, - timeout_exceptions: tuple = (TimeoutError, OSError), -) -> bytes: - """ - Receive all output from a socket until closed or timeout. - - Args: - sock: Raw socket to receive from - chunk_size: Size of chunks to receive - timeout_exceptions: Exception types that indicate timeout - - Returns: - All received bytes concatenated - """ - output_chunks: List[bytes] = [] - while True: - try: - chunk = sock.recv(chunk_size) - if not chunk: - break - output_chunks.append(chunk) - except timeout_exceptions: - break - return b"".join(output_chunks) - - -async def run_in_executor(func, *args): - """ - Run a blocking function in the default thread pool executor. - - Args: - func: Blocking function to run - *args: Arguments to pass to the function - - Returns: - Result of the function - """ - loop = asyncio.get_event_loop() - return await loop.run_in_executor(None, func, *args) diff --git a/src/services/detailed_metrics.py b/src/services/detailed_metrics.py deleted file mode 100644 index 97f3e74..0000000 --- a/src/services/detailed_metrics.py +++ /dev/null @@ -1,585 +0,0 @@ -"""Detailed Metrics Service. 
- -Provides extended metrics tracking with: -- Per-API-key usage tracking -- Per-language breakdown -- Container pool metrics -- Hourly/daily aggregation with Redis storage -""" - -import json -from datetime import datetime, timezone, timedelta -from typing import Optional, List, Dict, Any - -import redis.asyncio as redis -import structlog - -from ..config import settings -from ..core.pool import redis_pool -from ..models.metrics import ( - DetailedExecutionMetrics, - LanguageMetrics, - ApiKeyUsageMetrics, - PoolMetricsSummary, - AggregatedMetrics, - MetricsSummary, -) - -logger = structlog.get_logger(__name__) - - -class DetailedMetricsService: - """Service for collecting and querying detailed execution metrics.""" - - # Redis key prefixes - BUFFER_KEY = "metrics:detailed:buffer" - HOURLY_PREFIX = "metrics:detailed:hourly:" - DAILY_PREFIX = "metrics:detailed:daily:" - POOL_STATS_KEY = "metrics:pool:stats" - API_KEY_HOURLY_PREFIX = "metrics:api_key:" - - # Buffer and retention settings - MAX_BUFFER_SIZE = 10000 - HOURLY_TTL = 7 * 24 * 3600 # 7 days - DAILY_TTL = 30 * 24 * 3600 # 30 days - - def __init__(self, redis_client: Optional[redis.Redis] = None): - """Initialize the detailed metrics service. 
- - Args: - redis_client: Optional Redis client, uses shared pool if not provided - """ - self._redis = redis_client - self._in_memory_buffer: List[DetailedExecutionMetrics] = [] - - def register_event_handlers(self) -> None: - """Register event handlers for pool metrics.""" - from ..core.events import ( - event_bus, - ContainerAcquiredFromPool, - ContainerCreatedFresh, - PoolExhausted, - ) - - @event_bus.subscribe(ContainerAcquiredFromPool) - async def handle_pool_hit(event: ContainerAcquiredFromPool): - await self.record_pool_event( - event_type="hit", - language=event.language, - acquire_time_ms=event.acquire_time_ms, - ) - - @event_bus.subscribe(ContainerCreatedFresh) - async def handle_pool_miss(event: ContainerCreatedFresh): - if event.reason in ("pool_empty", "pool_disabled"): - await self.record_pool_event(event_type="miss", language=event.language) - - @event_bus.subscribe(PoolExhausted) - async def handle_pool_exhaustion(event: PoolExhausted): - await self.record_pool_event( - event_type="exhaustion", language=event.language - ) - - logger.info("Registered pool event handlers for metrics") - - @property - def redis(self) -> redis.Redis: - """Get Redis client, initializing if needed.""" - if self._redis is None: - self._redis = redis_pool.get_client() - return self._redis - - async def record_execution(self, metrics: DetailedExecutionMetrics) -> None: - """Record a detailed execution metric. 
- - Args: - metrics: The execution metrics to record - """ - if not settings.detailed_metrics_enabled: - return - - try: - # Add to Redis buffer - await self.redis.lpush(self.BUFFER_KEY, json.dumps(metrics.to_dict())) - - # Trim buffer to max size - await self.redis.ltrim(self.BUFFER_KEY, 0, self.MAX_BUFFER_SIZE - 1) - - # Update hourly aggregates - await self._update_hourly_aggregates(metrics) - - # Update per-API-key metrics - if metrics.api_key_hash: - await self._update_api_key_metrics(metrics) - - # Forward to SQLite for persistent storage - if settings.sqlite_metrics_enabled: - try: - from .sqlite_metrics import sqlite_metrics_service - - await sqlite_metrics_service.record_execution(metrics) - except Exception as sqlite_err: - logger.warning( - "Failed to record metrics to SQLite", - error=str(sqlite_err), - ) - - logger.debug( - "Recorded detailed metrics", - execution_id=metrics.execution_id, - language=metrics.language, - api_key_hash=( - metrics.api_key_hash[:8] if metrics.api_key_hash else "unknown" - ), - ) - - except Exception as e: - logger.warning("Failed to record detailed metrics", error=str(e)) - # Fall back to in-memory buffer - self._in_memory_buffer.append(metrics) - if len(self._in_memory_buffer) > self.MAX_BUFFER_SIZE: - self._in_memory_buffer = self._in_memory_buffer[-self.MAX_BUFFER_SIZE :] - - async def _update_hourly_aggregates( - self, metrics: DetailedExecutionMetrics - ) -> None: - """Update hourly aggregate counters.""" - hour_key = self._get_hour_key(metrics.timestamp) - redis_key = f"{self.HOURLY_PREFIX}{hour_key}" - - pipe = self.redis.pipeline(transaction=False) - - # Increment counters - pipe.hincrby(redis_key, "execution_count", 1) - pipe.hincrbyfloat( - redis_key, "total_execution_time_ms", metrics.execution_time_ms - ) - - if metrics.status == "completed": - pipe.hincrby(redis_key, "success_count", 1) - elif metrics.status == "failed": - pipe.hincrby(redis_key, "failure_count", 1) - elif metrics.status == "timeout": - 
pipe.hincrby(redis_key, "timeout_count", 1) - - if metrics.memory_peak_mb: - pipe.hincrbyfloat(redis_key, "total_memory_mb", metrics.memory_peak_mb) - - # Per-language counters - lang_key = f"lang:{metrics.language}:count" - lang_time_key = f"lang:{metrics.language}:time_ms" - pipe.hincrby(redis_key, lang_key, 1) - pipe.hincrbyfloat(redis_key, lang_time_key, metrics.execution_time_ms) - - if metrics.status != "completed": - lang_error_key = f"lang:{metrics.language}:errors" - pipe.hincrby(redis_key, lang_error_key, 1) - - # Container pool tracking - if metrics.container_source == "pool_hit": - pipe.hincrby(redis_key, "pool_hits", 1) - elif metrics.container_source == "pool_miss": - pipe.hincrby(redis_key, "pool_misses", 1) - - # Set TTL - pipe.expire(redis_key, self.HOURLY_TTL) - - await pipe.execute() - - async def _update_api_key_metrics(self, metrics: DetailedExecutionMetrics) -> None: - """Update per-API-key metrics.""" - hour_key = self._get_hour_key(metrics.timestamp) - redis_key = ( - f"{self.API_KEY_HOURLY_PREFIX}{metrics.api_key_hash[:16]}:hour:{hour_key}" - ) - - pipe = self.redis.pipeline(transaction=False) - pipe.hincrby(redis_key, "execution_count", 1) - pipe.hincrbyfloat( - redis_key, "total_execution_time_ms", metrics.execution_time_ms - ) - - if metrics.status == "completed": - pipe.hincrby(redis_key, "success_count", 1) - else: - pipe.hincrby(redis_key, "failure_count", 1) - - if metrics.memory_peak_mb: - pipe.hincrbyfloat(redis_key, "total_memory_mb", metrics.memory_peak_mb) - - file_ops = metrics.files_uploaded + metrics.files_generated - if file_ops > 0: - pipe.hincrby(redis_key, "file_operations", file_ops) - - pipe.expire(redis_key, 7200) # 2 hours TTL - - await pipe.execute() - - async def record_pool_event( - self, event_type: str, language: str, acquire_time_ms: Optional[float] = None - ) -> None: - """Record a container pool event. 
- - Args: - event_type: Type of event (hit, miss, exhaustion) - language: Language of container - acquire_time_ms: Time to acquire container - """ - try: - pipe = self.redis.pipeline(transaction=False) - - if event_type == "hit": - pipe.hincrby(self.POOL_STATS_KEY, "pool_hits", 1) - elif event_type == "miss": - pipe.hincrby(self.POOL_STATS_KEY, "pool_misses", 1) - elif event_type == "exhaustion": - pipe.hincrby(self.POOL_STATS_KEY, "exhaustion_events", 1) - pipe.hset( - self.POOL_STATS_KEY, - "last_exhaustion", - datetime.now(timezone.utc).isoformat(), - ) - - pipe.hincrby(self.POOL_STATS_KEY, "total_acquisitions", 1) - - if acquire_time_ms: - pipe.hincrbyfloat( - self.POOL_STATS_KEY, "total_acquire_time_ms", acquire_time_ms - ) - - await pipe.execute() - - except Exception as e: - logger.warning("Failed to record pool event", error=str(e)) - - async def get_hourly_metrics( - self, hour: Optional[datetime] = None - ) -> Optional[AggregatedMetrics]: - """Get aggregated metrics for a specific hour. - - Args: - hour: The hour to get metrics for (default: current hour) - - Returns: - AggregatedMetrics or None if no data - """ - if hour is None: - hour = datetime.now(timezone.utc) - - hour_key = self._get_hour_key(hour) - redis_key = f"{self.HOURLY_PREFIX}{hour_key}" - - try: - data = await self.redis.hgetall(redis_key) - if not data: - return None - - return self._parse_hourly_data(data, hour_key, "hourly") - - except Exception as e: - logger.error("Failed to get hourly metrics", error=str(e)) - return None - - async def get_metrics_range( - self, start: datetime, end: datetime, period_type: str = "hourly" - ) -> List[AggregatedMetrics]: - """Get aggregated metrics for a time range. 
- - Args: - start: Start of range - end: End of range - period_type: hourly or daily - - Returns: - List of AggregatedMetrics - """ - results = [] - - if period_type == "hourly": - current = start.replace(minute=0, second=0, microsecond=0) - while current <= end: - metrics = await self.get_hourly_metrics(current) - if metrics: - results.append(metrics) - current += timedelta(hours=1) - - return results - - async def get_language_stats(self, hours: int = 24) -> Dict[str, LanguageMetrics]: - """Get per-language statistics for the last N hours. - - Args: - hours: Number of hours to aggregate - - Returns: - Dict mapping language code to LanguageMetrics - """ - now = datetime.now(timezone.utc) - language_stats: Dict[str, LanguageMetrics] = {} - - for i in range(hours): - hour = now - timedelta(hours=i) - hour_key = self._get_hour_key(hour) - redis_key = f"{self.HOURLY_PREFIX}{hour_key}" - - try: - data = await self.redis.hgetall(redis_key) - if not data: - continue - - # Parse language-specific fields - for key, value in data.items(): - key_str = key.decode() if isinstance(key, bytes) else key - value_str = value.decode() if isinstance(value, bytes) else value - - if key_str.startswith("lang:") and ":count" in key_str: - lang = key_str.split(":")[1] - if lang not in language_stats: - language_stats[lang] = LanguageMetrics(language=lang) - - count = int(value_str) - language_stats[lang].execution_count += count - - # Get corresponding time and error counts - time_key = f"lang:{lang}:time_ms" - error_key = f"lang:{lang}:errors" - - time_data = data.get( - time_key.encode() if isinstance(key, bytes) else time_key - ) - if time_data: - language_stats[lang].total_execution_time_ms += float( - time_data.decode() - if isinstance(time_data, bytes) - else time_data - ) - - error_data = data.get( - error_key.encode() if isinstance(key, bytes) else error_key - ) - if error_data: - language_stats[lang].failure_count += int( - error_data.decode() - if isinstance(error_data, bytes) - 
else error_data - ) - - except Exception as e: - logger.warning( - "Failed to get language stats for hour", hour=hour_key, error=str(e) - ) - - # Calculate derived values - for stats in language_stats.values(): - stats.success_count = stats.execution_count - stats.failure_count - if stats.execution_count > 0: - stats.avg_execution_time_ms = ( - stats.total_execution_time_ms / stats.execution_count - ) - stats.error_rate = (stats.failure_count / stats.execution_count) * 100 - - return language_stats - - async def get_api_key_stats( - self, api_key_hash: str, hours: int = 24 - ) -> ApiKeyUsageMetrics: - """Get usage statistics for a specific API key. - - Args: - api_key_hash: Hash of the API key - hours: Number of hours to aggregate - - Returns: - ApiKeyUsageMetrics - """ - stats = ApiKeyUsageMetrics(api_key_hash=api_key_hash[:16]) - now = datetime.now(timezone.utc) - - for i in range(hours): - hour = now - timedelta(hours=i) - hour_key = self._get_hour_key(hour) - redis_key = ( - f"{self.API_KEY_HOURLY_PREFIX}{api_key_hash[:16]}:hour:{hour_key}" - ) - - try: - data = await self.redis.hgetall(redis_key) - if not data: - continue - - for key, value in data.items(): - key_str = key.decode() if isinstance(key, bytes) else key - value_str = value.decode() if isinstance(value, bytes) else value - - if key_str == "execution_count": - stats.execution_count += int(value_str) - elif key_str == "success_count": - stats.success_count += int(value_str) - elif key_str == "failure_count": - stats.failure_count += int(value_str) - elif key_str == "total_execution_time_ms": - stats.total_execution_time_ms += float(value_str) - elif key_str == "total_memory_mb": - stats.total_memory_mb += float(value_str) - elif key_str == "file_operations": - stats.file_operations += int(value_str) - - except Exception as e: - logger.warning("Failed to get API key stats", error=str(e)) - - # Calculate success rate - if stats.execution_count > 0: - stats.success_rate = (stats.success_count / 
stats.execution_count) * 100 - - return stats - - async def get_pool_stats(self) -> PoolMetricsSummary: - """Get container pool statistics. - - Returns: - PoolMetricsSummary - """ - stats = PoolMetricsSummary() - - try: - data = await self.redis.hgetall(self.POOL_STATS_KEY) - if data: - for key, value in data.items(): - key_str = key.decode() if isinstance(key, bytes) else key - value_str = value.decode() if isinstance(value, bytes) else value - - if key_str == "total_acquisitions": - stats.total_acquisitions = int(value_str) - elif key_str == "pool_hits": - stats.pool_hits = int(value_str) - elif key_str == "pool_misses": - stats.pool_misses = int(value_str) - elif key_str == "exhaustion_events": - stats.exhaustion_events = int(value_str) - elif key_str == "total_acquire_time_ms": - if stats.total_acquisitions > 0: - stats.avg_acquire_time_ms = ( - float(value_str) / stats.total_acquisitions - ) - - # Calculate hit rate - if stats.total_acquisitions > 0: - stats.hit_rate = (stats.pool_hits / stats.total_acquisitions) * 100 - - except Exception as e: - logger.warning("Failed to get pool stats", error=str(e)) - - return stats - - async def get_summary(self) -> MetricsSummary: - """Get high-level metrics summary. 
- - Returns: - MetricsSummary for dashboard display - """ - summary = MetricsSummary() - now = datetime.now(timezone.utc) - - try: - # Get current hour stats - current_hour = await self.get_hourly_metrics(now) - if current_hour: - summary.total_executions_hour = current_hour.execution_count - summary.avg_execution_time_ms = current_hour.avg_execution_time_ms - - # Get today's stats (last 24 hours) - for i in range(24): - hour = now - timedelta(hours=i) - hour_metrics = await self.get_hourly_metrics(hour) - if hour_metrics: - summary.total_executions_today += hour_metrics.execution_count - summary.total_executions += hour_metrics.execution_count - - # Get language breakdown - language_stats = await self.get_language_stats(hours=24) - sorted_languages = sorted( - language_stats.values(), key=lambda x: x.execution_count, reverse=True - )[:5] - summary.top_languages = [ - {"language": s.language, "count": s.execution_count} - for s in sorted_languages - ] - - # Get pool stats - pool_stats = await self.get_pool_stats() - summary.pool_hit_rate = pool_stats.hit_rate - - # Calculate overall success rate - total_success = sum(s.success_count for s in language_stats.values()) - total_all = sum(s.execution_count for s in language_stats.values()) - if total_all > 0: - summary.success_rate = (total_success / total_all) * 100 - - except Exception as e: - logger.error("Failed to get metrics summary", error=str(e)) - - return summary - - def _get_hour_key(self, dt: datetime) -> str: - """Get Redis key suffix for hourly period.""" - return dt.strftime("%Y-%m-%d-%H") - - def _get_day_key(self, dt: datetime) -> str: - """Get Redis key suffix for daily period.""" - return dt.strftime("%Y-%m-%d") - - def _parse_hourly_data( - self, data: Dict[bytes, bytes], period: str, period_type: str - ) -> AggregatedMetrics: - """Parse Redis hash data into AggregatedMetrics.""" - metrics = AggregatedMetrics(period=period, period_type=period_type) - - for key, value in data.items(): - key_str = 
key.decode() if isinstance(key, bytes) else key - value_str = value.decode() if isinstance(value, bytes) else value - - if key_str == "execution_count": - metrics.execution_count = int(value_str) - elif key_str == "success_count": - metrics.success_count = int(value_str) - elif key_str == "failure_count": - metrics.failure_count = int(value_str) - elif key_str == "timeout_count": - metrics.timeout_count = int(value_str) - elif key_str == "total_execution_time_ms": - metrics.total_execution_time_ms = float(value_str) - elif key_str == "total_memory_mb": - metrics.total_memory_mb = float(value_str) - elif key_str == "pool_hits": - if metrics.pool_stats is None: - metrics.pool_stats = PoolMetricsSummary() - metrics.pool_stats.pool_hits = int(value_str) - elif key_str == "pool_misses": - if metrics.pool_stats is None: - metrics.pool_stats = PoolMetricsSummary() - metrics.pool_stats.pool_misses = int(value_str) - - # Calculate averages - if metrics.execution_count > 0: - metrics.avg_execution_time_ms = ( - metrics.total_execution_time_ms / metrics.execution_count - ) - metrics.avg_memory_mb = metrics.total_memory_mb / metrics.execution_count - - return metrics - - -# Global service instance -_detailed_metrics_service: Optional[DetailedMetricsService] = None - - -def get_detailed_metrics_service() -> DetailedMetricsService: - """Get or create detailed metrics service instance.""" - global _detailed_metrics_service - - if _detailed_metrics_service is None: - _detailed_metrics_service = DetailedMetricsService() - _detailed_metrics_service.register_event_handlers() - logger.info("Initialized detailed metrics service with event handlers") - - return _detailed_metrics_service diff --git a/src/services/execution/__init__.py b/src/services/execution/__init__.py index 11a811f..f345432 100644 --- a/src/services/execution/__init__.py +++ b/src/services/execution/__init__.py @@ -12,11 +12,11 @@ # that implements the ExecutionServiceInterface from ..interfaces import 
ExecutionServiceInterface from .runner import CodeExecutionRunner as _Runner -from ..container import ContainerManager +from ..sandbox import SandboxManager class CodeExecutionService(_Runner, ExecutionServiceInterface): - """Service for executing code in Docker containers. + """Service for executing code in nsjail sandboxes. This class provides backward compatibility with the original CodeExecutionService API while using the refactored implementation. @@ -35,8 +35,8 @@ async def execute_code( capture_state: Whether to capture state after execution (Python only) Returns: - Tuple of (CodeExecution, Container, new_state, state_errors) - Container returned directly for thread-safe file retrieval in concurrent requests. + Tuple of (CodeExecution, SandboxInfo, new_state, state_errors) + SandboxInfo returned directly for thread-safe file retrieval in concurrent requests. new_state is base64-encoded cloudpickle, or None if not captured. """ return await self.execute( @@ -72,7 +72,7 @@ def _format_error_message(self, exit_code, stderr): def __del__(self): """Cleanup when service is destroyed.""" try: - self.container_manager.close() + self.sandbox_manager.close() except Exception: pass @@ -81,5 +81,5 @@ def __del__(self): "CodeExecutionService", "CodeExecutionRunner", "OutputProcessor", - "ContainerManager", + "SandboxManager", ] diff --git a/src/services/execution/runner.py b/src/services/execution/runner.py index 31d601a..a036745 100644 --- a/src/services/execution/runner.py +++ b/src/services/execution/runner.py @@ -1,13 +1,14 @@ """Code execution runner - core execution logic.""" import asyncio +import os import shlex +import signal from datetime import datetime, timedelta from pathlib import Path from typing import Any, Dict, List, Optional, Tuple import structlog -from docker.models.containers import Container from ...config import settings from ...config.languages import get_language @@ -19,10 +20,10 @@ ExecuteCodeRequest, ) from ...utils.id_generator import 
generate_execution_id -from ..container import ContainerManager -from ..container.pool import ContainerPool -from ..container.repl_executor import REPLExecutor -from ..metrics import metrics_collector, ExecutionMetrics +from ..sandbox.nsjail import SandboxInfo +from ..sandbox.manager import SandboxManager +from ..sandbox.pool import SandboxPool +from ..sandbox.repl_executor import SandboxREPLExecutor, SandboxREPLProcess from .output import OutputProcessor logger = structlog.get_logger(__name__) @@ -33,58 +34,63 @@ class CodeExecutionRunner: def __init__( self, - container_manager: ContainerManager = None, - container_pool: ContainerPool = None, + sandbox_manager: SandboxManager = None, + sandbox_pool: SandboxPool = None, ): """Initialize the execution runner. Args: - container_manager: Optional container manager instance - container_pool: Optional container pool for fast container acquisition + sandbox_manager: Optional sandbox manager instance + sandbox_pool: Optional sandbox pool for fast sandbox acquisition """ - self.container_manager = container_manager or ContainerManager() - self.container_pool = container_pool + self.sandbox_manager = sandbox_manager or SandboxManager() + self.sandbox_pool = sandbox_pool self.active_executions: Dict[str, CodeExecution] = {} - self.session_containers: Dict[str, Container] = {} + self.session_sandboxes: Dict[str, SandboxInfo] = {} + self._repl_processes: Dict[str, SandboxREPLProcess] = {} - async def _get_container( + def set_sandbox_pool(self, pool: SandboxPool) -> None: + """Set the sandbox pool dependency.""" + self.sandbox_pool = pool + + async def _get_sandbox( self, session_id: str, language: str - ) -> Tuple[Container, str]: - """Get container for execution, using pool if available. + ) -> Tuple[SandboxInfo, str]: + """Get sandbox for execution, using pool if available. Priority: - 1. Get fresh container from pool (fast, ~3ms) - 2. Create new container (fallback, slow) + 1. Get fresh sandbox from pool (fast, ~3ms) + 2. 
Create new sandbox (fallback, slow) Returns: - Tuple of (Container, source) where source is 'pool_hit' or 'pool_miss' + Tuple of (SandboxInfo, source) where source is 'pool_hit' or 'pool_miss' """ # Try pool first if enabled - if self.container_pool and settings.container_pool_enabled: + if self.sandbox_pool and settings.sandbox_pool_enabled: logger.debug( - "Acquiring container from pool", + "Acquiring sandbox from pool", session_id=session_id[:12], pool_enabled=True, ) try: - container = await self.container_pool.acquire(language, session_id) - return container, "pool_hit" + sandbox_info = await self.sandbox_pool.acquire(language, session_id) + return sandbox_info, "pool_hit" except Exception as e: logger.warning( - "Pool acquire failed, falling back to fresh container", + "Pool acquire failed, falling back to fresh sandbox", session_id=session_id[:12], error=str(e), ) else: logger.debug( "Pool not available", - has_pool=self.container_pool is not None, - pool_enabled=settings.container_pool_enabled, + has_pool=self.sandbox_pool is not None, + pool_enabled=settings.sandbox_pool_enabled, ) - # Fallback: create fresh container (original behavior) - container = await self._create_fresh_container(session_id, language) - return container, "pool_miss" + # Fallback: create fresh sandbox (original behavior) + sandbox_info = await self._create_fresh_sandbox(session_id, language) + return sandbox_info, "pool_miss" async def execute( self, @@ -93,7 +99,7 @@ async def execute( files: Optional[List[Dict[str, Any]]] = None, initial_state: Optional[str] = None, capture_state: bool = True, - ) -> Tuple[CodeExecution, Optional[Container], Optional[str], List[str], str]: + ) -> Tuple[CodeExecution, Optional[SandboxInfo], Optional[str], List[str], str]: """Execute code in a session with optional state persistence. 
Args: @@ -104,12 +110,12 @@ async def execute( capture_state: Whether to capture state after execution (Python only) Returns: - Tuple of (CodeExecution record, Container, new_state, state_errors, container_source) + Tuple of (CodeExecution record, SandboxInfo, new_state, state_errors, container_source) container_source is 'pool_hit' or 'pool_miss'. """ execution_id = generate_execution_id() - logger.info( + logger.debug( "Starting code execution", execution_id=execution_id[:8], session_id=session_id, @@ -128,45 +134,43 @@ async def execute( self.active_executions[execution_id] = execution - # Check if Docker is available - if not self.container_manager.is_available(): + # Check if sandbox/nsjail is available + if not self.sandbox_manager.is_available(): logger.error( - "Docker not available", + "Sandbox/nsjail not available", execution_id=execution_id[:8], - error=self.container_manager.get_initialization_error(), + error=self.sandbox_manager.get_initialization_error(), ) execution.status = ExecutionStatus.FAILED execution.completed_at = datetime.utcnow() - execution.error_message = f"Docker service unavailable: {self.container_manager.get_initialization_error()}" + execution.error_message = f"Sandbox service unavailable: {self.sandbox_manager.get_initialization_error()}" return execution, None, None, [], "pool_miss" - container = None + sandbox_info = None container_source = "pool_miss" try: execution.status = ExecutionStatus.RUNNING execution.started_at = datetime.utcnow() - # Get container (from pool or create fresh) - container, container_source = await self._get_container( + # Get sandbox (from pool or create fresh) + sandbox_info, container_source = await self._get_sandbox( session_id, request.language ) # Mount files if provided if files: - await self._mount_files_to_container(container, files, request.language) + await self._mount_files_to_sandbox( + sandbox_info, files, request.language + ) # Execute the code start_time = datetime.utcnow() - # Check if 
this is a REPL container (for optimization) - is_repl = self._is_repl_container(container, request.language) + # Check if this is a REPL sandbox (for optimization) + is_repl = self._is_repl_sandbox(sandbox_info, request.language) - # Skip stats for REPL mode (saves ~1 second per call) + # nsjail doesn't expose detailed per-sandbox resource stats initial_stats = None - if not is_repl: - initial_stats = await self.container_manager.get_container_stats( - container - ) # Execute code with optional state persistence (Python REPL only) new_state = None @@ -181,7 +185,7 @@ async def execute( new_state, state_errors, ) = await self._execute_via_repl_with_state( - container, + sandbox_info, request.code, request.timeout or settings.max_execution_time, initial_state=initial_state, @@ -190,8 +194,8 @@ async def execute( ) else: # Standard execution (no state persistence) - exit_code, stdout, stderr = await self._execute_code_in_container( - container, + exit_code, stdout, stderr = await self._execute_code_in_sandbox( + sandbox_info, request.code, request.language, request.timeout, @@ -201,15 +205,8 @@ async def execute( execution_time_ms = int((end_time - start_time).total_seconds() * 1000) - # Skip final stats for REPL mode + # nsjail doesn't provide per-sandbox memory stats memory_peak_mb = None - if not is_repl: - final_stats = await self.container_manager.get_container_stats( - container - ) - memory_peak_mb = ( - final_stats.get("memory_usage_mb") if final_stats else None - ) # Process outputs outputs = self._process_outputs(stdout, stderr, end_time) @@ -227,7 +224,7 @@ async def execute( generated_files = [] if should_detect_files: - generated_files = await self._detect_generated_files(container) + generated_files = await self._detect_generated_files(sandbox_info) mounted_filenames = self._get_mounted_filenames(files) filtered_files = self._filter_generated_files( @@ -261,9 +258,13 @@ async def execute( exit_code, stderr ) - logger.info( - f"Code execution 
{execution_id} completed: status={execution.status}, " - f"exit_code={exit_code}, time={execution_time_ms}ms, source={container_source}" + logger.debug( + "Code execution completed", + execution_id=execution_id[:8], + status=execution.status.value, + exit_code=exit_code, + time_ms=execution_time_ms, + source=container_source, ) # Log state info if captured @@ -303,10 +304,7 @@ async def execute( state_errors = [] logger.error(f"Code execution {execution_id} failed: {e}") - # Record metrics - self._record_metrics(execution, session_id, request.language, files) - - return execution, container, new_state, state_errors, container_source + return execution, sandbox_info, new_state, state_errors, container_source def _process_outputs( self, stdout: str, stderr: str, timestamp: datetime @@ -358,99 +356,162 @@ def _filter_generated_files( if Path(f.get("path", "")).name not in mounted_filenames ] - def _record_metrics( - self, - execution: CodeExecution, - session_id: str, - language: str, - files: Optional[List[Dict[str, Any]]], - ) -> None: - """Record execution metrics.""" - try: - metrics = ExecutionMetrics( - execution_id=execution.execution_id, - session_id=session_id, - language=language, - status=execution.status.value, - execution_time_ms=execution.execution_time_ms or 0, - memory_peak_mb=execution.memory_peak_mb, - exit_code=execution.exit_code, - file_count=len(files) if files else 0, - output_size_bytes=( - sum(len(o.content) for o in execution.outputs) - if execution.outputs - else 0 - ), - ) - metrics_collector.record_execution_metrics(metrics) - except Exception as e: - logger.error("Failed to record execution metrics", error=str(e)) - - async def _create_fresh_container( + async def _create_fresh_sandbox( self, session_id: str, language: str - ) -> Container: - """Create a fresh container for execution.""" - if session_id in self.session_containers: + ) -> SandboxInfo: + """Create a fresh sandbox for execution.""" + if session_id in self.session_sandboxes: 
try: - await self.container_manager.force_kill_container( - self.session_containers[session_id] - ) + old_sandbox = self.session_sandboxes[session_id] + # Kill any REPL process + repl_proc = self._repl_processes.pop(old_sandbox.sandbox_id, None) + if repl_proc and repl_proc.process.returncode is None: + try: + os.killpg(repl_proc.process.pid, signal.SIGKILL) + except (ProcessLookupError, PermissionError): + try: + repl_proc.process.kill() + except ProcessLookupError: + pass + try: + await repl_proc.process.wait() + except Exception: + pass + self.sandbox_manager.destroy_sandbox(old_sandbox) except Exception: pass finally: - if session_id in self.session_containers: - del self.session_containers[session_id] - - image = self.container_manager.get_image_for_language(language) - await self.container_manager.pull_image_if_needed(image) + if session_id in self.session_sandboxes: + del self.session_sandboxes[session_id] # Enable REPL mode for Python if configured (matches pool behavior) use_repl_mode = language == "py" and settings.repl_enabled - container = self.container_manager.create_container( - image=image, + sandbox_info = self.sandbox_manager.create_sandbox( session_id=session_id, - working_dir="/mnt/data", language=language, repl_mode=use_repl_mode, ) - await self.container_manager.start_container(container) - # For REPL containers, wait for REPL to be ready before returning + # For REPL sandboxes, start the REPL process and wait for ready if use_repl_mode: - repl_executor = REPLExecutor(self.container_manager.client) - ready = await repl_executor.wait_for_ready(container, timeout=10.0) - if not ready: + repl_process = await self._start_repl_process(sandbox_info) + if repl_process: + self._repl_processes[sandbox_info.sandbox_id] = repl_process + else: logger.warning( - "REPL not ready in fresh container, may affect performance", + "REPL not ready in fresh sandbox, may affect performance", session_id=session_id[:12], - container_id=container.id[:12], + 
sandbox_id=sandbox_info.sandbox_id[:12], ) - self.session_containers[session_id] = container - logger.info( - "Fresh container created", + self.session_sandboxes[session_id] = sandbox_info + logger.debug( + "Fresh sandbox created", session_id=session_id, - container_id=container.id[:12], + sandbox_id=sandbox_info.sandbox_id[:12], ) - return container + return sandbox_info - async def _execute_code_in_container( + async def _start_repl_process( + self, sandbox_info: SandboxInfo + ) -> Optional[SandboxREPLProcess]: + """Start a REPL process inside an nsjail sandbox. + + Args: + sandbox_info: Sandbox to start REPL in + + Returns: + SandboxREPLProcess if successful, None if failed + """ + try: + from ..sandbox.nsjail import NsjailConfig + + nsjail_config = NsjailConfig() + + # Build nsjail args for REPL mode + env = self.sandbox_manager.executor._build_sanitized_env("py") + nsjail_args = nsjail_config.build_args( + sandbox_dir=str(sandbox_info.data_dir), + command=["/usr/bin/python3", "/opt/repl_server.py"], + language="py", + repl_mode=True, + env=env, + ) + + # Wrap nsjail in unshare+mount for security isolation + nsjail_cmd = " ".join( + shlex.quote(str(a)) for a in [settings.nsjail_binary] + nsjail_args + ) + wrapper_cmd = ( + f"mount --bind {shlex.quote(str(sandbox_info.data_dir))} /mnt/data && " + f"mount -t tmpfs -o size=1k tmpfs /var/lib/code-interpreter/sandboxes && " + f"mount -t tmpfs -o size=1k tmpfs /app/data && " + f"mount -t tmpfs -o size=1k tmpfs /var/log && " + f"mount -t tmpfs -o size=1k tmpfs /app/ssl && " + f"mount -t tmpfs -o size=1k tmpfs /app/dashboard && " + f"mount -t tmpfs -o size=1k tmpfs /app/src && " + # BUG-003: Bind /dev/null over mountinfo to hide mount details + f"mount --bind /dev/null /proc/self/mountinfo && " + f"{nsjail_cmd}" + ) + + # Start the nsjail subprocess with REPL via unshare wrapper + proc = await asyncio.create_subprocess_exec( + "unshare", + "--mount", + "--", + "/bin/sh", + "-c", + wrapper_cmd, + 
stdin=asyncio.subprocess.PIPE, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ) + + repl_process = SandboxREPLProcess( + process=proc, + sandbox_info=sandbox_info, + ) + + # Wait for REPL to be ready + repl_executor = SandboxREPLExecutor() + ready = await repl_executor.wait_for_ready( + repl_process, + timeout=settings.repl_warmup_timeout_seconds, + ) + + if not ready: + proc.kill() + await proc.wait() + return None + + return repl_process + + except Exception as e: + logger.error( + "Failed to start REPL process", + sandbox_id=sandbox_info.sandbox_id[:12], + error=str(e), + ) + return None + + async def _execute_code_in_sandbox( self, - container: Container, + sandbox_info: SandboxInfo, code: str, language: str, timeout: Optional[int] = None, args: Optional[List[str]] = None, ) -> Tuple[int, str, str]: - """Execute code in the container. + """Execute code in the sandbox. - For REPL-enabled containers (Python with REPL mode), uses the fast + For REPL-enabled sandboxes (Python with REPL mode), uses the fast REPL executor which communicates with the pre-warmed Python interpreter. - For other containers, uses the standard execution path. + For other sandboxes, uses the standard execution path. 
Args: - container: Docker container to execute in + sandbox_info: Sandbox to execute in code: Code to execute language: Programming language timeout: Execution timeout in seconds @@ -463,22 +524,24 @@ async def _execute_code_in_container( execution_timeout = timeout or settings.max_execution_time - # Check if container is REPL-enabled for faster execution - if self._is_repl_container(container, language): + # Check if sandbox is REPL-enabled for faster execution + if self._is_repl_sandbox(sandbox_info, language): logger.debug( - "Using REPL executor", container_id=container.id[:12], language=language + "Using REPL executor", + sandbox_id=sandbox_info.sandbox_id[:12], + language=language, ) return await self._execute_via_repl( - container, code, execution_timeout, args=args + sandbox_info, code, execution_timeout, args=args ) - # Standard execution path for non-REPL containers + # Standard execution path for non-REPL sandboxes exec_command = lang_config.execution_command # For stdin-based languages (except ts which compiles first) if lang_config.uses_stdin and language != "ts": - return await self.container_manager.execute_command( - container, + return await self.sandbox_manager.execute_command( + sandbox_info, exec_command, timeout=execution_timeout, language=language, @@ -493,12 +556,12 @@ async def _execute_code_in_container( elif language == "ts": code_filename = "code.ts" - # Direct memory-to-container transfer (no tempfiles) + # Direct memory-to-sandbox transfer (no tempfiles) dest_path = f"/mnt/data/{code_filename}" - if not await self.container_manager.copy_content_to_container( - container, code.encode("utf-8"), dest_path, language=language + if not self.sandbox_manager.copy_content_to_sandbox( + sandbox_info, code.encode("utf-8"), dest_path, language=language ): - return 1, "", "Failed to write code file to container" + return 1, "", "Failed to write code file to sandbox" # Build execution command with args if provided final_command = exec_command @@ 
-507,23 +570,22 @@ async def _execute_code_in_container( quoted_args = " ".join(shlex.quote(arg) for arg in args) final_command = f"{exec_command} {quoted_args}" - return await self.container_manager.execute_command( - container, + return await self.sandbox_manager.execute_command( + sandbox_info, final_command, timeout=execution_timeout, language=language, - working_dir="/mnt/data", ) - def _is_repl_container(self, container: Container, language: str) -> bool: - """Check if container is running in REPL mode. + def _is_repl_sandbox(self, sandbox_info: SandboxInfo, language: str) -> bool: + """Check if sandbox is running in REPL mode. Args: - container: Docker container to check + sandbox_info: Sandbox to check language: Programming language Returns: - True if container has REPL mode enabled, False otherwise + True if sandbox has REPL mode enabled, False otherwise """ # Only Python supports REPL mode currently if language != "py": @@ -533,27 +595,19 @@ def _is_repl_container(self, container: Container, language: str) -> bool: if not settings.repl_enabled: return False - try: - # Check container labels for REPL mode (no reload needed - labels set at creation) - labels = container.labels or {} - return labels.get("com.code-interpreter.repl-mode") == "true" - except Exception as e: - logger.debug( - "Error checking REPL mode", container_id=container.id[:12], error=str(e) - ) - return False + return sandbox_info.repl_mode async def _execute_via_repl( self, - container: Container, + sandbox_info: SandboxInfo, code: str, timeout: int, args: Optional[List[str]] = None, ) -> Tuple[int, str, str]: - """Execute code via REPL server in container. + """Execute code via REPL server in sandbox. 
Args: - container: Docker container with REPL server running + sandbox_info: Sandbox with REPL server running code: Python code to execute timeout: Maximum execution time in seconds args: Optional list of command line arguments @@ -561,14 +615,28 @@ async def _execute_via_repl( Returns: Tuple of (exit_code, stdout, stderr) """ - repl_executor = REPLExecutor(self.container_manager.client) + # Get REPL process: try pool first, then local tracking + repl_process = None + if self.sandbox_pool: + repl_process = self.sandbox_pool.get_repl_process(sandbox_info) + if not repl_process: + repl_process = self._repl_processes.get(sandbox_info.sandbox_id) + + if not repl_process: + logger.warning( + "No REPL process found for sandbox", + sandbox_id=sandbox_info.sandbox_id[:12], + ) + return 1, "", "REPL process not available" + + repl_executor = SandboxREPLExecutor() return await repl_executor.execute( - container, code, timeout=timeout, working_dir="/mnt/data", args=args + repl_process, code, timeout=timeout, working_dir="/mnt/data", args=args ) async def _execute_via_repl_with_state( self, - container: Container, + sandbox_info: SandboxInfo, code: str, timeout: int, initial_state: Optional[str] = None, @@ -578,7 +646,7 @@ async def _execute_via_repl_with_state( """Execute code via REPL server with state persistence. 
Args: - container: Docker container with REPL server running + sandbox_info: Sandbox with REPL server running code: Python code to execute timeout: Maximum execution time in seconds initial_state: Base64-encoded state to restore before execution @@ -588,9 +656,23 @@ async def _execute_via_repl_with_state( Returns: Tuple of (exit_code, stdout, stderr, new_state, state_errors) """ - repl_executor = REPLExecutor(self.container_manager.client) + # Get REPL process: try pool first, then local tracking + repl_process = None + if self.sandbox_pool: + repl_process = self.sandbox_pool.get_repl_process(sandbox_info) + if not repl_process: + repl_process = self._repl_processes.get(sandbox_info.sandbox_id) + + if not repl_process: + logger.warning( + "No REPL process found for sandbox", + sandbox_id=sandbox_info.sandbox_id[:12], + ) + return 1, "", "REPL process not available", None, [] + + repl_executor = SandboxREPLExecutor() return await repl_executor.execute_with_state( - container, + repl_process, code, timeout=timeout, working_dir="/mnt/data", @@ -599,10 +681,13 @@ async def _execute_via_repl_with_state( args=args, ) - async def _mount_files_to_container( - self, container: Container, files: List[Dict[str, Any]], language: str = "py" + async def _mount_files_to_sandbox( + self, + sandbox_info: SandboxInfo, + files: List[Dict[str, Any]], + language: str = "py", ) -> None: - """Mount files to container workspace.""" + """Mount files to sandbox workspace.""" try: from ..file import FileService @@ -623,82 +708,77 @@ async def _mount_files_to_container( ) if file_content is not None: - # Direct memory-to-container transfer (no tempfiles) + # Direct memory-to-sandbox transfer (no tempfiles) normalized_filename = OutputProcessor.sanitize_filename( filename ) dest_path = f"/mnt/data/{normalized_filename}" - if await self.container_manager.copy_content_to_container( - container, file_content, dest_path, language=language + if self.sandbox_manager.copy_content_to_sandbox( + 
sandbox_info, file_content, dest_path, language=language ): - logger.info( + logger.debug( "Mounted file", filename=filename, size=len(file_content), ) else: logger.warning("Failed to mount file", filename=filename) - await self._create_placeholder_file(container, filename) + await self._create_placeholder_file(sandbox_info, filename) else: logger.warning( f"Could not retrieve content for file {filename}" ) - await self._create_placeholder_file(container, filename) + await self._create_placeholder_file(sandbox_info, filename) except Exception as file_error: logger.error(f"Error retrieving file {filename}: {file_error}") - await self._create_placeholder_file(container, filename) + await self._create_placeholder_file(sandbox_info, filename) except Exception as e: - logger.error(f"Failed to mount files to container: {e}") + logger.error(f"Failed to mount files to sandbox: {e}") async def _create_placeholder_file( - self, container: Container, filename: str + self, sandbox_info: SandboxInfo, filename: str ) -> None: """Create a placeholder file when content cannot be retrieved.""" try: normalized_filename = OutputProcessor.sanitize_filename(filename) - create_command = f"""cat > /mnt/data/{normalized_filename} << 'EOF' -# File: {filename} -# This is a placeholder - original file could not be retrieved -EOF""" - await self.container_manager.execute_command( - container, create_command, timeout=10 + placeholder = f"# File: {filename}\n# This is a placeholder - original file could not be retrieved\n" + self.sandbox_manager.copy_content_to_sandbox( + sandbox_info, + placeholder.encode(), + f"/mnt/data/{normalized_filename}", + "py", ) except Exception as e: logger.error(f"Failed to create placeholder file: {e}") async def _detect_generated_files( - self, container: Container + self, sandbox_info: SandboxInfo ) -> List[Dict[str, Any]]: """Detect files generated during execution.""" try: - exit_code, stdout, stderr = await self.container_manager.execute_command( - container, 
- "find /mnt/data -maxdepth 1 -type f -name '*' ! -name 'code' ! -name 'code.*' ! -name 'Code.*' -exec ls -la {} \\;", - timeout=5, - ) + generated_files = [] + data_dir = sandbox_info.data_dir - if exit_code != 0 or not stdout.strip(): + if not data_dir.exists(): return [] - generated_files = [] - for line in stdout.strip().split("\n"): - if line.strip(): - parts = line.split() - if len(parts) >= 9: - size = int(parts[4]) if parts[4].isdigit() else 0 - filename = " ".join(parts[8:]) - - if size > settings.max_file_size_mb * 1024 * 1024: - continue + for name in os.listdir(data_dir): + # Skip code files + if name.startswith("code") or name.startswith("Code."): + continue + filepath = data_dir / name + if filepath.is_file(): + size = filepath.stat().st_size + if size <= settings.max_file_size_mb * 1024 * 1024: generated_files.append( { - "path": filename, + "path": f"/mnt/data/{name}", "size": size, - "mime_type": OutputProcessor.guess_mime_type(filename), + "mime_type": OutputProcessor.guess_mime_type(name), } ) @@ -725,11 +805,24 @@ async def cancel_execution(self, execution_id: str) -> bool: return False try: - container = self.session_containers.get(execution.session_id) - if container: - await self.container_manager.stop_container(container) - await self.container_manager.remove_container(container) - del self.session_containers[execution.session_id] + sandbox_info = self.session_sandboxes.get(execution.session_id) + if sandbox_info: + # Kill any REPL process + repl_proc = self._repl_processes.pop(sandbox_info.sandbox_id, None) + if repl_proc and repl_proc.process.returncode is None: + try: + os.killpg(repl_proc.process.pid, signal.SIGKILL) + except (ProcessLookupError, PermissionError): + try: + repl_proc.process.kill() + except ProcessLookupError: + pass + try: + await repl_proc.process.wait() + except Exception: + pass + self.sandbox_manager.destroy_sandbox(sandbox_info) + del self.session_sandboxes[execution.session_id] execution.status = 
ExecutionStatus.CANCELLED execution.completed_at = datetime.utcnow() @@ -755,10 +848,24 @@ async def list_executions( async def cleanup_session(self, session_id: str) -> bool: """Clean up resources for a session.""" try: - if session_id in self.session_containers: - container = self.session_containers[session_id] - await self.container_manager.force_kill_container(container) - del self.session_containers[session_id] + if session_id in self.session_sandboxes: + sandbox_info = self.session_sandboxes[session_id] + # Kill any REPL process + repl_proc = self._repl_processes.pop(sandbox_info.sandbox_id, None) + if repl_proc and repl_proc.process.returncode is None: + try: + os.killpg(repl_proc.process.pid, signal.SIGKILL) + except (ProcessLookupError, PermissionError): + try: + repl_proc.process.kill() + except ProcessLookupError: + pass + try: + await repl_proc.process.wait() + except Exception: + pass + self.sandbox_manager.destroy_sandbox(sandbox_info) + del self.session_sandboxes[session_id] execution_ids = [ eid @@ -768,7 +875,7 @@ async def cleanup_session(self, session_id: str) -> bool: for eid in execution_ids: del self.active_executions[eid] - logger.info("Cleaned up session resources", session_id=session_id) + logger.debug("Cleaned up session resources", session_id=session_id) return True except Exception as e: @@ -794,19 +901,36 @@ async def cleanup_expired_executions(self, max_age_hours: int = 24) -> int: for eid in expired: del self.active_executions[eid] - logger.info(f"Cleaned up {len(expired)} expired executions") + if expired: + logger.info(f"Cleaned up {len(expired)} expired executions") + else: + logger.debug("No expired executions to clean up") return len(expired) - async def cleanup_all_containers(self) -> None: - """Clean up all active containers during shutdown.""" - logger.info("Cleaning up all containers", count=len(self.session_containers)) + async def cleanup_all_sandboxes(self) -> None: + """Clean up all active sandboxes during shutdown.""" + 
logger.info("Cleaning up all sandboxes", count=len(self.session_sandboxes)) - containers = list(self.session_containers.values()) - if containers: - cleaned = await self.container_manager.force_kill_containers_batch( - containers - ) - logger.info(f"Cleaned up {cleaned}/{len(containers)} containers") + # Kill all REPL processes + for sandbox_id, repl_proc in list(self._repl_processes.items()): + try: + if repl_proc.process.returncode is None: + repl_proc.process.kill() + await repl_proc.process.wait() + except Exception: + pass + self._repl_processes.clear() + + # Destroy all sandboxes + cleaned = 0 + for session_id, sandbox_info in list(self.session_sandboxes.items()): + try: + self.sandbox_manager.destroy_sandbox(sandbox_info) + cleaned += 1 + except Exception: + pass + + logger.info(f"Cleaned up {cleaned}/{len(self.session_sandboxes)} sandboxes") - self.session_containers.clear() + self.session_sandboxes.clear() self.active_executions.clear() diff --git a/src/services/file.py b/src/services/file.py index 5fd3406..84e5626 100644 --- a/src/services/file.py +++ b/src/services/file.py @@ -67,7 +67,7 @@ def _get_file_key( """Generate S3 object key for a file.""" return f"sessions/{session_id}/{file_type}/{file_id}" - def _get_file_metadata_key(self, session_id: str, file_id: str) -> str: + def get_file_metadata_key(self, session_id: str, file_id: str) -> str: """Generate Redis key for file metadata.""" return f"files:{session_id}:{file_id}" @@ -80,7 +80,7 @@ async def _store_file_metadata( ) -> None: """Store file metadata in Redis.""" try: - metadata_key = self._get_file_metadata_key(session_id, file_id) + metadata_key = self.get_file_metadata_key(session_id, file_id) session_files_key = self._get_session_files_key(session_id) # Store file metadata @@ -103,12 +103,12 @@ async def _store_file_metadata( ) raise - async def _get_file_metadata( + async def get_file_metadata( self, session_id: str, file_id: str ) -> Optional[Dict[str, Any]]: """Retrieve file metadata 
from Redis.""" try: - metadata_key = self._get_file_metadata_key(session_id, file_id) + metadata_key = self.get_file_metadata_key(session_id, file_id) metadata = await self.redis_client.hgetall(metadata_key) if not metadata: @@ -134,7 +134,7 @@ async def _get_file_metadata( async def _delete_file_metadata(self, session_id: str, file_id: str) -> None: """Delete file metadata from Redis.""" try: - metadata_key = self._get_file_metadata_key(session_id, file_id) + metadata_key = self.get_file_metadata_key(session_id, file_id) session_files_key = self._get_session_files_key(session_id) # Delete metadata @@ -152,6 +152,39 @@ async def _delete_file_metadata(self, session_id: str, file_id: str) -> None: ) raise + def validate_uploads( + self, + filenames: List[str], + file_sizes: List[Optional[int]], + ) -> Optional[Tuple[int, str]]: + """Validate upload files against size, count, and type restrictions. + + Args: + filenames: List of filenames to validate + file_sizes: List of file sizes (may contain None for unknown sizes) + + Returns: + None if valid, or (http_status_code, error_message) tuple if invalid + """ + for filename, size in zip(filenames, file_sizes): + if size and size > settings.max_file_size_mb * 1024 * 1024: + return ( + 413, + f"File {filename} exceeds maximum size of {settings.max_file_size_mb}MB", + ) + + if len(filenames) > settings.max_files_per_session: + return ( + 413, + f"Too many files. 
Maximum {settings.max_files_per_session} files allowed", + ) + + for filename in filenames: + if not settings.is_file_allowed(filename or ""): + return (415, f"File type not allowed: {filename}") + + return None + async def upload_file( self, session_id: str, request: FileUploadRequest ) -> Tuple[str, str]: @@ -189,7 +222,7 @@ async def upload_file( await self._store_file_metadata(session_id, file_id, metadata) - logger.info( + logger.debug( "Generated file upload URL", session_id=session_id, file_id=file_id, @@ -206,7 +239,7 @@ async def upload_file( async def confirm_upload(self, session_id: str, file_id: str) -> FileInfo: """Confirm file upload completion and return file info.""" - metadata = await self._get_file_metadata(session_id, file_id) + metadata = await self.get_file_metadata(session_id, file_id) if not metadata: raise ValueError(f"File {file_id} not found in session {session_id}") @@ -223,7 +256,7 @@ async def confirm_upload(self, session_id: str, file_id: str) -> FileInfo: metadata["size"] = stat.size await self._store_file_metadata(session_id, file_id, metadata) - logger.info( + logger.debug( "Confirmed file upload", session_id=session_id, file_id=file_id, @@ -250,7 +283,7 @@ async def confirm_upload(self, session_id: str, file_id: str) -> FileInfo: async def get_file_info(self, session_id: str, file_id: str) -> Optional[FileInfo]: """Get file information.""" - metadata = await self._get_file_metadata(session_id, file_id) + metadata = await self.get_file_metadata(session_id, file_id) if not metadata: return None @@ -297,7 +330,7 @@ async def list_files(self, session_id: str) -> List[FileInfo]: async def download_file(self, session_id: str, file_id: str) -> Optional[str]: """Generate download URL for a file.""" - metadata = await self._get_file_metadata(session_id, file_id) + metadata = await self.get_file_metadata(session_id, file_id) if not metadata: return None @@ -327,7 +360,7 @@ async def download_file(self, session_id: str, file_id: str) -> 
Optional[str]: async def delete_file(self, session_id: str, file_id: str) -> bool: """Delete a file from the session.""" - metadata = await self._get_file_metadata(session_id, file_id) + metadata = await self.get_file_metadata(session_id, file_id) if not metadata: return False @@ -343,7 +376,7 @@ async def delete_file(self, session_id: str, file_id: str) -> bool: # Delete metadata from Redis await self._delete_file_metadata(session_id, file_id) - logger.info("Deleted file", session_id=session_id, file_id=file_id) + logger.debug("Deleted file", session_id=session_id, file_id=file_id) return True except S3Error as e: @@ -403,7 +436,7 @@ async def cleanup_session_files(self, session_id: str) -> int: error=str(e), ) - logger.info( + logger.debug( "Cleaned up session files", session_id=session_id, deleted_count=deleted_count, @@ -485,13 +518,12 @@ async def store_execution_output_file( await self._store_file_metadata(session_id, file_id, metadata) - logger.info( + logger.debug( "Stored execution output file", session_id=session_id, file_id=file_id, filename=filename, size=len(content), - state_hash=state_hash[:12] if state_hash else None, ) return file_id @@ -507,7 +539,7 @@ async def store_execution_output_file( async def get_file_content(self, session_id: str, file_id: str) -> Optional[bytes]: """Get file content directly (for internal use).""" - metadata = await self._get_file_metadata(session_id, file_id) + metadata = await self.get_file_metadata(session_id, file_id) if not metadata: return None @@ -598,7 +630,7 @@ async def store_uploaded_file( await self._store_file_metadata(session_id, file_id, metadata) - logger.info( + logger.debug( "Stored uploaded file", session_id=session_id, file_id=file_id, @@ -757,7 +789,7 @@ async def get_file_state_hash(self, session_id: str, file_id: str) -> Optional[s SHA256 hash of the state when this file was last used, or None """ try: - metadata_key = self._get_file_metadata_key(session_id, file_id) + metadata_key = 
self.get_file_metadata_key(session_id, file_id) state_hash = await self.redis_client.hget(metadata_key, "state_hash") return state_hash except Exception as e: @@ -788,7 +820,7 @@ async def update_file_state_hash( True if update was successful """ try: - metadata_key = self._get_file_metadata_key(session_id, file_id) + metadata_key = self.get_file_metadata_key(session_id, file_id) now = datetime.utcnow().isoformat() # Update multiple fields atomically @@ -842,7 +874,7 @@ async def update_file_content( """ try: # Get existing metadata to find object_key - metadata = await self._get_file_metadata(session_id, file_id) + metadata = await self.get_file_metadata(session_id, file_id) if not metadata: logger.warning( "File not found for content update", @@ -889,7 +921,7 @@ async def update_file_content( if execution_id: updates["execution_id"] = execution_id - metadata_key = self._get_file_metadata_key(session_id, file_id) + metadata_key = self.get_file_metadata_key(session_id, file_id) await self.redis_client.hset(metadata_key, mapping=updates) logger.debug( diff --git a/src/services/health.py b/src/services/health.py index 8959060..70e69c5 100644 --- a/src/services/health.py +++ b/src/services/health.py @@ -2,13 +2,15 @@ # Standard library imports import asyncio +import shutil +import subprocess import time from datetime import datetime, timezone from enum import Enum +from pathlib import Path from typing import Dict, Any, Optional # Third-party imports -import docker import redis.asyncio as redis import structlog from minio import Minio @@ -73,16 +75,15 @@ class HealthCheckService: def __init__(self): """Initialize health check service.""" self._redis_client: Optional[redis.Redis] = None - self._docker_client: Optional[docker.DockerClient] = None self._minio_client: Optional[Minio] = None - self._container_pool = None + self._sandbox_pool = None self._last_check_time: Optional[datetime] = None self._cached_results: Dict[str, HealthCheckResult] = {} 
self._cache_ttl_seconds = 30 # Cache results for 30 seconds - def set_container_pool(self, pool) -> None: - """Set container pool reference for health checks.""" - self._container_pool = pool + def set_sandbox_pool(self, pool) -> None: + """Set sandbox pool reference for health checks.""" + self._sandbox_pool = pool async def check_all_services( self, use_cache: bool = True @@ -104,14 +105,14 @@ async def check_all_services( tasks = [ self.check_redis(), self.check_minio(), - self.check_docker(), + self.check_nsjail(), ] - service_names = ["redis", "minio", "docker"] + service_names = ["redis", "minio", "nsjail"] - # Add container pool check if pool is configured - if self._container_pool and settings.container_pool_enabled: - tasks.append(self.check_container_pool()) - service_names.append("container_pool") + # Add sandbox pool check if pool is configured + if self._sandbox_pool and settings.sandbox_pool_enabled: + tasks.append(self.check_sandbox_pool()) + service_names.append("sandbox_pool") results = await asyncio.gather(*tasks, return_exceptions=True) @@ -338,77 +339,70 @@ async def check_minio(self) -> HealthCheckResult: error=str(e), ) - async def check_docker(self) -> HealthCheckResult: - """Check Docker daemon connectivity and performance.""" + async def check_nsjail(self) -> HealthCheckResult: + """Check nsjail binary availability and sandbox base directory.""" start_time = time.time() try: - # Create Docker client if not exists - if not self._docker_client: - try: - # Try to use the default Docker socket - self._docker_client = docker.from_env( - timeout=settings.health_check_timeout - ) - except Exception as e: - logger.warning( - f"Failed to create Docker client from environment: {e}" - ) - # Fallback to explicit socket path - self._docker_client = docker.DockerClient( - base_url="unix://var/run/docker.sock", - timeout=settings.health_check_timeout, - ) - - # Test basic connectivity - loop = asyncio.get_event_loop() - version_info = await 
loop.run_in_executor(None, self._docker_client.version) - - # Get system info - system_info = await loop.run_in_executor(None, self._docker_client.info) - - # List containers to test API functionality - containers = await loop.run_in_executor( - None, self._docker_client.containers.list, True - ) + # Check if nsjail binary exists + nsjail_path = shutil.which(settings.nsjail_binary) + if not nsjail_path: + response_time = (time.time() - start_time) * 1000 + return HealthCheckResult( + service="nsjail", + status=HealthStatus.UNHEALTHY, + response_time_ms=response_time, + error=f"nsjail binary not found: {settings.nsjail_binary}", + ) - # Check if we can pull a simple image (test registry connectivity) + # Get nsjail version + version = "unknown" try: - await loop.run_in_executor( - None, self._docker_client.images.pull, "hello-world:latest" + loop = asyncio.get_event_loop() + result = await loop.run_in_executor( + None, + lambda: subprocess.run( + [nsjail_path, "--help"], + capture_output=True, + text=True, + timeout=5, + ), ) - registry_accessible = True - except Exception as e: - logger.warning("Docker registry not accessible", error=str(e)) - registry_accessible = False + # nsjail --help outputs to stderr + output = result.stderr or result.stdout or "" + for line in output.split("\n"): + if "version" in line.lower() or "nsjail" in line.lower(): + version = line.strip() + break + except Exception: + pass + + # Check sandbox base directory + sandbox_base = Path(settings.sandbox_base_dir) + base_dir_exists = sandbox_base.exists() + base_dir_writable = False + if base_dir_exists: + import os + + base_dir_writable = os.access(str(sandbox_base), os.W_OK) response_time = (time.time() - start_time) * 1000 # Determine status status = HealthStatus.HEALTHY - if response_time > 3000: # > 3 seconds + if not base_dir_exists or not base_dir_writable: status = HealthStatus.DEGRADED - elif not registry_accessible: - status = HealthStatus.DEGRADED - - # Calculate resource usage - 
total_containers = len(containers) - running_containers = len([c for c in containers if c.status == "running"]) details = { - "version": version_info.get("Version", "unknown"), - "api_version": version_info.get("ApiVersion", "unknown"), - "platform": version_info.get("Platform", {}).get("Name", "unknown"), - "total_containers": total_containers, - "running_containers": running_containers, - "registry_accessible": registry_accessible, - "server_version": system_info.get("ServerVersion", "unknown"), - "memory_total_gb": round(system_info.get("MemTotal", 0) / (1024**3), 2), - "cpu_count": system_info.get("NCPU", 0), + "binary_path": nsjail_path, + "version": version, + "sandbox_base_dir": str(sandbox_base), + "base_dir_exists": base_dir_exists, + "base_dir_writable": base_dir_writable, } return HealthCheckResult( - service="docker", + service="nsjail", status=status, response_time_ms=response_time, details=details, @@ -417,32 +411,32 @@ async def check_docker(self) -> HealthCheckResult: except Exception as e: response_time = (time.time() - start_time) * 1000 logger.error( - "Docker health check failed", + "nsjail health check failed", error=str(e), response_time_ms=response_time, ) return HealthCheckResult( - service="docker", + service="nsjail", status=HealthStatus.UNHEALTHY, response_time_ms=response_time, error=str(e), ) - async def check_container_pool(self) -> HealthCheckResult: - """Check container pool health and statistics.""" + async def check_sandbox_pool(self) -> HealthCheckResult: + """Check sandbox pool health and statistics.""" start_time = time.time() try: - if not self._container_pool: + if not self._sandbox_pool: return HealthCheckResult( - service="container_pool", + service="sandbox_pool", status=HealthStatus.UNKNOWN, - error="Container pool not configured", + error="Sandbox pool not configured", ) # Get pool statistics - stats = self._container_pool.get_stats() + stats = self._sandbox_pool.get_stats() response_time = (time.time() - start_time) * 
1000 @@ -476,7 +470,7 @@ async def check_container_pool(self) -> HealthCheckResult: details = { "enabled": True, - "architecture": "stateless", # Containers destroyed after each execution + "architecture": "stateless", # Sandboxes destroyed after each execution "total_available": total_available, "total_acquisitions": total_acquisitions, "pool_hits": pool_hits, @@ -486,7 +480,7 @@ async def check_container_pool(self) -> HealthCheckResult: } return HealthCheckResult( - service="container_pool", + service="sandbox_pool", status=status, response_time_ms=response_time, details=details, @@ -494,10 +488,10 @@ async def check_container_pool(self) -> HealthCheckResult: except Exception as e: response_time = (time.time() - start_time) * 1000 - logger.error("Container pool health check failed", error=str(e)) + logger.error("Sandbox pool health check failed", error=str(e)) return HealthCheckResult( - service="container_pool", + service="sandbox_pool", status=HealthStatus.UNHEALTHY, response_time_ms=response_time, error=str(e), @@ -540,22 +534,6 @@ async def close(self) -> None: f"Error closing Redis connection during shutdown: {e}" ) - # Close Docker connection with timeout - if self._docker_client: - try: - # Docker client close is synchronous, but wrap in executor with timeout - loop = asyncio.get_event_loop() - await asyncio.wait_for( - loop.run_in_executor(None, self._docker_client.close), - timeout=2.0, - ) - except asyncio.TimeoutError: - logger.warning("Docker connection close timed out during shutdown") - except Exception as e: - logger.warning( - f"Error closing Docker connection during shutdown: {e}" - ) - logger.info("Closed health check service connections") except Exception as e: diff --git a/src/services/interfaces.py b/src/services/interfaces.py index ab50899..d464b4e 100644 --- a/src/services/interfaces.py +++ b/src/services/interfaces.py @@ -124,53 +124,3 @@ async def delete_file(self, session_id: str, file_id: str) -> bool: async def 
cleanup_session_files(self, session_id: str) -> int: """Clean up all files for a session. Returns count of deleted files.""" pass - - -class ContainerServiceInterface(ABC): - """Interface for container management service.""" - - @abstractmethod - async def create_container(self, session_id: str) -> str: - """Create a new container for a session. Returns container_id.""" - pass - - @abstractmethod - async def get_container_status(self, container_id: str) -> Optional[str]: - """Get container status.""" - pass - - @abstractmethod - async def execute_in_container( - self, container_id: str, command: str, timeout: int - ) -> Tuple[int, str, str]: - """Execute command in container. Returns (exit_code, stdout, stderr).""" - pass - - @abstractmethod - async def copy_file_to_container( - self, container_id: str, source_path: str, dest_path: str - ) -> bool: - """Copy file to container.""" - pass - - @abstractmethod - async def copy_file_from_container( - self, container_id: str, source_path: str, dest_path: str - ) -> bool: - """Copy file from container.""" - pass - - @abstractmethod - async def stop_container(self, container_id: str) -> bool: - """Stop a container.""" - pass - - @abstractmethod - async def remove_container(self, container_id: str) -> bool: - """Remove a container.""" - pass - - @abstractmethod - async def get_container_stats(self, container_id: str) -> Optional[Dict[str, Any]]: - """Get container resource usage statistics.""" - pass diff --git a/src/services/metrics.py b/src/services/metrics.py index 33090a7..0c90284 100644 --- a/src/services/metrics.py +++ b/src/services/metrics.py @@ -1,468 +1,914 @@ -"""Metrics collection service for monitoring API usage and performance.""" +"""Unified metrics service combining in-memory counters, SQLite persistence, +and container pool event tracking. 
+ +Replaces the previous three-service architecture: +- MetricsCollector (in-memory + Redis persistence) +- DetailedMetricsService (Redis per-key/per-language) +- SQLiteMetricsService (SQLite long-term storage) + +Redis is no longer used for metrics storage. +""" -# Standard library imports import asyncio import time -from collections import defaultdict, deque +from collections import defaultdict from dataclasses import dataclass, field -from datetime import datetime, timezone -from enum import Enum -from typing import Dict, Any, Optional, List +from datetime import datetime, timedelta, timezone +from pathlib import Path +from typing import Any, Dict, List, Optional -# Third-party imports -import redis.asyncio as redis +import aiosqlite import structlog -# Local application imports from ..config import settings +from ..models.metrics import DetailedExecutionMetrics logger = structlog.get_logger(__name__) - -class MetricType(str, Enum): - """Metric type enumeration.""" - - COUNTER = "counter" - GAUGE = "gauge" - HISTOGRAM = "histogram" - TIMER = "timer" - - -@dataclass -class MetricPoint: - """Individual metric data point.""" - - name: str - value: float - timestamp: datetime - labels: Dict[str, str] = field(default_factory=dict) - metric_type: MetricType = MetricType.GAUGE - - -@dataclass -class ExecutionMetrics: - """Execution-specific metrics.""" - - execution_id: str - session_id: str - language: str - status: str - execution_time_ms: float - memory_peak_mb: Optional[float] = None - cpu_time_ms: Optional[float] = None - exit_code: Optional[int] = None - file_count: int = 0 - output_size_bytes: int = 0 - timestamp: datetime = field(default_factory=lambda: datetime.now(timezone.utc)) +# SQLite schema -- identical to the previous sqlite_metrics.py +SCHEMA_SQL = """ +-- Individual execution records (90-day retention by default) +CREATE TABLE IF NOT EXISTS executions ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + execution_id TEXT NOT NULL UNIQUE, + session_id TEXT NOT 
NULL, + api_key_hash TEXT NOT NULL, + user_id TEXT, + entity_id TEXT, + language TEXT NOT NULL, + status TEXT NOT NULL, + execution_time_ms REAL NOT NULL, + memory_peak_mb REAL, + cpu_time_ms REAL, + container_source TEXT, + repl_mode INTEGER DEFAULT 0, + files_uploaded INTEGER DEFAULT 0, + files_generated INTEGER DEFAULT 0, + output_size_bytes INTEGER DEFAULT 0, + state_size_bytes INTEGER, + created_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP +); + +-- Daily aggregates (1-year retention by default) +CREATE TABLE IF NOT EXISTS daily_aggregates ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + date DATE NOT NULL, + api_key_hash TEXT, + language TEXT, + execution_count INTEGER DEFAULT 0, + success_count INTEGER DEFAULT 0, + failure_count INTEGER DEFAULT 0, + timeout_count INTEGER DEFAULT 0, + total_execution_time_ms REAL DEFAULT 0, + total_memory_mb REAL DEFAULT 0, + pool_hits INTEGER DEFAULT 0, + pool_misses INTEGER DEFAULT 0, + created_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP, + UNIQUE(date, api_key_hash, language) +); + +-- Hourly activity for heatmap (90-day retention) +CREATE TABLE IF NOT EXISTS hourly_activity ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + date DATE NOT NULL, + hour INTEGER NOT NULL, + day_of_week INTEGER NOT NULL, + api_key_hash TEXT, + execution_count INTEGER DEFAULT 0, + success_count INTEGER DEFAULT 0, + avg_execution_time_ms REAL, + UNIQUE(date, hour, api_key_hash) +); + +-- Indexes for efficient querying +CREATE INDEX IF NOT EXISTS idx_executions_created_at ON executions(created_at); +CREATE INDEX IF NOT EXISTS idx_executions_api_key_hash ON executions(api_key_hash); +CREATE INDEX IF NOT EXISTS idx_executions_language ON executions(language); +CREATE INDEX IF NOT EXISTS idx_executions_status ON executions(status); +CREATE INDEX IF NOT EXISTS idx_executions_composite ON executions(created_at, api_key_hash, language); + +CREATE INDEX IF NOT EXISTS idx_daily_date ON daily_aggregates(date); +CREATE INDEX IF NOT EXISTS idx_daily_api_key ON 
daily_aggregates(api_key_hash); +CREATE INDEX IF NOT EXISTS idx_daily_language ON daily_aggregates(language); + +CREATE INDEX IF NOT EXISTS idx_hourly_date ON hourly_activity(date); +CREATE INDEX IF NOT EXISTS idx_hourly_dow_hour ON hourly_activity(day_of_week, hour); +""" @dataclass -class APIMetrics: - """API request metrics.""" +class APIRequestMetrics: + """Lightweight API request metrics for in-memory tracking.""" endpoint: str method: str status_code: int response_time_ms: float - request_size_bytes: int = 0 - response_size_bytes: int = 0 - user_agent: Optional[str] = None timestamp: datetime = field(default_factory=lambda: datetime.now(timezone.utc)) -class MetricsCollector: - """In-memory metrics collector with Redis persistence.""" +class MetricsService: + """Unified metrics service. + + Combines: + - In-memory counters for fast health-check responses + - SQLite persistence for dashboard queries and long-term analytics + - Event hooks for container pool metrics + """ def __init__(self): - """Initialize metrics collector.""" - self._redis_client: Optional[redis.Redis] = None - self._metrics_buffer: deque = deque(maxlen=10000) # Buffer for recent metrics + # In-memory counters (for /metrics and /health endpoints) + self._start_time = time.time() self._counters: Dict[str, float] = defaultdict(float) - self._gauges: Dict[str, float] = {} - self._histograms: Dict[str, List[float]] = defaultdict(list) - self._timers: Dict[str, List[float]] = defaultdict(list) + self._execution_times: List[float] = [] + self._api_response_times: List[float] = [] - # Aggregated statistics self._execution_stats = { "total_executions": 0, "successful_executions": 0, "failed_executions": 0, "timeout_executions": 0, - "total_execution_time_ms": 0, - "total_memory_usage_mb": 0, + "total_execution_time_ms": 0.0, "language_counts": defaultdict(int), - "hourly_executions": defaultdict(int), } self._api_stats = { "total_requests": 0, "successful_requests": 0, "error_requests": 0, - 
"total_response_time_ms": 0, + "total_response_time_ms": 0.0, "endpoint_counts": defaultdict(int), "status_code_counts": defaultdict(int), - "hourly_requests": defaultdict(int), } - # Background task for metrics persistence - self._persistence_task: Optional[asyncio.Task] = None - self._persistence_interval = 60 # Persist metrics every 60 seconds + # Pool stats (in-memory, updated via event handlers) + self._pool_stats = { + "total_acquisitions": 0, + "pool_hits": 0, + "pool_misses": 0, + "exhaustion_events": 0, + "total_acquire_time_ms": 0.0, + } + + # SQLite state + self._db: Optional[aiosqlite.Connection] = None + self._write_queue: asyncio.Queue = asyncio.Queue() + self._writer_task: Optional[asyncio.Task] = None + self._aggregation_task: Optional[asyncio.Task] = None + self._cleanup_task: Optional[asyncio.Task] = None + self._running = False + self._batch_size = 100 + self._flush_interval = 5.0 + + # ------------------------------------------------------------------ + # Lifecycle + # ------------------------------------------------------------------ async def start(self) -> None: - """Start the metrics collector.""" - try: - # Use shared connection pool - from ..core.pool import redis_pool + """Start the metrics service (SQLite + background tasks).""" + if self._running: + return - self._redis_client = redis_pool.get_client() + self._start_time = time.time() - # Test Redis connection with timeout - await asyncio.wait_for(self._redis_client.ping(), timeout=3.0) # type: ignore[arg-type] + if not settings.sqlite_metrics_enabled: + self._running = True + logger.info("Metrics service started (in-memory only, SQLite disabled)") + return - # Load existing metrics from Redis - await self._load_metrics_from_redis() + try: + db_dir = Path(settings.sqlite_metrics_db_path).parent + db_dir.mkdir(parents=True, exist_ok=True) - # Start background persistence task - self._persistence_task = asyncio.create_task(self._persistence_loop()) + self._db = await 
aiosqlite.connect(settings.sqlite_metrics_db_path) + self._db.row_factory = aiosqlite.Row - logger.info("Metrics collector started with Redis persistence") + await self._db.execute("PRAGMA journal_mode=WAL") + await self._db.execute("PRAGMA synchronous=NORMAL") + await self._db.execute("PRAGMA cache_size=10000") + await self._db.executescript(SCHEMA_SQL) + await self._db.commit() - except asyncio.TimeoutError: - logger.warning( - "Redis connection timed out - metrics collector will run without persistence" + self._running = True + + self._writer_task = asyncio.create_task(self._batch_writer()) + self._aggregation_task = asyncio.create_task(self._aggregation_loop()) + self._cleanup_task = asyncio.create_task(self._cleanup_loop()) + + logger.info( + "Metrics service started", + db_path=settings.sqlite_metrics_db_path, ) - self._redis_client = None except Exception as e: + self._running = True # still run in-memory mode logger.warning( - "Failed to connect to Redis - metrics collector will run without persistence", + "SQLite init failed, metrics service running in-memory only", error=str(e), ) - self._redis_client = None - - # Always start the metrics collector, even without Redis - logger.info( - "Metrics collector started (in-memory only)" - if not self._redis_client - else "Metrics collector started" - ) async def stop(self) -> None: - """Stop the metrics collector.""" - try: - # Stop persistence task - if self._persistence_task and not self._persistence_task.done(): - self._persistence_task.cancel() - try: - await asyncio.wait_for(self._persistence_task, timeout=2.0) - except (asyncio.CancelledError, asyncio.TimeoutError): - logger.info( - "Persistence task cancelled or timed out during shutdown" - ) + """Stop the metrics service and flush pending writes.""" + if not self._running: + return - # Final metrics persistence with timeout to avoid hanging - try: - await asyncio.wait_for(self._persist_metrics_to_redis(), timeout=3.0) - except asyncio.TimeoutError: - 
logger.warning("Metrics persistence timed out during shutdown") - except Exception as e: - logger.warning( - "Failed to persist final metrics during shutdown", error=str(e) - ) + self._running = False - # Close Redis connection - if self._redis_client: + for task in [self._writer_task, self._aggregation_task, self._cleanup_task]: + if task: + task.cancel() try: - await asyncio.wait_for(self._redis_client.close(), timeout=1.0) - except asyncio.TimeoutError: - logger.warning("Redis connection close timed out during shutdown") - except Exception as e: - logger.warning( - f"Error closing Redis connection during shutdown: {e}" - ) + await task + except asyncio.CancelledError: + pass - logger.info("Metrics collector stopped") + await self._flush_queue() - except Exception as e: - logger.error("Error stopping metrics collector", error=str(e)) + if self._db: + await self._db.close() + self._db = None + + logger.info("Metrics service stopped") - def record_execution_metrics(self, metrics: ExecutionMetrics) -> None: - """Record code execution metrics.""" + def register_event_handlers(self) -> None: + """Register event handlers for container pool metrics.""" try: - # Add to buffer - self._metrics_buffer.append(metrics) - - # Update counters - self._counters["executions_total"] += 1 - self._counters[f"executions_by_language.{metrics.language}"] += 1 - self._counters[f"executions_by_status.{metrics.status}"] += 1 - - # Update execution statistics - self._execution_stats["total_executions"] += 1 - - if metrics.status == "completed": - self._execution_stats["successful_executions"] += 1 - elif metrics.status == "failed": - self._execution_stats["failed_executions"] += 1 - elif metrics.status == "timeout": - self._execution_stats["timeout_executions"] += 1 - - self._execution_stats[ - "total_execution_time_ms" - ] += metrics.execution_time_ms - self._execution_stats["language_counts"][metrics.language] += 1 - - if metrics.memory_peak_mb: - 
self._execution_stats["total_memory_usage_mb"] += metrics.memory_peak_mb - - # Update hourly statistics - hour_key = metrics.timestamp.strftime("%Y-%m-%d-%H") - self._execution_stats["hourly_executions"][hour_key] += 1 - - # Update histograms - self._histograms["execution_time_ms"].append(metrics.execution_time_ms) - if metrics.memory_peak_mb: - self._histograms["memory_usage_mb"].append(metrics.memory_peak_mb) - - # Keep histogram size manageable - if len(self._histograms["execution_time_ms"]) > 1000: - self._histograms["execution_time_ms"] = self._histograms[ - "execution_time_ms" - ][-500:] - if len(self._histograms["memory_usage_mb"]) > 1000: - self._histograms["memory_usage_mb"] = self._histograms[ - "memory_usage_mb" - ][-500:] - - # Update gauges - self._gauges["avg_execution_time_ms"] = ( - self._execution_stats["total_execution_time_ms"] - / self._execution_stats["total_executions"] + from ..core.events import ( + event_bus, + ContainerAcquiredFromPool, + ContainerCreatedFresh, + PoolExhausted, ) - if self._execution_stats["total_memory_usage_mb"] > 0: - successful_with_memory = sum( - 1 - for m in self._metrics_buffer - if isinstance(m, ExecutionMetrics) and m.memory_peak_mb - ) - if successful_with_memory > 0: - self._gauges["avg_memory_usage_mb"] = ( - self._execution_stats["total_memory_usage_mb"] - / successful_with_memory - ) + @event_bus.subscribe(ContainerAcquiredFromPool) + async def handle_pool_hit(event: ContainerAcquiredFromPool): + self._pool_stats["pool_hits"] += 1 + self._pool_stats["total_acquisitions"] += 1 + self._pool_stats["total_acquire_time_ms"] += event.acquire_time_ms - except Exception as e: - logger.error("Failed to record execution metrics", error=str(e)) + @event_bus.subscribe(ContainerCreatedFresh) + async def handle_pool_miss(event: ContainerCreatedFresh): + if event.reason in ("pool_empty", "pool_disabled"): + self._pool_stats["pool_misses"] += 1 + self._pool_stats["total_acquisitions"] += 1 - def record_api_metrics(self, 
metrics: APIMetrics) -> None: - """Record API request metrics.""" - try: - # Add to buffer - self._metrics_buffer.append(metrics) - - # Update counters - self._counters["api_requests_total"] += 1 - self._counters[f"api_requests_by_endpoint.{metrics.endpoint}"] += 1 - self._counters[f"api_requests_by_method.{metrics.method}"] += 1 - self._counters[f"api_requests_by_status.{metrics.status_code}"] += 1 - - # Update API statistics - self._api_stats["total_requests"] += 1 - - if 200 <= metrics.status_code < 400: - self._api_stats["successful_requests"] += 1 - else: - self._api_stats["error_requests"] += 1 - - self._api_stats["total_response_time_ms"] += metrics.response_time_ms - self._api_stats["endpoint_counts"][metrics.endpoint] += 1 - self._api_stats["status_code_counts"][metrics.status_code] += 1 - - # Update hourly statistics - hour_key = metrics.timestamp.strftime("%Y-%m-%d-%H") - self._api_stats["hourly_requests"][hour_key] += 1 - - # Update histograms - self._histograms["api_response_time_ms"].append(metrics.response_time_ms) - - # Keep histogram size manageable - if len(self._histograms["api_response_time_ms"]) > 1000: - self._histograms["api_response_time_ms"] = self._histograms[ - "api_response_time_ms" - ][-500:] - - # Update gauges - self._gauges["avg_api_response_time_ms"] = ( - self._api_stats["total_response_time_ms"] - / self._api_stats["total_requests"] - ) - - self._gauges["api_success_rate"] = ( - self._api_stats["successful_requests"] - / self._api_stats["total_requests"] - ) * 100 + @event_bus.subscribe(PoolExhausted) + async def handle_pool_exhaustion(event: PoolExhausted): + self._pool_stats["exhaustion_events"] += 1 + logger.info("Registered pool event handlers for metrics") except Exception as e: - logger.error("Failed to record API metrics", error=str(e)) + logger.warning("Failed to register pool event handlers", error=str(e)) + + # ------------------------------------------------------------------ + # Recording methods + # 
------------------------------------------------------------------ + + async def record_execution(self, metrics: DetailedExecutionMetrics) -> None: + """Record an execution -- updates in-memory counters and queues SQLite write.""" + # Update in-memory counters + self._counters["executions_total"] += 1 + self._counters[f"executions_by_language.{metrics.language}"] += 1 + self._counters[f"executions_by_status.{metrics.status}"] += 1 + + stats = self._execution_stats + stats["total_executions"] += 1 + stats["total_execution_time_ms"] += metrics.execution_time_ms + stats["language_counts"][metrics.language] += 1 + + if metrics.status == "completed": + stats["successful_executions"] += 1 + elif metrics.status == "failed": + stats["failed_executions"] += 1 + elif metrics.status == "timeout": + stats["timeout_executions"] += 1 + + # Track execution times for percentiles (keep bounded) + self._execution_times.append(metrics.execution_time_ms) + if len(self._execution_times) > 1000: + self._execution_times = self._execution_times[-500:] + + # Queue for SQLite persistence + if self._running and self._db is not None: + await self._write_queue.put(metrics) + + def record_api_request(self, metrics: APIRequestMetrics) -> None: + """Record an API request (in-memory only, no persistence needed).""" + self._counters["api_requests_total"] += 1 + self._counters[f"api_requests_by_endpoint.{metrics.endpoint}"] += 1 + self._counters[f"api_requests_by_status.{metrics.status_code}"] += 1 + + api = self._api_stats + api["total_requests"] += 1 + api["total_response_time_ms"] += metrics.response_time_ms + api["endpoint_counts"][metrics.endpoint] += 1 + api["status_code_counts"][metrics.status_code] += 1 + + if 200 <= metrics.status_code < 400: + api["successful_requests"] += 1 + else: + api["error_requests"] += 1 + + self._api_response_times.append(metrics.response_time_ms) + if len(self._api_response_times) > 1000: + self._api_response_times = self._api_response_times[-500:] + + # 
------------------------------------------------------------------ + # In-memory query methods (used by /metrics and /health endpoints) + # ------------------------------------------------------------------ def get_execution_statistics(self) -> Dict[str, Any]: - """Get execution statistics summary.""" - stats = dict(self._execution_stats) - - # Convert defaultdicts to regular dicts - stats["language_counts"] = dict(stats["language_counts"]) - stats["hourly_executions"] = dict(stats["hourly_executions"]) - - # Add calculated metrics - if stats["total_executions"] > 0: - stats["success_rate"] = ( - stats["successful_executions"] / stats["total_executions"] - ) * 100 - stats["failure_rate"] = ( - stats["failed_executions"] / stats["total_executions"] - ) * 100 - stats["timeout_rate"] = ( - stats["timeout_executions"] / stats["total_executions"] - ) * 100 - - # Add histogram statistics - if ( - "execution_time_ms" in self._histograms - and self._histograms["execution_time_ms"] - ): - times = self._histograms["execution_time_ms"] - stats["execution_time_percentiles"] = { - "p50": self._percentile(times, 50), - "p90": self._percentile(times, 90), - "p95": self._percentile(times, 95), - "p99": self._percentile(times, 99), - } + """Get execution statistics summary (in-memory).""" + stats = { + k: (dict(v) if isinstance(v, defaultdict) else v) + for k, v in self._execution_stats.items() + } - if ( - "memory_usage_mb" in self._histograms - and self._histograms["memory_usage_mb"] - ): - memory = self._histograms["memory_usage_mb"] - stats["memory_usage_percentiles"] = { - "p50": self._percentile(memory, 50), - "p90": self._percentile(memory, 90), - "p95": self._percentile(memory, 95), - "p99": self._percentile(memory, 99), + total = self._execution_stats["total_executions"] + if isinstance(total, (int, float)) and total > 0: + success = self._execution_stats["successful_executions"] + failed = self._execution_stats["failed_executions"] + timed_out = 
self._execution_stats["timeout_executions"] + assert isinstance(success, (int, float)) + assert isinstance(failed, (int, float)) + assert isinstance(timed_out, (int, float)) + stats["success_rate"] = (success / total) * 100 + stats["failure_rate"] = (failed / total) * 100 + stats["timeout_rate"] = (timed_out / total) * 100 + + if self._execution_times: + stats["execution_time_percentiles"] = { + "p50": self._percentile(self._execution_times, 50), + "p90": self._percentile(self._execution_times, 90), + "p95": self._percentile(self._execution_times, 95), + "p99": self._percentile(self._execution_times, 99), } return stats def get_api_statistics(self) -> Dict[str, Any]: - """Get API statistics summary.""" - stats = dict(self._api_stats) - - # Convert defaultdicts to regular dicts - stats["endpoint_counts"] = dict(stats["endpoint_counts"]) - stats["status_code_counts"] = dict(stats["status_code_counts"]) - stats["hourly_requests"] = dict(stats["hourly_requests"]) - - # Add histogram statistics - if ( - "api_response_time_ms" in self._histograms - and self._histograms["api_response_time_ms"] - ): - times = self._histograms["api_response_time_ms"] + """Get API statistics summary (in-memory).""" + stats = { + k: (dict(v) if isinstance(v, defaultdict) else v) + for k, v in self._api_stats.items() + } + + if self._api_response_times: stats["response_time_percentiles"] = { - "p50": self._percentile(times, 50), - "p90": self._percentile(times, 90), - "p95": self._percentile(times, 95), - "p99": self._percentile(times, 99), + "p50": self._percentile(self._api_response_times, 50), + "p90": self._percentile(self._api_response_times, 90), + "p95": self._percentile(self._api_response_times, 95), + "p99": self._percentile(self._api_response_times, 99), } return stats def get_system_metrics(self) -> Dict[str, Any]: - """Get current system metrics.""" + """Get current system metrics (in-memory).""" return { "counters": dict(self._counters), - "gauges": dict(self._gauges), - 
"buffer_size": len(self._metrics_buffer), - "uptime_seconds": time.time() - getattr(self, "_start_time", time.time()), - "last_persistence": getattr(self, "_last_persistence", None), + "gauges": {}, + "buffer_size": self._write_queue.qsize() if self._db else 0, + "uptime_seconds": time.time() - self._start_time, } - def _percentile(self, data: List[float], percentile: float) -> float: - """Calculate percentile of a list of values.""" - if not data: - return 0.0 + def get_pool_stats(self) -> Dict[str, Any]: + """Get container pool statistics (in-memory).""" + total = self._pool_stats["total_acquisitions"] + hit_rate = (self._pool_stats["pool_hits"] / total * 100) if total > 0 else 0.0 + avg_acquire = ( + (self._pool_stats["total_acquire_time_ms"] / total) if total > 0 else 0.0 + ) + return { + "total_acquisitions": total, + "pool_hits": self._pool_stats["pool_hits"], + "pool_misses": self._pool_stats["pool_misses"], + "hit_rate": round(hit_rate, 1), + "avg_acquire_time_ms": round(avg_acquire, 1), + "exhaustion_events": self._pool_stats["exhaustion_events"], + } - sorted_data = sorted(data) - index = (percentile / 100) * (len(sorted_data) - 1) + # ------------------------------------------------------------------ + # SQLite query methods (used by dashboard_metrics.py endpoints) + # ------------------------------------------------------------------ + + async def get_summary_stats( + self, + start: datetime, + end: datetime, + api_key_hash: Optional[str] = None, + ) -> Dict[str, Any]: + """Get summary statistics for stats cards.""" + if not self._db: + return {} + + params: List[Any] = [start.isoformat(), end.isoformat()] + api_key_filter = "" + if api_key_hash: + api_key_filter = "AND api_key_hash = ?" 
+ params.append(api_key_hash) + + cursor = await self._db.execute( + f""" + SELECT + COUNT(*) as total_executions, + SUM(CASE WHEN status = 'completed' THEN 1 ELSE 0 END) as success_count, + SUM(CASE WHEN status = 'failed' THEN 1 ELSE 0 END) as failure_count, + SUM(CASE WHEN status = 'timeout' THEN 1 ELSE 0 END) as timeout_count, + AVG(execution_time_ms) as avg_execution_time_ms, + SUM(CASE WHEN container_source = 'pool_hit' THEN 1 ELSE 0 END) as pool_hits, + SUM(CASE WHEN container_source IN ('pool_hit', 'pool_miss') THEN 1 ELSE 0 END) as pool_total, + COUNT(DISTINCT api_key_hash) as active_api_keys + FROM executions + WHERE created_at >= ? AND created_at <= ? {api_key_filter} + """, + params, + ) + row = await cursor.fetchone() + + if not row or row["total_executions"] == 0: + return { + "total_executions": 0, + "success_rate": 0, + "avg_execution_time_ms": 0, + "pool_hit_rate": 0, + "active_api_keys": 0, + } - if index.is_integer(): - return sorted_data[int(index)] + total = row["total_executions"] + success_rate = (row["success_count"] / total * 100) if total > 0 else 0 + pool_hit_rate = ( + (row["pool_hits"] / row["pool_total"] * 100) if row["pool_total"] > 0 else 0 + ) + + return { + "total_executions": total, + "success_count": row["success_count"] or 0, + "failure_count": row["failure_count"] or 0, + "timeout_count": row["timeout_count"] or 0, + "success_rate": round(success_rate, 1), + "avg_execution_time_ms": round(row["avg_execution_time_ms"] or 0, 1), + "pool_hit_rate": round(pool_hit_rate, 1), + "active_api_keys": row["active_api_keys"] or 0, + } + + async def get_language_usage( + self, + start: datetime, + end: datetime, + api_key_hash: Optional[str] = None, + stack_by_api_key: bool = False, + ) -> Dict[str, Any]: + """Get language usage data for stacked bar chart.""" + if not self._db: + return {"by_language": {}, "by_api_key": {}, "matrix": {}} + + params: List[Any] = [start.isoformat(), end.isoformat()] + api_key_filter = "" + if api_key_hash: + 
api_key_filter = "AND api_key_hash = ?" + params.append(api_key_hash) + + cursor = await self._db.execute( + f""" + SELECT language, COUNT(*) as count + FROM executions + WHERE created_at >= ? AND created_at <= ? {api_key_filter} + GROUP BY language + ORDER BY count DESC + """, + params, + ) + by_language = {row["language"]: row["count"] async for row in cursor} + + if not stack_by_api_key: + return {"by_language": by_language, "by_api_key": {}, "matrix": {}} + + params = [start.isoformat(), end.isoformat()] + cursor = await self._db.execute( + """ + SELECT language, api_key_hash, COUNT(*) as count + FROM executions + WHERE created_at >= ? AND created_at <= ? + GROUP BY language, api_key_hash + ORDER BY language, count DESC + """, + params, + ) + + matrix: Dict[str, Dict[str, int]] = {} + api_keys_seen: Dict[str, int] = {} + + async for row in cursor: + lang = row["language"] + key = row["api_key_hash"] + count = row["count"] + + if lang not in matrix: + matrix[lang] = {} + matrix[lang][key] = count + + if key not in api_keys_seen: + api_keys_seen[key] = 0 + api_keys_seen[key] += count + + return { + "by_language": by_language, + "by_api_key": api_keys_seen, + "matrix": matrix, + } + + async def get_time_series( + self, + start: datetime, + end: datetime, + api_key_hash: Optional[str] = None, + granularity: str = "hour", + ) -> Dict[str, Any]: + """Get execution trend data for line chart.""" + if not self._db: + return { + "timestamps": [], + "executions": [], + "success_rate": [], + "avg_duration": [], + } + + params: List[Any] = [start.isoformat(), end.isoformat()] + api_key_filter = "" + if api_key_hash: + api_key_filter = "AND api_key_hash = ?" 
+ params.append(api_key_hash) + + if granularity == "hour": + time_format = "%Y-%m-%d %H:00" + elif granularity == "day": + time_format = "%Y-%m-%d" else: - lower = sorted_data[int(index)] - upper = sorted_data[int(index) + 1] - return lower + (upper - lower) * (index - int(index)) + time_format = "%Y-%W" + + cursor = await self._db.execute( + f""" + SELECT + strftime('{time_format}', created_at) as period, + COUNT(*) as executions, + SUM(CASE WHEN status = 'completed' THEN 1 ELSE 0 END) as success_count, + AVG(execution_time_ms) as avg_duration + FROM executions + WHERE created_at >= ? AND created_at <= ? {api_key_filter} + GROUP BY period + ORDER BY period + """, + params, + ) + + timestamps = [] + executions = [] + success_rate = [] + avg_duration = [] + + async for row in cursor: + timestamps.append(row["period"]) + executions.append(row["executions"]) + rate = ( + (row["success_count"] / row["executions"] * 100) + if row["executions"] > 0 + else 0 + ) + success_rate.append(round(rate, 1)) + avg_duration.append(round(row["avg_duration"] or 0, 1)) + + return { + "timestamps": timestamps, + "executions": executions, + "success_rate": success_rate, + "avg_duration": avg_duration, + } + + async def get_heatmap_data( + self, + start: datetime, + end: datetime, + api_key_hash: Optional[str] = None, + ) -> Dict[str, Any]: + """Get day-of-week x hour activity matrix for heatmap.""" + if not self._db: + return {"matrix": [[0] * 24 for _ in range(7)], "max_value": 0} + + params: List[Any] = [start.isoformat(), end.isoformat()] + api_key_filter = "" + if api_key_hash: + api_key_filter = "AND api_key_hash = ?" + params.append(api_key_hash) + + cursor = await self._db.execute( + f""" + SELECT + CAST(strftime('%w', created_at) AS INTEGER) as day_of_week, + CAST(strftime('%H', created_at) AS INTEGER) as hour, + COUNT(*) as count + FROM executions + WHERE created_at >= ? AND created_at <= ? 
{api_key_filter} + GROUP BY day_of_week, hour + """, + params, + ) - async def _persistence_loop(self) -> None: - """Background task for persisting metrics to Redis.""" - while True: + matrix = [[0] * 24 for _ in range(7)] + max_value = 0 + + async for row in cursor: + dow = (row["day_of_week"] - 1) % 7 + hour = row["hour"] + count = row["count"] + matrix[dow][hour] = count + max_value = max(max_value, count) + + return {"matrix": matrix, "max_value": max_value} + + async def get_api_keys_list(self) -> List[Dict[str, Any]]: + """Get list of API keys for filter dropdown.""" + if not self._db: + return [] + + cursor = await self._db.execute(""" + SELECT DISTINCT api_key_hash, COUNT(*) as usage_count + FROM executions + GROUP BY api_key_hash + ORDER BY usage_count DESC + LIMIT 50 + """) + + return [ + {"key_hash": row["api_key_hash"], "usage_count": row["usage_count"]} + async for row in cursor + ] + + async def get_top_languages( + self, + start: datetime, + end: datetime, + limit: int = 5, + ) -> List[Dict[str, Any]]: + """Get top languages by execution count.""" + if not self._db: + return [] + + cursor = await self._db.execute( + """ + SELECT language, COUNT(*) as count + FROM executions + WHERE created_at >= ? AND created_at <= ? + GROUP BY language + ORDER BY count DESC + LIMIT ? 
+ """, + (start.isoformat(), end.isoformat(), limit), + ) + + return [ + {"language": row["language"], "count": row["count"]} async for row in cursor + ] + + # ------------------------------------------------------------------ + # SQLite background tasks + # ------------------------------------------------------------------ + + async def _batch_writer(self) -> None: + """Background task that batches writes for efficiency.""" + batch: List[DetailedExecutionMetrics] = [] + + while self._running: try: - await asyncio.sleep(self._persistence_interval) - await self._persist_metrics_to_redis() + try: + item = await asyncio.wait_for( + self._write_queue.get(), timeout=self._flush_interval + ) + batch.append(item) + except asyncio.TimeoutError: + pass + + if len(batch) >= self._batch_size or ( + batch and self._write_queue.empty() + ): + await self._write_batch(batch) + batch = [] except asyncio.CancelledError: - logger.info("Metrics persistence task cancelled") - break + if batch: + await self._write_batch(batch) + raise except Exception as e: - logger.error("Error in metrics persistence loop", error=str(e)) - # Continue the loop even if persistence fails + logger.error("Error in batch writer", error=str(e)) - async def _persist_metrics_to_redis(self) -> None: - """Persist current metrics to Redis.""" - if not self._redis_client: + async def _write_batch(self, batch: List[DetailedExecutionMetrics]) -> None: + """Write a batch of execution records to SQLite.""" + if not batch or not self._db: return try: - # Prepare metrics data - metrics_data = { - "execution_stats": self.get_execution_statistics(), - "api_stats": self.get_api_statistics(), - "system_metrics": self.get_system_metrics(), - "timestamp": datetime.now(timezone.utc).isoformat(), - } - - # Store in Redis with TTL - await self._redis_client.setex( - "metrics:current", 86400, str(metrics_data) # 24 hours TTL + await self._db.executemany( + """ + INSERT OR IGNORE INTO executions ( + execution_id, session_id, 
api_key_hash, user_id, entity_id, + language, status, execution_time_ms, memory_peak_mb, cpu_time_ms, + container_source, repl_mode, files_uploaded, files_generated, + output_size_bytes, state_size_bytes, created_at + ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) + """, + [ + ( + m.execution_id, + m.session_id, + m.api_key_hash[:16] if m.api_key_hash else "unknown", + m.user_id, + m.entity_id, + m.language, + m.status, + m.execution_time_ms, + m.memory_peak_mb, + m.cpu_time_ms, + m.container_source, + 1 if m.repl_mode else 0, + m.files_uploaded, + m.files_generated, + m.output_size_bytes, + m.state_size_bytes, + ( + m.timestamp.isoformat() + if m.timestamp + else datetime.now(timezone.utc).isoformat() + ), + ) + for m in batch + ], ) + await self._db.commit() + logger.debug("Wrote metrics batch", count=len(batch)) + except Exception as e: + logger.error("Failed to write metrics batch", error=str(e)) + + async def _flush_queue(self) -> None: + """Flush all pending writes from the queue.""" + batch: List[DetailedExecutionMetrics] = [] + while not self._write_queue.empty(): + try: + batch.append(self._write_queue.get_nowait()) + except asyncio.QueueEmpty: + break + if batch: + await self._write_batch(batch) - # Store historical data (keep last 24 hours) - hour_key = datetime.now(timezone.utc).strftime("%Y-%m-%d-%H") - await self._redis_client.setex( - f"metrics:hourly:{hour_key}", - 86400 * 7, # 7 days TTL for hourly data - str(metrics_data), + async def _aggregation_loop(self) -> None: + """Periodically aggregate executions into daily summaries.""" + interval = settings.metrics_aggregation_interval_minutes * 60 + + while self._running: + try: + await asyncio.sleep(interval) + await self.run_aggregation() + except asyncio.CancelledError: + raise + except Exception as e: + logger.error("Error in aggregation loop", error=str(e)) + + async def run_aggregation(self) -> None: + """Build daily aggregates from execution records.""" + if not self._db: + return 
+ + try: + yesterday = (datetime.now(timezone.utc) - timedelta(days=1)).date() + + await self._db.execute( + """ + INSERT OR REPLACE INTO daily_aggregates ( + date, api_key_hash, language, + execution_count, success_count, failure_count, timeout_count, + total_execution_time_ms, total_memory_mb, pool_hits, pool_misses + ) + SELECT + DATE(created_at) as date, + api_key_hash, + language, + COUNT(*) as execution_count, + SUM(CASE WHEN status = 'completed' THEN 1 ELSE 0 END) as success_count, + SUM(CASE WHEN status = 'failed' THEN 1 ELSE 0 END) as failure_count, + SUM(CASE WHEN status = 'timeout' THEN 1 ELSE 0 END) as timeout_count, + SUM(execution_time_ms) as total_execution_time_ms, + SUM(COALESCE(memory_peak_mb, 0)) as total_memory_mb, + SUM(CASE WHEN container_source = 'pool_hit' THEN 1 ELSE 0 END) as pool_hits, + SUM(CASE WHEN container_source = 'pool_miss' THEN 1 ELSE 0 END) as pool_misses + FROM executions + WHERE DATE(created_at) <= ? + GROUP BY DATE(created_at), api_key_hash, language + """, + (yesterday.isoformat(),), ) - self._last_persistence = datetime.now(timezone.utc) + await self._db.execute( + """ + INSERT OR REPLACE INTO hourly_activity ( + date, hour, day_of_week, api_key_hash, + execution_count, success_count, avg_execution_time_ms + ) + SELECT + DATE(created_at) as date, + CAST(strftime('%H', created_at) AS INTEGER) as hour, + CAST(strftime('%w', created_at) AS INTEGER) as day_of_week, + api_key_hash, + COUNT(*) as execution_count, + SUM(CASE WHEN status = 'completed' THEN 1 ELSE 0 END) as success_count, + AVG(execution_time_ms) as avg_execution_time_ms + FROM executions + WHERE DATE(created_at) <= ? 
+ GROUP BY DATE(created_at), hour, api_key_hash + """, + (yesterday.isoformat(),), + ) + await self._db.commit() + logger.info("Aggregation completed", up_to_date=yesterday.isoformat()) except Exception as e: - logger.error("Failed to persist metrics to Redis", error=str(e)) + logger.error("Aggregation failed", error=str(e)) + + async def _cleanup_loop(self) -> None: + """Periodically clean up old data based on retention settings.""" + interval = 24 * 60 * 60 - async def _load_metrics_from_redis(self) -> None: - """Load existing metrics from Redis.""" - if not self._redis_client: + while self._running: + try: + await asyncio.sleep(interval) + await self.cleanup_old_data() + except asyncio.CancelledError: + raise + except Exception as e: + logger.error("Error in cleanup loop", error=str(e)) + + async def cleanup_old_data(self) -> None: + """Remove data older than retention periods.""" + if not self._db: return try: - # Load current metrics - current_data = await self._redis_client.get("metrics:current") - if current_data: - # In a full implementation, we would parse and restore the metrics - # For now, just log that we found existing data - logger.info("Found existing metrics data in Redis") + now = datetime.now(timezone.utc) + + exec_cutoff = ( + now - timedelta(days=settings.metrics_execution_retention_days) + ).isoformat() + result = await self._db.execute( + "DELETE FROM executions WHERE created_at < ?", (exec_cutoff,) + ) + exec_deleted = result.rowcount + daily_cutoff = ( + (now - timedelta(days=settings.metrics_daily_retention_days)) + .date() + .isoformat() + ) + result = await self._db.execute( + "DELETE FROM daily_aggregates WHERE date < ?", (daily_cutoff,) + ) + daily_deleted = result.rowcount + + hourly_cutoff = ( + (now - timedelta(days=settings.metrics_execution_retention_days)) + .date() + .isoformat() + ) + result = await self._db.execute( + "DELETE FROM hourly_activity WHERE date < ?", (hourly_cutoff,) + ) + hourly_deleted = result.rowcount + + 
await self._db.commit() + await self._db.execute("VACUUM") + + logger.info( + "Cleanup completed", + executions_deleted=exec_deleted, + daily_deleted=daily_deleted, + hourly_deleted=hourly_deleted, + ) except Exception as e: - logger.error("Failed to load metrics from Redis", error=str(e)) + logger.error("Cleanup failed", error=str(e)) + + # ------------------------------------------------------------------ + # Helpers + # ------------------------------------------------------------------ + + @staticmethod + def _percentile(data: List[float], percentile: float) -> float: + """Calculate percentile of a list of values.""" + if not data: + return 0.0 + sorted_data = sorted(data) + index = (percentile / 100) * (len(sorted_data) - 1) + if index.is_integer(): + return sorted_data[int(index)] + lower = sorted_data[int(index)] + upper = sorted_data[int(index) + 1] + return lower + (upper - lower) * (index - int(index)) -# Global metrics collector instance -metrics_collector = MetricsCollector() +# Global singleton +metrics_service = MetricsService() diff --git a/src/services/orchestrator.py b/src/services/orchestrator.py index ba57110..218c8d0 100644 --- a/src/services/orchestrator.py +++ b/src/services/orchestrator.py @@ -16,7 +16,6 @@ """ import asyncio -import base64 from dataclasses import dataclass from datetime import datetime from typing import Any, Dict, List, Optional @@ -33,10 +32,7 @@ SessionCreate, ExecuteCodeRequest, ValidationError, - ExecutionError, - ResourceNotFoundError, ServiceUnavailableError, - TimeoutError, ) from ..models.errors import ErrorDetail from .interfaces import ( @@ -171,9 +167,6 @@ async def execute( except ( ValidationError, - ExecutionError, - TimeoutError, - ResourceNotFoundError, ServiceUnavailableError, ): raise @@ -235,7 +228,7 @@ async def _get_or_create_session(self, ctx: ExecutionContext) -> str: try: existing = await self.session_service.get_session(request.session_id) if existing and existing.status.value == "active": - 
logger.info( + logger.debug( "Reusing session from request", session_id=request.session_id[:12], ) @@ -256,7 +249,7 @@ async def _get_or_create_session(self, ctx: ExecutionContext) -> str: file_ref.session_id ) if existing and existing.status.value == "active": - logger.info( + logger.debug( "Reusing session from file reference", session_id=file_ref.session_id, ) @@ -277,7 +270,7 @@ async def _get_or_create_session(self, ctx: ExecutionContext) -> str: if entity_sessions: existing = entity_sessions[0] if existing.status.value == "active": - logger.info( + logger.debug( "Reusing session by entity_id", session_id=existing.session_id[:12], entity_id=request.entity_id, @@ -311,9 +304,7 @@ async def _mount_files(self, ctx: ExecutionContext) -> List[Dict[str, Any]]: 2. If no request.files[] but session_id exists, auto-mount ALL session files 3. If neither, return empty list - Also handles restore_state flag for state-file linking: - - If a file has restore_state=True, loads the state associated with that file - - Tracks mounted file references for updating state_hash after execution + Tracks mounted file references for updating state_hash after execution. """ # If explicit files provided, mount those (existing behavior) if ctx.request.files: @@ -328,16 +319,10 @@ async def _mount_files(self, ctx: ExecutionContext) -> List[Dict[str, Any]]: async def _mount_explicit_files( self, ctx: ExecutionContext ) -> List[Dict[str, Any]]: - """Mount explicitly requested files from request.files[]. - - This preserves the original file mounting behavior with restore_state support. 
- """ + """Mount explicitly requested files from request.files[].""" mounted = [] mounted_ids = set() file_refs = [] # Track for state-file linking - restore_state_hash = ( - None # Hash of state to restore (from first restore_state file) - ) for file_ref in ctx.request.files: # Get file info @@ -383,27 +368,9 @@ async def _mount_explicit_files( } ) - # Check for restore_state flag (only for Python, use first file's state) - if ( - file_ref.restore_state - and ctx.request.lang == "py" - and restore_state_hash is None - and file_info.state_hash - ): - restore_state_hash = file_info.state_hash - logger.debug( - "Will restore state from file", - file_id=file_info.file_id, - state_hash=file_info.state_hash[:12], - ) - # Store file refs for later state_hash update ctx.mounted_file_refs = file_refs - # If a file requested state restoration, load that state - if restore_state_hash and settings.state_persistence_enabled: - await self._load_state_by_hash(ctx, restore_state_hash) - return mounted async def _auto_mount_session_files( @@ -418,7 +385,7 @@ async def _auto_mount_session_files( SECURITY: All files are from the current session, so cross-session isolation is maintained. """ - logger.info( + logger.debug( "Auto-mounting all session files", session_id=ctx.session_id[:12] if ctx.session_id else None, ) @@ -458,7 +425,7 @@ async def _auto_mount_session_files( ctx.mounted_file_refs = file_refs if mounted: - logger.info( + logger.debug( "Auto-mounted session files", session_id=ctx.session_id[:12] if ctx.session_id else None, file_count=len(mounted), @@ -467,50 +434,10 @@ async def _auto_mount_session_files( return mounted - async def _load_state_by_hash(self, ctx: ExecutionContext, state_hash: str) -> None: - """Load state by its hash for state-file restoration. - - Tries Redis first, then MinIO cold storage. 
- """ - try: - # Try Redis first - state = await self.state_service.get_state_by_hash(state_hash) - - if ( - not state - and self.state_archival_service - and settings.state_archive_enabled - ): - # Try MinIO cold storage - state = await self.state_archival_service.restore_state_by_hash( - state_hash - ) - - if state: - ctx.initial_state = state - logger.info( - "Restored state from file reference", - session_id=ctx.session_id[:12] if ctx.session_id else "none", - state_hash=state_hash[:12], - state_size=len(state), - ) - else: - logger.warning( - "State not found for hash", - state_hash=state_hash[:12], - ) - except Exception as e: - logger.error( - "Failed to load state by hash", - state_hash=state_hash[:12], - error=str(e), - ) - async def _load_state(self, ctx: ExecutionContext) -> None: """Load previous state from Redis (or MinIO fallback) for Python sessions. Priority order: - 0. State already loaded via restore_state file reference (highest priority) 1. Recently uploaded state via POST /state (client-side cache restore) 2. Redis hot storage (within 2-hour TTL) 3. 
MinIO cold storage (archived state) @@ -521,10 +448,10 @@ async def _load_state(self, ctx: ExecutionContext) -> None: if ctx.request.lang != "py": return - # Skip if state was already loaded via restore_state file reference + # Skip if state was already loaded by another mechanism if ctx.initial_state: logger.debug( - "State already loaded (from file restore_state)", + "State already loaded", session_id=ctx.session_id[:12], ) return @@ -536,7 +463,7 @@ async def _load_state(self, ctx: ExecutionContext) -> None: if ctx.initial_state: # Clear marker so subsequent executions use normal flow await self.state_service.clear_upload_marker(ctx.session_id) - logger.info( + logger.debug( "Using client-uploaded state", session_id=ctx.session_id[:12], state_size=len(ctx.initial_state), @@ -660,7 +587,7 @@ async def _update_mounted_files_content(self, ctx: ExecutionContext) -> None: if not ctx.mounted_files or not ctx.container: return - container_manager = self.execution_service.container_manager + sandbox_manager = self.execution_service.sandbox_manager for file_info in ctx.mounted_files: try: @@ -684,7 +611,7 @@ async def _update_mounted_files_content(self, ctx: ExecutionContext) -> None: # SECURITY: Skip agent-assigned files (uploaded with entity_id) # Agent files are read-only and cannot be modified by user code - file_metadata = await self.file_service._get_file_metadata( + file_metadata = await self.file_service.get_file_metadata( file_session_id, file_id ) if file_metadata and file_metadata.get("is_agent_file") == "1": @@ -697,7 +624,7 @@ async def _update_mounted_files_content(self, ctx: ExecutionContext) -> None: # Read current content from container file_path = f"/mnt/data/{filename}" - content = await container_manager.get_file_content_from_container( + content = sandbox_manager.get_file_content_from_sandbox( ctx.container, file_path ) @@ -783,8 +710,8 @@ async def _execute_code(self, ctx: ExecutionContext) -> Any: capture_state=use_state, ) - logger.info( - "Code 
execution completed", + logger.debug( + "Code execution completed in sandbox", session_id=ctx.session_id, status=execution.status.value, container_id=( @@ -837,7 +764,7 @@ async def _handle_generated_files(self, ctx: ExecutionContext) -> List[FileRef]: session_id=ctx.session_id, # Include for cross-message persistence ) ) - logger.info( + logger.debug( "Generated file stored", session_id=ctx.session_id, filename=filename, @@ -853,35 +780,20 @@ async def _handle_generated_files(self, ctx: ExecutionContext) -> List[FileRef]: return generated async def _get_file_from_container(self, container: Any, file_path: str) -> bytes: - """Get file content from the execution container. + """Get file content from the execution sandbox. Args: - container: Docker container object (passed directly, no session lookup needed) - file_path: Path to file inside container + container: Sandbox object (passed directly, no session lookup needed) + file_path: Path to file inside sandbox """ - import tempfile - import os - if not container: - return f"# Container not found for file: {file_path}\n".encode("utf-8") + return f"# Sandbox not found for file: {file_path}\n".encode("utf-8") - container_manager = self.execution_service.container_manager - - with tempfile.NamedTemporaryFile(delete=False) as tmp_file: - temp_path = tmp_file.name - - try: - success = await container_manager.copy_from_container( - container, file_path, temp_path - ) - if success: - with open(temp_path, "rb") as f: - return f.read() - else: - return f"# Failed to retrieve file: {file_path}\n".encode("utf-8") - finally: - if os.path.exists(temp_path): - os.unlink(temp_path) + sandbox_manager = self.execution_service.sandbox_manager + content = sandbox_manager.get_file_content_from_sandbox(container, file_path) + if content is not None: + return content + return f"# Failed to retrieve file: {file_path}\n".encode("utf-8") def _extract_outputs(self, ctx: ExecutionContext) -> None: """Extract stdout and stderr from execution 
outputs.""" @@ -910,31 +822,12 @@ def _extract_outputs(self, ctx: ExecutionContext) -> None: ctx.stdout += "\n" def _build_response(self, ctx: ExecutionContext) -> ExecResponse: - """Build the LibreChat-compatible response with state info.""" - # Compute state info for Python executions - has_state = False - state_size = None - state_hash = None - - if ctx.new_state and ctx.request.lang == "py": - has_state = True - # new_state is base64-encoded, decode to get raw bytes for size and hash - try: - raw_bytes = base64.b64decode(ctx.new_state) - state_size = len(raw_bytes) - state_hash = self.state_service.compute_hash(raw_bytes) - except Exception: - # Fallback to base64 string length if decode fails - state_size = len(ctx.new_state) - + """Build the LibreChat-compatible response.""" return ExecResponse( session_id=ctx.session_id, files=ctx.generated_files or [], stdout=ctx.stdout, stderr=ctx.stderr, - has_state=has_state, - state_size=state_size, - state_hash=state_hash, ) async def _cleanup(self, ctx: ExecutionContext) -> None: @@ -943,34 +836,39 @@ async def _cleanup(self, ctx: ExecutionContext) -> None: - Destroys the container in background (non-blocking for faster response) - Publishes ExecutionCompleted event for metrics """ - # Destroy container in background for faster response + # Destroy sandbox in background for faster response. + # Use sandbox_pool.destroy_sandbox() which kills the REPL process + # AND removes the directory. Without this, REPL processes leak. 
if ctx.container: try: - container_manager = self.execution_service.container_manager - container_id = ( + sandbox_id = ( ctx.container.id[:12] if hasattr(ctx.container, "id") else "unknown" ) - logger.debug( - "Scheduling container destruction", container_id=container_id - ) + logger.debug("Scheduling sandbox destruction", sandbox_id=sandbox_id) + + # Use pool destroy (kills process + removes dir) or manager (dir only) + sandbox_pool = getattr(self.execution_service, "sandbox_pool", None) + sandbox_manager = self.execution_service.sandbox_manager - # Fire-and-forget: destroy container in background async def destroy_background(): try: - await container_manager.force_kill_container(ctx.container) - logger.debug("Container destroyed", container_id=container_id) + if sandbox_pool: + await sandbox_pool.destroy_sandbox(ctx.container) + else: + sandbox_manager.destroy_sandbox(ctx.container) + logger.debug("Sandbox destroyed", sandbox_id=sandbox_id) except Exception as e: logger.warning( - "Background container destruction failed", - container_id=container_id, + "Background sandbox destruction failed", + sandbox_id=sandbox_id, error=str(e), ) asyncio.create_task(destroy_background()) except Exception as e: - logger.error("Failed to schedule container destruction", error=str(e)) + logger.error("Failed to schedule sandbox destruction", error=str(e)) else: - logger.debug("No container in context to destroy") + logger.debug("No sandbox in context to destroy") # Publish event for metrics try: @@ -1017,9 +915,7 @@ async def _record_detailed_metrics( status: Execution status (completed, failed, timeout) """ try: - from .detailed_metrics import get_detailed_metrics_service - - service = get_detailed_metrics_service() + from .metrics import metrics_service # Get memory usage if available memory_peak_mb = None @@ -1040,7 +936,7 @@ async def _record_detailed_metrics( repl_mode = ( ctx.request.lang == "py" and settings.repl_enabled - and settings.container_pool_enabled + and 
settings.sandbox_pool_enabled ) metrics = DetailedExecutionMetrics( @@ -1063,7 +959,7 @@ async def _record_detailed_metrics( state_size_bytes=state_size, ) - await service.record_execution(metrics) + await metrics_service.record_execution(metrics) except Exception as e: logger.warning("Failed to record detailed metrics", error=str(e)) diff --git a/src/services/sandbox/__init__.py b/src/services/sandbox/__init__.py new file mode 100644 index 0000000..f843137 --- /dev/null +++ b/src/services/sandbox/__init__.py @@ -0,0 +1,21 @@ +"""Sandbox management services using nsjail. + +This package provides nsjail-based sandbox management functionality: +- nsjail.py: SandboxInfo dataclass and NsjailConfig builder +- executor.py: Command execution in sandboxes +- repl_executor.py: REPL-based execution for pre-warmed Python sandboxes +- manager.py: Sandbox lifecycle management +- pool.py: Pre-warmed sandbox pool +""" + +from .manager import SandboxManager +from .executor import SandboxExecutor +from .repl_executor import SandboxREPLExecutor +from .pool import SandboxPool + +__all__ = [ + "SandboxManager", + "SandboxExecutor", + "SandboxREPLExecutor", + "SandboxPool", +] diff --git a/src/services/sandbox/executor.py b/src/services/sandbox/executor.py new file mode 100644 index 0000000..c4d3d9d --- /dev/null +++ b/src/services/sandbox/executor.py @@ -0,0 +1,274 @@ +"""Command execution in nsjail sandboxes. + +Uses asyncio subprocess to invoke nsjail for isolated code execution. +""" + +import asyncio +import os +import re +import shlex +import signal +from typing import Dict, List, Optional, Tuple + +import structlog + +from ...config import settings +from .nsjail import NsjailConfig, SandboxInfo + +logger = structlog.get_logger(__name__) + + +class SandboxExecutor: + """Handles command execution inside nsjail sandboxes. + + Spawns an nsjail subprocess for each command execution. 
+ """ + + def __init__(self, nsjail_config: NsjailConfig): + """Initialize executor with nsjail config. + + Args: + nsjail_config: Configuration for building nsjail arguments + """ + self._nsjail_config = nsjail_config + + async def execute_command( + self, + sandbox_info: SandboxInfo, + command: str, + timeout: int = None, + language: Optional[str] = None, + stdin_payload: Optional[str] = None, + ) -> Tuple[int, str, str]: + """Execute a command in the sandbox via nsjail. + + Args: + sandbox_info: Sandbox to execute in + command: Command string to execute + timeout: Maximum execution time in seconds + language: Programming language code + stdin_payload: Optional stdin data + + Returns: + Tuple of (exit_code, stdout, stderr) + """ + if timeout is None: + timeout = settings.max_execution_time + + # Build sanitized environment + sanitized_env = self._build_sanitized_env(language) + + # Wrap the command in a shell for consistent behavior + # Use absolute path since nsjail uses execve (no PATH search) + shell_command = ["/bin/sh", "-c", command] + + # Build nsjail arguments + network = False # nsjail sandboxes run without network access + nsjail_args = self._nsjail_config.build_args( + sandbox_dir=str(sandbox_info.data_dir), + command=shell_command, + language=sandbox_info.language, + timeout=timeout, + network=network, + env=sanitized_env, + ) + + try: + # Wrap nsjail in unshare+mount to bind sandbox_dir to /mnt/data. + # This gives each execution its own mount namespace so /mnt/data + # resolves to the correct sandbox dir (concurrent-safe). + nsjail_cmd = " ".join( + shlex.quote(str(a)) for a in [settings.nsjail_binary] + nsjail_args + ) + # BUG-003: Mask /proc for most languages. + # Java and Rust need /proc/self/exe to locate shared libraries + # (JVM needs libjli.so, rustc needs its own binary path). + # For these languages, /proc remains accessible (known limitation). 
+ lang = sandbox_info.language.lower().strip() + if lang in ("java", "rs"): + proc_mask = "" + else: + proc_mask = "mount --bind /tmp/empty_proc /proc && " + + wrapper_cmd = ( + # Bind sandbox dir to /mnt/data (before hiding sandboxes dir) + f"mount --bind {shlex.quote(str(sandbox_info.data_dir))} /mnt/data && " + # BUG-001: Hide other sessions' sandbox directories + f"mount -t tmpfs -o size=1k tmpfs /var/lib/code-interpreter/sandboxes && " + # BUG-002: Hide metrics database + f"mount -t tmpfs -o size=1k tmpfs /app/data && " + # BUG-004: Hide log directory + f"mount -t tmpfs -o size=1k tmpfs /var/log && " + # BUG-005: Hide SSL certs and application source + f"mount -t tmpfs -o size=1k tmpfs /app/ssl && " + f"mount -t tmpfs -o size=1k tmpfs /app/dashboard && " + f"mount -t tmpfs -o size=1k tmpfs /app/src && " + # BUG-003: Hide /proc (except Java which needs /proc/self/exe) + f"{proc_mask}" + # Execute nsjail + f"{nsjail_cmd}" + ) + + # Create subprocess via unshare --mount for per-process mount namespace + proc = await asyncio.create_subprocess_exec( + "unshare", + "--mount", + "--", + "/bin/sh", + "-c", + wrapper_cmd, + stdin=asyncio.subprocess.PIPE if stdin_payload else None, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + start_new_session=True, # New process group for clean cleanup + ) + + # Communicate with timeout + stdin_data = stdin_payload.encode("utf-8") if stdin_payload else None + try: + stdout_bytes, stderr_bytes = await asyncio.wait_for( + proc.communicate(input=stdin_data), + timeout=timeout + 5, # Grace period beyond nsjail's own limit + ) + except asyncio.TimeoutError: + try: + os.killpg(proc.pid, signal.SIGKILL) + except (ProcessLookupError, PermissionError): + proc.kill() + await proc.wait() + logger.warning( + "Sandbox execution timed out", + sandbox_id=sandbox_info.sandbox_id[:12], + timeout=timeout, + ) + return 124, "", f"Execution timed out after {timeout} seconds" + + # Sanitize output + stdout = 
self._sanitize_output(stdout_bytes) if stdout_bytes else "" + stderr = self._sanitize_output(stderr_bytes) if stderr_bytes else "" + + return proc.returncode or 0, stdout, stderr + + except Exception as e: + logger.error( + "Sandbox execution failed", + sandbox_id=sandbox_info.sandbox_id[:12], + error=str(e), + ) + return 1, "", f"Execution failed: {str(e)}" + + def _build_sanitized_env(self, language: Optional[str]) -> Dict[str, str]: + """Build environment whitelist for execution.""" + normalized_lang = (language or "").lower().strip() + + env_whitelist: Dict[str, str] = { + "PATH": "/usr/local/bin:/usr/bin:/bin", + "HOME": "/tmp", + "TMPDIR": "/tmp", + } + + if normalized_lang in {"py", "python"}: + env_whitelist.update( + { + "PYTHONUNBUFFERED": "1", + "PYTHONDONTWRITEBYTECODE": "1", + "PYTHONPATH": "/mnt/data", + "MPLCONFIGDIR": "/tmp/mplconfig", + "XDG_CACHE_HOME": "/tmp/.cache", + "MPLBACKEND": "Agg", + } + ) + elif normalized_lang in {"js", "ts"}: + env_whitelist.update( + { + "NODE_PATH": "/usr/local/lib/node_modules", + } + ) + elif normalized_lang == "java": + env_whitelist.update( + { + "CLASSPATH": ".:/opt/java/lib/*", + "JAVA_OPTS": "-Xmx512m -Xms128m", + "PATH": "/opt/java/openjdk/bin:/usr/local/bin:/usr/bin:/bin", + } + ) + elif normalized_lang == "go": + env_whitelist.update( + { + "GO111MODULE": "on", + "GOROOT": "/usr/local/go", + "GOPROXY": "https://proxy.golang.org,direct", + "GOSUMDB": "sum.golang.org", + "GOCACHE": "/tmp/go-build", + "PATH": "/usr/local/go/bin:/usr/local/bin:/usr/bin:/bin", + } + ) + elif normalized_lang in {"c", "cpp"}: + env_whitelist.update( + { + "CC": "gcc", + "CXX": "g++", + "PKG_CONFIG_PATH": "/usr/lib/x86_64-linux-gnu/pkgconfig", + } + ) + elif normalized_lang == "php": + env_whitelist.update( + { + "PHP_INI_SCAN_DIR": "/usr/local/etc/php/conf.d", + "COMPOSER_HOME": "/opt/composer/global", + "PATH": "/opt/composer/global/vendor/bin:/usr/local/bin:/usr/bin:/bin", + } + ) + elif normalized_lang == "rs": + 
env_whitelist.update( + { + "CARGO_HOME": "/usr/local/cargo", + "RUSTUP_HOME": "/usr/local/rustup", + "PATH": "/usr/local/cargo/bin:/usr/local/bin:/usr/bin:/bin", + } + ) + elif normalized_lang == "r": + env_whitelist.update( + { + "R_LIBS_USER": "/usr/local/lib/R/site-library", + } + ) + elif normalized_lang == "f90": + env_whitelist.update( + { + "FORTRAN_COMPILER": "gfortran", + "FC": "gfortran", + "F77": "gfortran", + "F90": "gfortran", + "F95": "gfortran", + } + ) + + return env_whitelist + + def _escape_env_value(self, value: str) -> str: + """Escape env var values for shell.""" + try: + safe = str(value).replace("'", "'\\''") + return f"'{safe}'" + except Exception: + return "''" + + def _sanitize_output(self, output: bytes) -> str: + """Sanitize command output for security.""" + try: + output_str = output.decode("utf-8", errors="replace") + + max_output_size = 1024 * 1024 # 1MB limit + if len(output_str) > max_output_size: + output_str = ( + output_str[:max_output_size] + + "\n[Output truncated - size limit exceeded]" + ) + + output_str = re.sub(r"[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]", "", output_str) + return output_str + + except Exception as e: + logger.error(f"Failed to sanitize output: {e}") + return "[Output sanitization failed]" diff --git a/src/services/sandbox/manager.py b/src/services/sandbox/manager.py new file mode 100644 index 0000000..c9ebf2e --- /dev/null +++ b/src/services/sandbox/manager.py @@ -0,0 +1,268 @@ +"""Sandbox lifecycle management using nsjail.""" + +import os +import shutil +import uuid +from datetime import datetime +from pathlib import Path +from typing import Optional, Tuple + +import structlog + +from ...config import settings +from ...config.languages import get_user_id_for_language +from .nsjail import NsjailConfig, SandboxInfo +from .executor import SandboxExecutor + +logger = structlog.get_logger(__name__) + + +class SandboxManager: + """Manages nsjail sandbox lifecycle operations. 
def create_sandbox(
    self,
    session_id: str,
    language: str,
    repl_mode: bool = False,
) -> SandboxInfo:
    """Create a new sandbox directory.

    Args:
        session_id: Session identifier
        language: Programming language code
        repl_mode: Whether to start in REPL mode

    Returns:
        SandboxInfo with paths to the sandbox directories

    Raises:
        RuntimeError: If the sandbox directory cannot be created or its
            permissions cannot be set. The underlying ``OSError`` is
            attached as the cause.
    """
    sandbox_id = uuid.uuid4().hex
    sandbox_dir = self._base_dir / sandbox_id
    data_dir = sandbox_dir / "data"

    try:
        data_dir.mkdir(parents=True, exist_ok=True)

        # Make data dir writable by the sandbox user.
        # Each sandbox has its own isolated directory so world-writable is safe.
        os.chmod(str(data_dir), 0o777)  # nosec B103
    except OSError as e:
        logger.error(
            "Failed to create sandbox directory",
            sandbox_id=sandbox_id,
            error=str(e),
        )
        # Do not leave a half-created sandbox directory behind.
        shutil.rmtree(str(sandbox_dir), ignore_errors=True)
        raise RuntimeError(f"Failed to create sandbox: {e}") from e

    # Take the timestamp once so the label and SandboxInfo.created_at agree.
    created_at = datetime.utcnow()

    labels = {
        "com.code-interpreter.managed": "true",
        "com.code-interpreter.type": "execution",
        "com.code-interpreter.session-id": session_id,
        "com.code-interpreter.language": language or "unknown",
        "com.code-interpreter.created-at": created_at.isoformat(),
        "com.code-interpreter.repl-mode": "true" if repl_mode else "false",
    }

    info = SandboxInfo(
        sandbox_id=sandbox_id,
        sandbox_dir=sandbox_dir,
        data_dir=data_dir,
        language=language,
        session_id=session_id,
        created_at=created_at,
        repl_mode=repl_mode,
        labels=labels,
    )

    logger.debug(
        "Created sandbox",
        sandbox_id=sandbox_id[:12],
        session_id=session_id[:12] if session_id else "none",
        language=language,
        repl_mode=repl_mode,
    )

    return info
def get_file_content_from_sandbox(
    self, sandbox_info: SandboxInfo, source_path: str
) -> Optional[bytes]:
    """Read file content from the sandbox data directory.

    The file is looked up first by its bare filename directly inside the
    data directory, then (for paths under ``/mnt/data/``) by its path
    relative to the data directory. A candidate is only read when its
    fully resolved location is still inside the data directory, so ``..``
    segments or symlinks cannot be used to read host files.

    Args:
        sandbox_info: Source sandbox
        source_path: Path to file (may be absolute like /mnt/data/file.py)

    Returns:
        File content as bytes, or None if the file is missing, lies outside
        the sandbox data directory, or cannot be read
    """
    try:
        data_root = sandbox_info.data_dir.resolve()

        # Candidate 1: bare filename (source_path may be absolute).
        candidates = [sandbox_info.data_dir / Path(source_path).name]
        # Candidate 2: the full path relative to data_dir.
        if source_path.startswith("/mnt/data/"):
            relative = source_path[len("/mnt/data/") :]
            candidates.append(sandbox_info.data_dir / relative)

        for candidate in candidates:
            if not candidate.exists():
                continue

            # Resolve symlinks and ".." before deciding whether to read.
            resolved = candidate.resolve()
            if data_root not in resolved.parents:
                logger.warning(
                    "Refusing to read path outside sandbox data directory",
                    sandbox_id=sandbox_info.sandbox_id[:12],
                    source_path=source_path,
                )
                continue

            if resolved.is_file():
                return resolved.read_bytes()

        logger.warning(
            "File not found in sandbox",
            sandbox_id=sandbox_info.sandbox_id[:12],
            source_path=source_path,
        )
        return None
    except Exception as e:
        logger.error(
            "Failed to get file content from sandbox",
            sandbox_id=sandbox_info.sandbox_id[:12],
            source_path=source_path,
            error=str(e),
        )
        return None
@dataclass
class SandboxInfo:
    """Represents an nsjail sandbox instance.

    This is the handle used throughout the codebase to reference a
    running execution environment.
    """

    sandbox_id: str
    sandbox_dir: Path
    data_dir: Path  # Host dir bind-mounted as /mnt/data
    language: str
    session_id: str
    created_at: datetime
    repl_mode: bool = False
    labels: Dict[str, str] = field(default_factory=dict)

    @property
    def id(self) -> str:
        """Compatibility property matching Container.id."""
        return self.sandbox_id


class NsjailConfig:
    """Builds nsjail CLI arguments from settings.

    Translates the application's security and resource settings into
    the corresponding nsjail command-line flags.
    """

    # Per-language read-only bind mounts for runtime paths
    _LANGUAGE_BIND_MOUNTS: Dict[str, List[str]] = {
        "py": [
            "/usr/local/lib/python3",
            "/usr/local/bin/python3",
            "/usr/local/bin/python",
        ],
        "js": [
            "/usr/local/bin/node",
            "/usr/local/lib/node_modules",
        ],
        "ts": [
            "/usr/local/bin/node",
            "/usr/local/bin/tsc",
            "/usr/local/lib/node_modules",
        ],
        "go": [
            "/usr/local/go",
        ],
        "java": [
            "/opt/java",
            "/usr/lib/jvm",
        ],
        "c": [],
        "cpp": [],
        "php": [
            "/usr/local/etc/php",
            "/usr/local/bin/php",
            "/usr/local/lib/php",
        ],
        "rs": [
            "/usr/local/cargo",
            "/usr/local/rustup",
        ],
        "r": [
            "/usr/local/lib/R",
            "/usr/lib/R",
        ],
        "f90": [],
        "d": [
            "/usr/lib/ldc",
            "/usr/bin/ldc2",
            "/usr/bin/ldmd2",
        ],
    }

    def __init__(self):
        pass

    def build_args(
        self,
        sandbox_dir: str,
        command: List[str],
        language: str,
        timeout: Optional[int] = None,
        network: bool = False,
        repl_mode: bool = False,
        env: Optional[Dict[str, str]] = None,
    ) -> List[str]:
        """Build nsjail CLI arguments.

        Args:
            sandbox_dir: Host directory of the sandbox. Accepted for
                interface compatibility; this method does not emit a bind
                mount for it (the caller's ``unshare --mount`` wrapper maps
                it to /mnt/data).
            command: Command and arguments to execute inside the sandbox
            language: Programming language code
            timeout: Execution timeout in seconds; defaults to
                ``settings.max_execution_time``. Ignored in REPL mode.
            network: Whether to allow network access
            repl_mode: Whether this is a REPL session (no time limit, and
                ``setsid()`` is skipped so stdin pipes stay connected)
            env: Environment variables to set inside the sandbox

        Returns:
            List of nsjail CLI arguments (not including "nsjail" itself)
        """
        if timeout is None:
            timeout = settings.max_execution_time

        normalized_lang = language.lower().strip()
        user_id = get_user_id_for_language(normalized_lang)

        args: List[str] = []

        # Execution mode
        args.extend(["--mode", "o"])

        # Suppress nsjail diagnostic output
        args.append("--really_quiet")

        if repl_mode:
            # REPL mode: skip setsid() so stdin pipes stay connected.
            # By default nsjail calls setsid() which creates a new session
            # and detaches the child from the pipe's session, breaking stdin.
            args.append("--skip_setsid")
            # Time limit 0 = no limit for REPL mode
            args.extend(["--time_limit", "0"])
        else:
            args.extend(["--time_limit", str(timeout)])

        # Per-process resource limits (rlimits)
        args.extend(
            ["--rlimit_as", "hard"]
        )  # Virtual address space (Go needs unlimited)
        args.extend(["--rlimit_fsize", "100"])  # Max file size: 100MB
        args.extend(["--rlimit_nofile", "256"])  # Max open files
        args.extend(
            ["--rlimit_nproc", "256"]
        )  # Max processes (needs headroom for REPL module imports)

        # Note: per-sandbox cgroup limits are not used because the
        # containerized environment prevents nsjail from writing to cgroup.procs.
        # Memory/CPU limits are enforced at the API container level via compose
        # deploy.resources. Per-process rlimits above provide additional
        # per-sandbox enforcement for file size, open files, and process count.

        # Namespace configuration:
        # - User namespace disabled: avoids /proc/self/gid_map write errors
        #   in the containerized environment. Security is still enforced by
        #   PID/mount/net/IPC/UTS namespaces and capability dropping.
        # - Network namespace enabled by default (disables network access).
        args.append("--disable_clone_newuser")
        if not network:
            # Network isolation: new net namespace with no interfaces
            args.append("--iface_no_lo")
        else:
            # Allow network: skip creating a new network namespace
            args.append("--disable_clone_newnet")

        # Mount namespace: disabled for nsjail itself. The executor wraps nsjail
        # in `unshare --mount` + `mount --bind` to map sandbox_dir to /mnt/data.
        # This gives each execution its own mount namespace where /mnt/data points
        # to the correct sandbox dir (concurrent-safe).
        args.append("--disable_clone_newns")

        # Hostname
        args.extend(["--hostname", "sandbox"])

        # Security: do NOT use --keep_caps (that flag KEEPS caps).
        # By default nsjail drops all capabilities, which is what we want.

        # Do not have nsjail mount its own /proc; any /proc masking is done
        # by the wrapper command that launches nsjail.
        args.append("--disable_proc")

        # Seccomp policy: block dangerous syscalls
        # - ptrace: prevents process inspection/debugging (BUG-006a)
        # - bind: prevents opening server sockets even with network access (BUG-006c)
        # Using ERRNO(1) so the process gets EPERM rather than SIGSYS
        args.extend(
            [
                "--seccomp_string",
                "POLICY policy { ERRNO(1) { ptrace, bind } } USE policy DEFAULT ALLOW",
            ]
        )

        # Working directory: /mnt/data (bind-mounted by the executor wrapper)
        args.extend(["--cwd", "/mnt/data"])

        # User/group
        args.extend(["--user", str(user_id)])
        args.extend(["--group", str(user_id)])

        # Environment variables
        if env:
            for key, value in env.items():
                args.extend(["--env", f"{key}={value}"])

        # Separator between nsjail args and the command
        args.append("--")

        # Append the actual command
        args.extend(command)

        return args
@dataclass
class PooledSandbox:
    """Represents a sandbox available in the pool.

    Sandboxes in the pool are pre-warmed with a running REPL process
    and ready to be used. After use, sandboxes are destroyed.

    Identity (equality and hashing) is defined solely by the wrapped
    sandbox's ``sandbox_id``; the remaining fields are ignored.
    """

    sandbox_info: SandboxInfo
    repl_process: Optional[SandboxREPLProcess] = None
    created_at: datetime = field(default_factory=datetime.utcnow)
    status: str = "available"
    repl_enabled: bool = False
    repl_ready: bool = False

    def __hash__(self) -> int:
        return hash(self.sandbox_info.sandbox_id)

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, PooledSandbox):
            # Let Python try the reflected comparison instead of forcing False.
            return NotImplemented
        return self.sandbox_info.sandbox_id == other.sandbox_info.sandbox_id
+ + Args: + sandbox_manager: Manager for sandbox lifecycle operations + """ + self._sandbox_manager = sandbox_manager + self._nsjail_config = NsjailConfig() + self._repl_executor = SandboxREPLExecutor() + self._lock = asyncio.Lock() + + # Available sandboxes per language (ready to be used) + self._available: Dict[str, asyncio.Queue[PooledSandbox]] = {} + + # Map sandbox_id -> SandboxREPLProcess for acquired sandboxes + self._repl_processes: Dict[str, SandboxREPLProcess] = {} + + # Pool statistics per language + self._stats: Dict[str, PoolStats] = {} + + # Background tasks + self._warmup_task: Optional[asyncio.Task] = None + self._running = False + + # Languages to warm up on startup + self._warmup_languages: Set[str] = set() + + # Event for exhaustion-triggered replenishment + self._replenish_event = asyncio.Event() + + async def start(self) -> None: + """Start the sandbox pool and warmup background task.""" + if self._running: + return + + self._running = True + logger.info("Starting sandbox pool (simplified, no session tracking)") + + # Only Python supports REPL pool pre-warming. + # Other languages use one-shot nsjail execution with no pooling. 
async def acquire(self, language: str, session_id: str = "") -> SandboxInfo:
    """Acquire a sandbox from the pool.

    This method:
    1. Takes pooled sandboxes for ``language`` until one with a live REPL
       process is found; pooled sandboxes whose REPL is missing or has
       exited are destroyed along the way
    2. Creates a new sandbox if no usable pooled sandbox is available

    Args:
        language: Programming language code
        session_id: Session identifier (for logging only, not tracked)

    Returns:
        SandboxInfo ready for execution
    """
    start_time = datetime.utcnow()

    # Try to get from pool
    if settings.sandbox_pool_enabled:
        queue = self._available.get(language)
        while queue is not None and not queue.empty():
            try:
                pooled = queue.get_nowait()
            except asyncio.QueueEmpty:
                break

            repl = pooled.repl_process
            if not repl or repl.process.returncode is not None:
                # REPL process is dead: destroy it and try the next one,
                # so a single dead entry does not cause a pool miss.
                await self._destroy_pooled_sandbox(pooled)
                continue

            acquire_time = (datetime.utcnow() - start_time).total_seconds() * 1000

            # Track the REPL process for this sandbox
            self._repl_processes[pooled.sandbox_info.sandbox_id] = repl

            # Update sandbox session info
            pooled.sandbox_info.session_id = session_id

            await event_bus.publish(
                ContainerAcquiredFromPool(
                    container_id=pooled.sandbox_info.sandbox_id,
                    session_id=session_id,
                    language=language,
                    acquire_time_ms=acquire_time,
                )
            )
            self._record_stats(
                language, pool_hit=True, acquire_time_ms=acquire_time
            )
            logger.debug(
                "Acquired sandbox from pool",
                session_id=session_id[:12] if session_id else "none",
                sandbox_id=pooled.sandbox_info.sandbox_id[:12],
                language=language,
                acquire_time_ms=f"{acquire_time:.1f}",
            )
            return pooled.sandbox_info

        # Pool empty
        await event_bus.publish(
            PoolExhausted(language=language, session_id=session_id)
        )

    # Create fresh sandbox (fallback)
    sandbox_info = await self._create_fresh_sandbox(session_id, language)
    reason = "pool_empty" if settings.sandbox_pool_enabled else "pool_disabled"
    await event_bus.publish(
        ContainerCreatedFresh(
            container_id=sandbox_info.sandbox_id,
            session_id=session_id,
            language=language,
            reason=reason,
        )
    )
    self._record_stats(language, pool_miss=True)

    return sandbox_info
def get_stats(self, language: Optional[str] = None) -> Dict[str, PoolStats]:
    """Get pool statistics.

    Args:
        language: If given, report only this language; otherwise report
            every language that has a queue or recorded statistics.

    Returns:
        Mapping of language code to its PoolStats. ``available_count`` is
        refreshed from the current queue size for every reported language.
        Languages without recorded statistics get a new PoolStats that is
        not stored on the pool.
    """
    if language:
        languages = [language]
    else:
        languages = set(self._available) | set(self._stats)

    stats: Dict[str, PoolStats] = {}
    for lang in languages:
        queue = self._available.get(lang)
        available = queue.qsize() if queue is not None else 0

        entry = self._stats.get(lang)
        if entry is None:
            entry = PoolStats(language=lang, available_count=available)
        else:
            entry.available_count = available
        stats[lang] = entry
    return stats
async def _start_repl_process(
    self, sandbox_info: SandboxInfo
) -> Optional[SandboxREPLProcess]:
    """Start a REPL process inside an nsjail sandbox.

    The REPL is launched as ``unshare --mount -- /bin/sh -c <wrapper>`` in
    its own session/process group. If the REPL does not become ready, or
    anything fails after the process was spawned, the whole process group
    is killed so no nsjail/python children are left behind.

    Args:
        sandbox_info: Sandbox to start REPL in

    Returns:
        SandboxREPLProcess if successful, None if failed
    """
    import shlex

    proc = None
    started = False
    try:
        # Build nsjail args for REPL mode
        env = self._sandbox_manager.executor._build_sanitized_env("py")
        nsjail_args = self._nsjail_config.build_args(
            sandbox_dir=str(sandbox_info.data_dir),
            command=["/usr/bin/python3", "/opt/repl_server.py"],
            language="py",
            repl_mode=True,
            env=env,
        )

        # Wrap nsjail in unshare+mount so /mnt/data resolves to sandbox dir
        nsjail_cmd = " ".join(
            shlex.quote(str(a)) for a in [settings.nsjail_binary] + nsjail_args
        )
        wrapper_cmd = " && ".join(
            [
                # Bind sandbox dir to /mnt/data (before hiding sandboxes dir)
                f"mount --bind {shlex.quote(str(sandbox_info.data_dir))} /mnt/data",
                # BUG-001: Hide other sessions' sandbox directories
                "mount -t tmpfs -o size=1k tmpfs /var/lib/code-interpreter/sandboxes",
                # BUG-002: Hide metrics database
                "mount -t tmpfs -o size=1k tmpfs /app/data",
                # BUG-004: Hide log directory
                "mount -t tmpfs -o size=1k tmpfs /var/log",
                # BUG-005: Hide SSL certs and application source
                "mount -t tmpfs -o size=1k tmpfs /app/ssl",
                "mount -t tmpfs -o size=1k tmpfs /app/dashboard",
                "mount -t tmpfs -o size=1k tmpfs /app/src",
                # BUG-003: Hide /proc (REPL is Python-only, always safe to mask)
                "mount --bind /tmp/empty_proc /proc",
                # Execute nsjail
                nsjail_cmd,
            ]
        )

        proc = await asyncio.create_subprocess_exec(
            "unshare",
            "--mount",
            "--",
            "/bin/sh",
            "-c",
            wrapper_cmd,
            stdin=asyncio.subprocess.PIPE,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
            start_new_session=True,  # New process group for clean killpg
        )

        repl_process = SandboxREPLProcess(
            process=proc,
            sandbox_info=sandbox_info,
        )

        # Wait for REPL to be ready
        ready = await self._repl_executor.wait_for_ready(
            repl_process,
            timeout=settings.repl_warmup_timeout_seconds,
        )
        if not ready:
            return None

        started = True
        return repl_process

    except Exception as e:
        logger.error(
            "Failed to start REPL process",
            sandbox_id=sandbox_info.sandbox_id[:12],
            error=str(e),
        )
        return None

    finally:
        # Runs on "not ready", on errors and on cancellation alike.
        if not started and proc is not None and proc.returncode is None:
            try:
                # Kill the entire process group (unshare + nsjail + python)
                os.killpg(proc.pid, signal.SIGKILL)
            except (ProcessLookupError, PermissionError):
                # Process already dead or not a group leader
                try:
                    proc.kill()
                except ProcessLookupError:
                    pass
            try:
                await proc.wait()
            except Exception:
                pass
async def _warmup_language(self, language: str) -> None:
    """Warm up sandboxes for a specific language using parallel creation.

    Tops the language's queue up to the configured pool size, creating
    sandboxes concurrently in batches. Failed creations are logged and
    skipped; they are retried on the next warmup pass.

    Args:
        language: Programming language code
    """
    config = PoolConfig.from_settings(language)
    queue = self._available.get(language)
    if queue is None:
        queue = self._available[language] = asyncio.Queue()

    current_size = queue.qsize()
    if current_size >= config.size:
        return

    needed = config.size - current_size
    created = 0

    # Enable REPL mode for Python if configured
    use_repl_mode = language == "py" and settings.repl_enabled

    # Parallel sandbox creation in batches. Clamp to at least 1: a batch
    # size of 0 would make range() raise, and a negative one would yield no
    # batches at all, so the pool would never warm.
    batch_size = max(1, settings.sandbox_pool_parallel_batch)

    for batch_start in range(0, needed, batch_size):
        batch_count = min(batch_size, needed - batch_start)

        # Launch sandbox creations in parallel
        tasks = [
            self._create_pooled_sandbox(language, use_repl_mode)
            for _ in range(batch_count)
        ]

        results = await asyncio.gather(*tasks, return_exceptions=True)

        for result in results:
            if isinstance(result, PooledSandbox):
                await queue.put(result)
                created += 1
            elif isinstance(result, Exception):
                logger.warning(
                    "Failed to create pooled sandbox",
                    language=language,
                    error=str(result),
                )

    if created > 0:
        await event_bus.publish(
            PoolWarmedUp(language=language, container_count=created)
        )
        logger.debug(
            "Warmed up sandboxes",
            language=language,
            created=created,
            total=queue.qsize(),
            repl_mode=use_repl_mode,
        )
+ + Args: + language: Programming language code + use_repl_mode: Whether to enable REPL mode (Python only) + + Returns: + PooledSandbox if successful, None if failed + """ + try: + # Create sandbox with a unique pool-specific session ID + pool_session_id = f"pool-{language}-{uuid.uuid4().hex[:12]}" + sandbox_info = self._sandbox_manager.create_sandbox( + session_id=pool_session_id, + language=language, + repl_mode=use_repl_mode, + ) + + repl_process = None + repl_ready = False + + if use_repl_mode: + repl_process = await self._start_repl_process(sandbox_info) + if repl_process is None: + logger.warning( + "REPL not ready, removing sandbox", + sandbox_id=sandbox_info.sandbox_id[:12], + language=language, + ) + self._sandbox_manager.destroy_sandbox(sandbox_info) + return None + repl_ready = True + + pooled = PooledSandbox( + sandbox_info=sandbox_info, + repl_process=repl_process, + created_at=datetime.utcnow(), + status="available", + repl_enabled=use_repl_mode, + repl_ready=repl_ready, + ) + + if use_repl_mode: + logger.debug( + "REPL sandbox ready", + sandbox_id=sandbox_info.sandbox_id[:12], + language=language, + ) + + return pooled + + except Exception as e: + logger.warning( + "Failed to create pooled sandbox", + language=language, + error=str(e), + ) + return None + + def _record_stats( + self, + language: str, + pool_hit: bool = False, + pool_miss: bool = False, + acquire_time_ms: float = 0.0, + ) -> None: + """Record pool statistics.""" + if language not in self._stats: + self._stats[language] = PoolStats(language=language) + + stats = self._stats[language] + stats.total_acquisitions += 1 + + if pool_hit: + stats.pool_hits += 1 + if pool_miss: + stats.pool_misses += 1 + if acquire_time_ms > 0: + # Running average + n = stats.total_acquisitions + stats.avg_acquire_time_ms = ( + stats.avg_acquire_time_ms * (n - 1) + acquire_time_ms + ) / n diff --git a/src/services/sandbox/repl_executor.py b/src/services/sandbox/repl_executor.py new file mode 100644 index 
0000000..cfe8807
--- /dev/null
+++ b/src/services/sandbox/repl_executor.py
@@ -0,0 +1,434 @@
+"""REPL-based code execution for pre-warmed Python sandboxes.
+
+This module provides fast code execution by communicating with a
+running Python REPL inside an nsjail sandbox, eliminating interpreter startup.
+
+The REPL server runs as the main process in the sandbox and communicates
+via stdin/stdout subprocess pipes using a JSON-based protocol with delimiters.
+"""
+
+import asyncio
+import json
+import math
+import time
+import structlog
+from dataclasses import dataclass, field
+from datetime import datetime
+from typing import Tuple, Optional, Dict, Any, List
+
+from ...config import settings
+from .nsjail import SandboxInfo
+
+logger = structlog.get_logger(__name__)
+
+# Protocol delimiter (must match repl_server.py)
+DELIMITER = b"\n---END---\n"
+
+
+@dataclass
+class SandboxREPLProcess:
+    """Represents a running REPL process inside an nsjail sandbox."""
+
+    process: asyncio.subprocess.Process
+    sandbox_info: SandboxInfo
+    created_at: datetime = field(default_factory=datetime.utcnow)
+    ready: bool = False
+
+
+class SandboxREPLExecutor:
+    """Executes code via running REPL in an nsjail sandbox.
+
+    Uses subprocess stdin/stdout pipes to communicate with the REPL server.
+    """
+
+    def __init__(self):
+        """Initialize REPL executor."""
+        pass
+
+    async def execute(
+        self,
+        process: SandboxREPLProcess,
+        code: str,
+        timeout: Optional[int] = None,
+        working_dir: str = "/mnt/data",
+        args: Optional[List[str]] = None,
+    ) -> Tuple[int, str, str]:
+        """Execute code in running REPL.
+
+        Args:
+            process: REPL process to communicate with
+            code: Python code to execute
+            timeout: Maximum execution time in seconds
+                (defaults to settings.max_execution_time)
+            working_dir: Working directory for code execution
+            args: Optional list of command line arguments
+
+        Returns:
+            Tuple of (exit_code, stdout, stderr). exit_code is 124 when the
+            exchange timed out and 1 when communication with the REPL failed.
+        """
+        if timeout is None:
+            timeout = settings.max_execution_time
+
+        start_time = time.perf_counter()
+
+        # Build request
+        request = {"code": code, "timeout": timeout, "working_dir": working_dir}
+        if args:
+            request["args"] = args
+        request_json = json.dumps(request)
+        request_bytes = request_json.encode("utf-8") + DELIMITER
+
+        try:
+            # Extra 5s margin beyond the timeout sent to the REPL
+            response = await self._send_and_receive(process, request_bytes, timeout + 5)
+
+            elapsed_ms = (time.perf_counter() - start_time) * 1000
+            logger.debug(
+                "REPL execution completed",
+                sandbox_id=process.sandbox_info.sandbox_id[:12],
+                elapsed_ms=f"{elapsed_ms:.1f}",
+                exit_code=response.get("exit_code", -1),
+            )
+
+            return self._parse_response(response)
+
+        except asyncio.TimeoutError:
+            elapsed_ms = (time.perf_counter() - start_time) * 1000
+            logger.warning(
+                "REPL execution timed out",
+                sandbox_id=process.sandbox_info.sandbox_id[:12],
+                timeout=timeout,
+                elapsed_ms=f"{elapsed_ms:.1f}",
+            )
+            return 124, "", f"Execution timed out after {timeout} seconds"
+
+        except Exception as e:
+            elapsed_ms = (time.perf_counter() - start_time) * 1000
+            logger.error(
+                "REPL execution failed",
+                sandbox_id=process.sandbox_info.sandbox_id[:12],
+                error=str(e),
+                elapsed_ms=f"{elapsed_ms:.1f}",
+            )
+            return 1, "", f"REPL execution error: {str(e)}"
+
+    async def execute_with_state(
+        self,
+        process: SandboxREPLProcess,
+        code: str,
+        timeout: Optional[int] = None,
+        working_dir: str = "/mnt/data",
+        initial_state: Optional[str] = None,
+        capture_state: bool = False,
+        args: Optional[List[str]] = None,
+    ) -> Tuple[int, str, str, Optional[str], List[str]]:
+        """Execute code in running REPL with optional state persistence.
+
+        Args:
+            process: REPL process to communicate with
+            code: Python code to execute
+            timeout: Maximum execution time in seconds
+                (defaults to settings.max_execution_time)
+            working_dir: Working directory for code execution
+            initial_state: Base64-encoded state to restore before execution
+            capture_state: Whether to capture state after execution
+            args: Optional list of command line arguments
+
+        Returns:
+            Tuple of (exit_code, stdout, stderr, new_state, state_errors)
+            new_state is base64-encoded cloudpickle, or None if not captured.
+            On timeout or communication failure the tuple is
+            (124 or 1, "", message, None, []).
+        """
+        if timeout is None:
+            timeout = settings.max_execution_time
+
+        start_time = time.perf_counter()
+
+        # Build request with state options
+        request = {"code": code, "timeout": timeout, "working_dir": working_dir}
+
+        if initial_state:
+            request["initial_state"] = initial_state
+
+        if capture_state:
+            request["capture_state"] = True
+
+        if args:
+            request["args"] = args
+
+        request_json = json.dumps(request)
+        request_bytes = request_json.encode("utf-8") + DELIMITER
+
+        try:
+            # Extra 10s margin beyond the timeout sent to the REPL
+            response = await self._send_and_receive(
+                process, request_bytes, timeout + 10
+            )
+
+            elapsed_ms = (time.perf_counter() - start_time) * 1000
+            logger.debug(
+                "REPL execution with state completed",
+                sandbox_id=process.sandbox_info.sandbox_id[:12],
+                elapsed_ms=f"{elapsed_ms:.1f}",
+                exit_code=response.get("exit_code", -1),
+                has_state="state" in response,
+            )
+
+            return self._parse_response_with_state(response)
+
+        except asyncio.TimeoutError:
+            elapsed_ms = (time.perf_counter() - start_time) * 1000
+            logger.warning(
+                "REPL execution timed out",
+                sandbox_id=process.sandbox_info.sandbox_id[:12],
+                timeout=timeout,
+                elapsed_ms=f"{elapsed_ms:.1f}",
+            )
+            return 124, "", f"Execution timed out after {timeout} seconds", None, []
+
+        except Exception as e:
+            elapsed_ms = (time.perf_counter() - start_time) * 1000
+            logger.error(
+                "REPL execution failed",
+                sandbox_id=process.sandbox_info.sandbox_id[:12],
+                error=str(e),
+                elapsed_ms=f"{elapsed_ms:.1f}",
+            )
+            return 1, "", f"REPL execution error: {str(e)}", None, []
+
+    async def _read_message(
+        self, stdout: asyncio.StreamReader, timeout: float
+    ) -> Optional[bytes]:
+        """Read one delimiter-terminated message from the REPL's stdout.
+
+        Bytes that arrive after the first delimiter in the same read are
+        discarded.
+
+        Args:
+            stdout: Stream attached to the REPL process stdout
+            timeout: Maximum time in seconds to wait for a complete message
+
+        Returns:
+            Payload bytes preceding the delimiter, or None if the stream
+            reached EOF before a delimiter was seen
+
+        Raises:
+            asyncio.TimeoutError: If no complete message arrived in time
+        """
+        buffer = bytearray()
+
+        async def _fill() -> None:
+            while DELIMITER not in buffer:
+                chunk = await stdout.read(4096)
+                if not chunk:
+                    break
+                buffer.extend(chunk)
+
+        await asyncio.wait_for(_fill(), timeout=timeout)
+
+        payload, found, _ = bytes(buffer).partition(DELIMITER)
+        return payload if found else None
+
+    async def _send_and_receive(
+        self, process: SandboxREPLProcess, request: bytes, timeout: int
+    ) -> Dict[str, Any]:
+        """Send request to REPL and receive response via subprocess pipes.
+
+        Subprocess pipes give clean stdout without multiplexed stream
+        headers.
+
+        The timeout covers both flushing the request and reading the
+        response, so a stalled REPL cannot block the caller in drain().
+
+        NOTE: nothing drains stdout after a timeout. If the REPL answers
+        late, the next request on the same process will read that stale
+        response, so callers should not reuse a process that timed out.
+
+        Args:
+            process: REPL process with stdin/stdout pipes
+            request: Request bytes to send
+            timeout: Timeout in seconds
+
+        Returns:
+            Parsed JSON response dict
+
+        Raises:
+            RuntimeError: If the process has exited or its pipes are missing
+            asyncio.TimeoutError: If the exchange did not finish in time
+            ValueError: If the response is not valid JSON or not a JSON object
+        """
+        proc = process.process
+
+        if proc.returncode is not None:
+            raise RuntimeError(f"REPL process has exited with code {proc.returncode}")
+
+        if proc.stdin is None or proc.stdout is None:
+            raise RuntimeError("REPL process stdin/stdout not available")
+
+        loop = asyncio.get_running_loop()
+        deadline = loop.time() + timeout
+
+        # Send request; bound the flush so a full pipe cannot hang forever
+        proc.stdin.write(request)
+        await asyncio.wait_for(proc.stdin.drain(), timeout=timeout)
+
+        # Read response until delimiter within the remaining time budget
+        remaining = max(0.0, deadline - loop.time())
+        payload = await self._read_message(proc.stdout, remaining)
+
+        if payload is None:
+            return {
+                "exit_code": 1,
+                "stdout": "",
+                "stderr": "Invalid response from REPL: delimiter not found",
+            }
+
+        # Parse response
+        response = json.loads(payload.decode("utf-8", errors="replace"))
+        if not isinstance(response, dict):
+            raise ValueError("Invalid response from REPL: expected a JSON object")
+        return response
+
+    def _parse_response(self, response: Dict[str, Any]) -> Tuple[int, str, str]:
+        """Parse REPL response into (exit_code, stdout, stderr).
+
+        Args:
+            response: JSON response from REPL
+
+        Returns:
+            Tuple of (exit_code, stdout, stderr)
+        """
+        return (
+            response.get("exit_code", 1),
+            response.get("stdout", ""),
+            response.get("stderr", ""),
+        )
+
+    def _parse_response_with_state(
+        self, response: Dict[str, Any]
+    ) -> Tuple[int, str, str, Optional[str], List[str]]:
+        """Parse REPL response including state data.
+
+        Args:
+            response: JSON response from REPL
+
+        Returns:
+            Tuple of (exit_code, stdout, stderr, state, state_errors)
+        """
+        return (
+            response.get("exit_code", 1),
+            response.get("stdout", ""),
+            response.get("stderr", ""),
+            response.get("state"),  # May be None
+            response.get("state_errors", []),
+        )
+
+    async def check_health(
+        self, process: SandboxREPLProcess, timeout: float = 5.0
+    ) -> bool:
+        """Check if REPL is responsive.
+
+        Sends a simple health check code and verifies response.
+
+        Args:
+            process: REPL process to check
+            timeout: Maximum time to wait for response
+
+        Returns:
+            True if REPL is healthy, False otherwise
+        """
+        try:
+            # execute() takes whole seconds; round up so a sub-second
+            # timeout is not truncated to 0
+            exit_code, stdout, _stderr = await self.execute(
+                process,
+                "print('health_check_ok')",
+                timeout=max(1, math.ceil(timeout)),
+            )
+            return exit_code == 0 and "health_check_ok" in stdout
+
+        except Exception as e:
+            logger.debug(
+                "REPL health check failed",
+                sandbox_id=process.sandbox_info.sandbox_id[:12],
+                error=str(e),
+            )
+            return False
+
+    async def wait_for_ready(
+        self,
+        process: SandboxREPLProcess,
+        timeout: float = 10.0,
+        poll_interval: float = 0.1,
+    ) -> bool:
+        """Wait for REPL to be ready by consuming its ready signal.
+
+        The REPL server sends a ready signal (a JSON message with
+        ``"status": "ready"``) on stdout after pre-loading libraries.
+        This method reads that signal directly so it does not interfere
+        with subsequent request/response pairs.
+
+        Args:
+            process: REPL process
+            timeout: Maximum time to wait
+            poll_interval: Unused; kept for backward compatibility
+
+        Returns:
+            True if a valid ready signal was received, False on timeout,
+            EOF, or an unparseable/unexpected message
+        """
+        start_time = time.perf_counter()
+        proc = process.process
+
+        if proc.stdout is None:
+            return False
+
+        sandbox_id = process.sandbox_info.sandbox_id[:12]
+
+        # Read the ready signal directly from stdout
+        try:
+            payload = await self._read_message(proc.stdout, timeout)
+        except asyncio.TimeoutError:
+            logger.warning(
+                "REPL ready timeout waiting for ready signal",
+                sandbox_id=sandbox_id,
+                timeout=timeout,
+            )
+            return False
+
+        if payload is None:
+            logger.warning(
+                "REPL stdout closed before ready signal",
+                sandbox_id=sandbox_id,
+            )
+            return False
+
+        try:
+            ready_msg = json.loads(payload.decode("utf-8", errors="replace"))
+        except (json.JSONDecodeError, UnicodeDecodeError) as e:
+            logger.warning(
+                "REPL ready signal parse error",
+                sandbox_id=sandbox_id,
+                error=str(e),
+            )
+            return False
+
+        # Guard against valid JSON that is not an object (e.g. a list)
+        if not isinstance(ready_msg, dict) or ready_msg.get("status") != "ready":
+            logger.warning(
+                "REPL sent unexpected message instead of ready signal",
+                sandbox_id=sandbox_id,
+            )
+            return False
+
+        elapsed = time.perf_counter() - start_time
+        logger.debug(
+            "REPL ready",
+            sandbox_id=sandbox_id,
+            elapsed_ms=f"{elapsed * 1000:.1f}",
+            preloaded=ready_msg.get("preloaded_modules"),
+        )
+        process.ready = True
+        return True
diff --git a/src/services/session.py b/src/services/session.py index 162e1d6..53f6eaa 100644 --- a/src/services/session.py +++ b/src/services/session.py @@ -35,6 +35,14 @@ def __init__( self._execution_service = execution_service self._file_service = file_service self._redis_available = False + + def set_execution_service(self, execution_service) -> None: + """Set the execution service dependency.""" + self._execution_service =
execution_service + + def set_file_service(self, file_service) -> None: + """Set the file service dependency.""" + self._file_service = file_service logger.info("Redis client created", url=settings.get_redis_url().split("@")[-1]) async def _check_redis_connectivity(self) -> bool: @@ -195,7 +203,7 @@ async def create_session(self, request: SessionCreate) -> Session: finally: await pipe.reset() - logger.info( + logger.debug( "Session created", session_id=session_id, expires_at=expires_at.isoformat() ) return session @@ -290,7 +298,7 @@ async def delete_session(self, session_id: str) -> bool: if self._execution_service: try: await self._execution_service.cleanup_session(session_id) - logger.info( + logger.debug( "Cleaned up execution resources for session", session_id=session_id ) except Exception as e: @@ -307,7 +315,7 @@ async def delete_session(self, session_id: str) -> bool: deleted_files = await self._file_service.cleanup_session_files( session_id ) - logger.info( + logger.debug( "Cleaned up file resources for session", session_id=session_id, deleted_files=deleted_files, @@ -339,7 +347,7 @@ async def delete_session(self, session_id: str) -> bool: deleted = result[0] > 0 # First command result (delete) if deleted: - logger.info("Session deleted", session_id=session_id, entity_id=entity_id) + logger.debug("Session deleted", session_id=session_id, entity_id=entity_id) return deleted @@ -371,7 +379,7 @@ async def cleanup_expired_sessions(self) -> int: session = await self.get_session(session_id) # If session data is missing, treat as expired/orphaned and clean up indexes if not session: - logger.info( + logger.debug( "Cleaning up orphaned session (missing data)", session_id=session_id ) # Attempt to clean up any files associated with this session by prefix @@ -380,13 +388,13 @@ async def cleanup_expired_sessions(self) -> int: deleted_files = await self._file_service.cleanup_session_files( session_id ) - logger.info( + logger.debug( "Cleaned up files for orphaned 
session", session_id=session_id, deleted_files=deleted_files, ) except Exception as e: - logger.error( + logger.warning( "Failed to cleanup files for orphaned session", session_id=session_id, error=str(e), @@ -404,11 +412,10 @@ async def cleanup_expired_sessions(self) -> int: continue if session.expires_at < now: - logger.info( + logger.debug( "Cleaning up expired session", session_id=session_id, expired_at=session.expires_at.isoformat(), - current_time=now.isoformat(), ) await self.delete_session(session_id) cleaned_count += 1 diff --git a/src/services/sqlite_metrics.py b/src/services/sqlite_metrics.py deleted file mode 100644 index 2bacc4b..0000000 --- a/src/services/sqlite_metrics.py +++ /dev/null @@ -1,690 +0,0 @@ -"""SQLite-based metrics storage for long-term analytics. - -This module provides persistent storage for execution metrics using SQLite, -enabling historical analytics, time-series charts, and dashboard visualizations. -""" - -import asyncio -from datetime import datetime, timedelta, timezone -from pathlib import Path -from typing import Any, Dict, List, Optional - -import aiosqlite - -from src.config import settings -from src.models.metrics import DetailedExecutionMetrics -from src.utils.logging import get_logger - -logger = get_logger(__name__) - -# SQL Schema -SCHEMA_SQL = """ --- Individual execution records (90-day retention by default) -CREATE TABLE IF NOT EXISTS executions ( - id INTEGER PRIMARY KEY AUTOINCREMENT, - execution_id TEXT NOT NULL UNIQUE, - session_id TEXT NOT NULL, - api_key_hash TEXT NOT NULL, - user_id TEXT, - entity_id TEXT, - language TEXT NOT NULL, - status TEXT NOT NULL, - execution_time_ms REAL NOT NULL, - memory_peak_mb REAL, - cpu_time_ms REAL, - container_source TEXT, - repl_mode INTEGER DEFAULT 0, - files_uploaded INTEGER DEFAULT 0, - files_generated INTEGER DEFAULT 0, - output_size_bytes INTEGER DEFAULT 0, - state_size_bytes INTEGER, - created_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP -); - --- Daily aggregates 
(1-year retention by default) -CREATE TABLE IF NOT EXISTS daily_aggregates ( - id INTEGER PRIMARY KEY AUTOINCREMENT, - date DATE NOT NULL, - api_key_hash TEXT, - language TEXT, - execution_count INTEGER DEFAULT 0, - success_count INTEGER DEFAULT 0, - failure_count INTEGER DEFAULT 0, - timeout_count INTEGER DEFAULT 0, - total_execution_time_ms REAL DEFAULT 0, - total_memory_mb REAL DEFAULT 0, - pool_hits INTEGER DEFAULT 0, - pool_misses INTEGER DEFAULT 0, - created_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP, - UNIQUE(date, api_key_hash, language) -); - --- Hourly activity for heatmap (90-day retention) -CREATE TABLE IF NOT EXISTS hourly_activity ( - id INTEGER PRIMARY KEY AUTOINCREMENT, - date DATE NOT NULL, - hour INTEGER NOT NULL, - day_of_week INTEGER NOT NULL, - api_key_hash TEXT, - execution_count INTEGER DEFAULT 0, - success_count INTEGER DEFAULT 0, - avg_execution_time_ms REAL, - UNIQUE(date, hour, api_key_hash) -); - --- Indexes for efficient querying -CREATE INDEX IF NOT EXISTS idx_executions_created_at ON executions(created_at); -CREATE INDEX IF NOT EXISTS idx_executions_api_key_hash ON executions(api_key_hash); -CREATE INDEX IF NOT EXISTS idx_executions_language ON executions(language); -CREATE INDEX IF NOT EXISTS idx_executions_status ON executions(status); -CREATE INDEX IF NOT EXISTS idx_executions_composite ON executions(created_at, api_key_hash, language); - -CREATE INDEX IF NOT EXISTS idx_daily_date ON daily_aggregates(date); -CREATE INDEX IF NOT EXISTS idx_daily_api_key ON daily_aggregates(api_key_hash); -CREATE INDEX IF NOT EXISTS idx_daily_language ON daily_aggregates(language); - -CREATE INDEX IF NOT EXISTS idx_hourly_date ON hourly_activity(date); -CREATE INDEX IF NOT EXISTS idx_hourly_dow_hour ON hourly_activity(day_of_week, hour); -""" - - -class SQLiteMetricsService: - """SQLite-based metrics storage for long-term analytics.""" - - def __init__(self, db_path: Optional[str] = None): - self.db_path = db_path or 
settings.sqlite_metrics_db_path - self._db: Optional[aiosqlite.Connection] = None - self._write_queue: asyncio.Queue = asyncio.Queue() - self._writer_task: Optional[asyncio.Task] = None - self._aggregation_task: Optional[asyncio.Task] = None - self._cleanup_task: Optional[asyncio.Task] = None - self._running = False - self._batch_size = 100 - self._flush_interval = 5.0 # seconds - - async def start(self) -> None: - """Initialize database and start background tasks.""" - if self._running: - return - - # Ensure data directory exists - db_dir = Path(self.db_path).parent - db_dir.mkdir(parents=True, exist_ok=True) - - # Connect to database - self._db = await aiosqlite.connect(self.db_path) - self._db.row_factory = aiosqlite.Row - - # Enable WAL mode for better concurrent read/write performance - await self._db.execute("PRAGMA journal_mode=WAL") - await self._db.execute("PRAGMA synchronous=NORMAL") - await self._db.execute("PRAGMA cache_size=10000") - - # Create schema - await self._db.executescript(SCHEMA_SQL) - await self._db.commit() - - self._running = True - - # Start background tasks - self._writer_task = asyncio.create_task(self._batch_writer()) - self._aggregation_task = asyncio.create_task(self._aggregation_loop()) - self._cleanup_task = asyncio.create_task(self._cleanup_loop()) - - logger.info("SQLite metrics service started", db_path=self.db_path) - - async def stop(self) -> None: - """Flush pending writes and close connection.""" - if not self._running: - return - - self._running = False - - # Cancel background tasks - for task in [self._writer_task, self._aggregation_task, self._cleanup_task]: - if task: - task.cancel() - try: - await task - except asyncio.CancelledError: - pass - - # Flush remaining writes - await self._flush_queue() - - # Close database - if self._db: - await self._db.close() - self._db = None - - logger.info("SQLite metrics service stopped") - - async def record_execution(self, metrics: DetailedExecutionMetrics) -> None: - """Queue an 
execution record for batch writing.""" - if not self._running: - return - - await self._write_queue.put(metrics) - - async def _batch_writer(self) -> None: - """Background task that batches writes for efficiency.""" - batch: List[DetailedExecutionMetrics] = [] - - while self._running: - try: - # Wait for items with timeout - try: - item = await asyncio.wait_for( - self._write_queue.get(), timeout=self._flush_interval - ) - batch.append(item) - except asyncio.TimeoutError: - pass - - # Flush if batch is full or timeout occurred - if len(batch) >= self._batch_size or ( - batch and self._write_queue.empty() - ): - await self._write_batch(batch) - batch = [] - - except asyncio.CancelledError: - # Flush remaining on shutdown - if batch: - await self._write_batch(batch) - raise - except Exception as e: - logger.error("Error in batch writer", error=str(e)) - - async def _write_batch(self, batch: List[DetailedExecutionMetrics]) -> None: - """Write a batch of execution records to the database.""" - if not batch or not self._db: - return - - try: - await self._db.executemany( - """ - INSERT OR IGNORE INTO executions ( - execution_id, session_id, api_key_hash, user_id, entity_id, - language, status, execution_time_ms, memory_peak_mb, cpu_time_ms, - container_source, repl_mode, files_uploaded, files_generated, - output_size_bytes, state_size_bytes, created_at - ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) 
- """, - [ - ( - m.execution_id, - m.session_id, - m.api_key_hash[:16] if m.api_key_hash else "unknown", - m.user_id, - m.entity_id, - m.language, - m.status, - m.execution_time_ms, - m.memory_peak_mb, - m.cpu_time_ms, - m.container_source, - 1 if m.repl_mode else 0, - m.files_uploaded, - m.files_generated, - m.output_size_bytes, - m.state_size_bytes, - ( - m.timestamp.isoformat() - if m.timestamp - else datetime.now(timezone.utc).isoformat() - ), - ) - for m in batch - ], - ) - await self._db.commit() - logger.debug("Wrote metrics batch", count=len(batch)) - except Exception as e: - logger.error("Failed to write metrics batch", error=str(e)) - - async def _flush_queue(self) -> None: - """Flush all pending writes from the queue.""" - batch: List[DetailedExecutionMetrics] = [] - while not self._write_queue.empty(): - try: - batch.append(self._write_queue.get_nowait()) - except asyncio.QueueEmpty: - break - if batch: - await self._write_batch(batch) - - async def _aggregation_loop(self) -> None: - """Periodically aggregate executions into daily summaries.""" - interval = settings.metrics_aggregation_interval_minutes * 60 - - while self._running: - try: - await asyncio.sleep(interval) - await self.run_aggregation() - except asyncio.CancelledError: - raise - except Exception as e: - logger.error("Error in aggregation loop", error=str(e)) - - async def run_aggregation(self) -> None: - """Build daily aggregates from execution records.""" - if not self._db: - return - - try: - # Get yesterday's date for aggregation - yesterday = (datetime.now(timezone.utc) - timedelta(days=1)).date() - - # Aggregate by date, api_key, language - await self._db.execute( - """ - INSERT OR REPLACE INTO daily_aggregates ( - date, api_key_hash, language, - execution_count, success_count, failure_count, timeout_count, - total_execution_time_ms, total_memory_mb, pool_hits, pool_misses - ) - SELECT - DATE(created_at) as date, - api_key_hash, - language, - COUNT(*) as execution_count, - SUM(CASE 
WHEN status = 'completed' THEN 1 ELSE 0 END) as success_count, - SUM(CASE WHEN status = 'failed' THEN 1 ELSE 0 END) as failure_count, - SUM(CASE WHEN status = 'timeout' THEN 1 ELSE 0 END) as timeout_count, - SUM(execution_time_ms) as total_execution_time_ms, - SUM(COALESCE(memory_peak_mb, 0)) as total_memory_mb, - SUM(CASE WHEN container_source = 'pool_hit' THEN 1 ELSE 0 END) as pool_hits, - SUM(CASE WHEN container_source = 'pool_miss' THEN 1 ELSE 0 END) as pool_misses - FROM executions - WHERE DATE(created_at) <= ? - GROUP BY DATE(created_at), api_key_hash, language - """, - (yesterday.isoformat(),), - ) - - # Aggregate hourly activity - await self._db.execute( - """ - INSERT OR REPLACE INTO hourly_activity ( - date, hour, day_of_week, api_key_hash, - execution_count, success_count, avg_execution_time_ms - ) - SELECT - DATE(created_at) as date, - CAST(strftime('%H', created_at) AS INTEGER) as hour, - CAST(strftime('%w', created_at) AS INTEGER) as day_of_week, - api_key_hash, - COUNT(*) as execution_count, - SUM(CASE WHEN status = 'completed' THEN 1 ELSE 0 END) as success_count, - AVG(execution_time_ms) as avg_execution_time_ms - FROM executions - WHERE DATE(created_at) <= ? 
- GROUP BY DATE(created_at), hour, api_key_hash - """, - (yesterday.isoformat(),), - ) - - await self._db.commit() - logger.info("Aggregation completed", up_to_date=yesterday.isoformat()) - except Exception as e: - logger.error("Aggregation failed", error=str(e)) - - async def _cleanup_loop(self) -> None: - """Periodically clean up old data based on retention settings.""" - # Run cleanup once per day - interval = 24 * 60 * 60 - - while self._running: - try: - await asyncio.sleep(interval) - await self.cleanup_old_data() - except asyncio.CancelledError: - raise - except Exception as e: - logger.error("Error in cleanup loop", error=str(e)) - - async def cleanup_old_data(self) -> None: - """Remove data older than retention periods.""" - if not self._db: - return - - try: - now = datetime.now(timezone.utc) - - # Clean up old executions - exec_cutoff = ( - now - timedelta(days=settings.metrics_execution_retention_days) - ).isoformat() - result = await self._db.execute( - "DELETE FROM executions WHERE created_at < ?", (exec_cutoff,) - ) - exec_deleted = result.rowcount - - # Clean up old daily aggregates - daily_cutoff = ( - (now - timedelta(days=settings.metrics_daily_retention_days)) - .date() - .isoformat() - ) - result = await self._db.execute( - "DELETE FROM daily_aggregates WHERE date < ?", (daily_cutoff,) - ) - daily_deleted = result.rowcount - - # Clean up old hourly activity - hourly_cutoff = ( - (now - timedelta(days=settings.metrics_execution_retention_days)) - .date() - .isoformat() - ) - result = await self._db.execute( - "DELETE FROM hourly_activity WHERE date < ?", (hourly_cutoff,) - ) - hourly_deleted = result.rowcount - - await self._db.commit() - - # Vacuum to reclaim space - await self._db.execute("VACUUM") - - logger.info( - "Cleanup completed", - executions_deleted=exec_deleted, - daily_deleted=daily_deleted, - hourly_deleted=hourly_deleted, - ) - except Exception as e: - logger.error("Cleanup failed", error=str(e)) - - # 
========================================================================= - # Query Methods for Dashboard - # ========================================================================= - - async def get_summary_stats( - self, - start: datetime, - end: datetime, - api_key_hash: Optional[str] = None, - ) -> Dict[str, Any]: - """Get summary statistics for stats cards.""" - if not self._db: - return {} - - params: List[Any] = [start.isoformat(), end.isoformat()] - api_key_filter = "" - if api_key_hash: - api_key_filter = "AND api_key_hash = ?" - params.append(api_key_hash) - - cursor = await self._db.execute( - f""" - SELECT - COUNT(*) as total_executions, - SUM(CASE WHEN status = 'completed' THEN 1 ELSE 0 END) as success_count, - SUM(CASE WHEN status = 'failed' THEN 1 ELSE 0 END) as failure_count, - SUM(CASE WHEN status = 'timeout' THEN 1 ELSE 0 END) as timeout_count, - AVG(execution_time_ms) as avg_execution_time_ms, - SUM(CASE WHEN container_source = 'pool_hit' THEN 1 ELSE 0 END) as pool_hits, - SUM(CASE WHEN container_source IN ('pool_hit', 'pool_miss') THEN 1 ELSE 0 END) as pool_total, - COUNT(DISTINCT api_key_hash) as active_api_keys - FROM executions - WHERE created_at >= ? AND created_at <= ? 
{api_key_filter} - """, - params, - ) - row = await cursor.fetchone() - - if not row or row["total_executions"] == 0: - return { - "total_executions": 0, - "success_rate": 0, - "avg_execution_time_ms": 0, - "pool_hit_rate": 0, - "active_api_keys": 0, - } - - total = row["total_executions"] - success_rate = (row["success_count"] / total * 100) if total > 0 else 0 - pool_hit_rate = ( - (row["pool_hits"] / row["pool_total"] * 100) if row["pool_total"] > 0 else 0 - ) - - return { - "total_executions": total, - "success_count": row["success_count"] or 0, - "failure_count": row["failure_count"] or 0, - "timeout_count": row["timeout_count"] or 0, - "success_rate": round(success_rate, 1), - "avg_execution_time_ms": round(row["avg_execution_time_ms"] or 0, 1), - "pool_hit_rate": round(pool_hit_rate, 1), - "active_api_keys": row["active_api_keys"] or 0, - } - - async def get_language_usage( - self, - start: datetime, - end: datetime, - api_key_hash: Optional[str] = None, - stack_by_api_key: bool = False, - ) -> Dict[str, Any]: - """Get language usage data for stacked bar chart.""" - if not self._db: - return {"by_language": {}, "by_api_key": {}, "matrix": {}} - - params: List[Any] = [start.isoformat(), end.isoformat()] - api_key_filter = "" - if api_key_hash: - api_key_filter = "AND api_key_hash = ?" - params.append(api_key_hash) - - # Get totals by language - cursor = await self._db.execute( - f""" - SELECT language, COUNT(*) as count - FROM executions - WHERE created_at >= ? AND created_at <= ? {api_key_filter} - GROUP BY language - ORDER BY count DESC - """, - params, - ) - by_language = {row["language"]: row["count"] async for row in cursor} - - if not stack_by_api_key: - return {"by_language": by_language, "by_api_key": {}, "matrix": {}} - - # Get stacked data: language x api_key matrix - params = [start.isoformat(), end.isoformat()] - cursor = await self._db.execute( - """ - SELECT language, api_key_hash, COUNT(*) as count - FROM executions - WHERE created_at >= ? 
AND created_at <= ? - GROUP BY language, api_key_hash - ORDER BY language, count DESC - """, - params, - ) - - matrix: Dict[str, Dict[str, int]] = {} - api_keys_seen: Dict[str, int] = {} - - async for row in cursor: - lang = row["language"] - key = row["api_key_hash"] - count = row["count"] - - if lang not in matrix: - matrix[lang] = {} - matrix[lang][key] = count - - if key not in api_keys_seen: - api_keys_seen[key] = 0 - api_keys_seen[key] += count - - return { - "by_language": by_language, - "by_api_key": api_keys_seen, - "matrix": matrix, - } - - async def get_time_series( - self, - start: datetime, - end: datetime, - api_key_hash: Optional[str] = None, - granularity: str = "hour", - ) -> Dict[str, Any]: - """Get execution trend data for line chart.""" - if not self._db: - return { - "timestamps": [], - "executions": [], - "success_rate": [], - "avg_duration": [], - } - - params: List[Any] = [start.isoformat(), end.isoformat()] - api_key_filter = "" - if api_key_hash: - api_key_filter = "AND api_key_hash = ?" - params.append(api_key_hash) - - # Determine time grouping format - if granularity == "hour": - time_format = "%Y-%m-%d %H:00" - elif granularity == "day": - time_format = "%Y-%m-%d" - else: # week - time_format = "%Y-%W" - - cursor = await self._db.execute( - f""" - SELECT - strftime('{time_format}', created_at) as period, - COUNT(*) as executions, - SUM(CASE WHEN status = 'completed' THEN 1 ELSE 0 END) as success_count, - AVG(execution_time_ms) as avg_duration - FROM executions - WHERE created_at >= ? AND created_at <= ? 
{api_key_filter} - GROUP BY period - ORDER BY period - """, - params, - ) - - timestamps = [] - executions = [] - success_rate = [] - avg_duration = [] - - async for row in cursor: - timestamps.append(row["period"]) - executions.append(row["executions"]) - rate = ( - (row["success_count"] / row["executions"] * 100) - if row["executions"] > 0 - else 0 - ) - success_rate.append(round(rate, 1)) - avg_duration.append(round(row["avg_duration"] or 0, 1)) - - return { - "timestamps": timestamps, - "executions": executions, - "success_rate": success_rate, - "avg_duration": avg_duration, - } - - async def get_heatmap_data( - self, - start: datetime, - end: datetime, - api_key_hash: Optional[str] = None, - ) -> Dict[str, Any]: - """Get day-of-week x hour activity matrix for heatmap.""" - if not self._db: - return {"matrix": [[0] * 24 for _ in range(7)], "max_value": 0} - - params: List[Any] = [start.isoformat(), end.isoformat()] - api_key_filter = "" - if api_key_hash: - api_key_filter = "AND api_key_hash = ?" - params.append(api_key_hash) - - cursor = await self._db.execute( - f""" - SELECT - CAST(strftime('%w', created_at) AS INTEGER) as day_of_week, - CAST(strftime('%H', created_at) AS INTEGER) as hour, - COUNT(*) as count - FROM executions - WHERE created_at >= ? AND created_at <= ? 
{api_key_filter} - GROUP BY day_of_week, hour - """, - params, - ) - - # Initialize 7x24 matrix (0=Sunday in SQLite, we'll adjust to 0=Monday) - matrix = [[0] * 24 for _ in range(7)] - max_value = 0 - - async for row in cursor: - # SQLite: 0=Sunday, 1=Monday, ..., 6=Saturday - # Convert to: 0=Monday, 1=Tuesday, ..., 6=Sunday - dow = (row["day_of_week"] - 1) % 7 - hour = row["hour"] - count = row["count"] - matrix[dow][hour] = count - max_value = max(max_value, count) - - return {"matrix": matrix, "max_value": max_value} - - async def get_api_keys_list(self) -> List[Dict[str, Any]]: - """Get list of API keys for filter dropdown.""" - if not self._db: - return [] - - cursor = await self._db.execute(""" - SELECT DISTINCT api_key_hash, COUNT(*) as usage_count - FROM executions - GROUP BY api_key_hash - ORDER BY usage_count DESC - LIMIT 50 - """) - - return [ - {"key_hash": row["api_key_hash"], "usage_count": row["usage_count"]} - async for row in cursor - ] - - async def get_top_languages( - self, - start: datetime, - end: datetime, - limit: int = 5, - ) -> List[Dict[str, Any]]: - """Get top languages by execution count.""" - if not self._db: - return [] - - cursor = await self._db.execute( - """ - SELECT language, COUNT(*) as count - FROM executions - WHERE created_at >= ? AND created_at <= ? - GROUP BY language - ORDER BY count DESC - LIMIT ? 
- """, - (start.isoformat(), end.isoformat(), limit), - ) - - return [ - {"language": row["language"], "count": row["count"]} async for row in cursor - ] - - -# Global service instance -sqlite_metrics_service = SQLiteMetricsService() diff --git a/src/services/state.py b/src/services/state.py index 34de4f3..8cbc308 100644 --- a/src/services/state.py +++ b/src/services/state.py @@ -167,13 +167,11 @@ async def save_state( await pipe.execute() - logger.info( + logger.debug( "Saved state to Redis", session_id=session_id[:12], state_size=len(raw_bytes), hash=state_hash[:12], - ttl_seconds=ttl_seconds, - from_upload=from_upload, ) return True, state_hash except Exception as e: diff --git a/src/utils/__init__.py b/src/utils/__init__.py index fb7b680..6cc0449 100644 --- a/src/utils/__init__.py +++ b/src/utils/__init__.py @@ -2,7 +2,7 @@ from .logging import setup_logging, get_logger from .security import SecurityValidator, RateLimiter, SecurityAudit, get_rate_limiter -from ..services.container import ContainerManager +from ..services.sandbox import SandboxManager __all__ = [ "setup_logging", @@ -11,5 +11,5 @@ "RateLimiter", "SecurityAudit", "get_rate_limiter", - "ContainerManager", + "SandboxManager", ] diff --git a/src/utils/config_validator.py b/src/utils/config_validator.py index 5a9c0fd..e44f65d 100644 --- a/src/utils/config_validator.py +++ b/src/utils/config_validator.py @@ -1,8 +1,8 @@ """Configuration validation utilities.""" import logging +import shutil from typing import List, Dict, Any -import docker import redis from minio import Minio from minio.error import S3Error @@ -39,7 +39,7 @@ def validate_all(self) -> bool: # Validate external services self._validate_redis_connection() self._validate_minio_connection() - self._validate_docker_connection() + self._validate_nsjail() # Log results if self.warnings: @@ -74,25 +74,16 @@ def _validate_security_config(self): if not settings.allowed_file_extensions: self.warnings.append("No allowed file extensions configured") 
- # Validate Docker security settings + # Validate sandbox security settings if not settings.enable_network_isolation: self.warnings.append("Network isolation is disabled - security risk") if not settings.enable_filesystem_isolation: self.warnings.append("Filesystem isolation is disabled - security risk") - if settings.docker_network_mode != "none": - self.warnings.append( - f"Docker network mode '{settings.docker_network_mode}' may allow network access" - ) - def _validate_resource_limits(self): """Validate resource limit configuration.""" - # Check critical limit conflicts - if settings.max_total_file_size_mb < settings.max_file_size_mb: - self.errors.append( - "Total file size limit is less than individual file size limit" - ) + pass def _validate_file_config(self): """Validate file handling configuration.""" @@ -138,7 +129,6 @@ def _validate_minio_connection(self): access_key=settings.minio_access_key, secret_key=settings.minio_secret_key, secure=settings.minio_secure, - region=settings.minio_region, ) # Test connection by listing buckets @@ -166,52 +156,15 @@ def _validate_minio_connection(self): else: self.errors.append(f"MinIO validation error: {e}") - def _validate_docker_connection(self): - """Validate Docker connection (non-blocking).""" - try: - # Try to create Docker client with very short timeout to avoid blocking - try: - client = docker.from_env(timeout=1) - except Exception as e: - logger.warning(f"Failed to create Docker client from environment: {e}") - # Fallback to explicit socket path with short timeout - try: - client = docker.DockerClient( - base_url="unix://var/run/docker.sock", timeout=1 - ) - except Exception as fallback_e: - self.warnings.append(f"Docker connection error: {fallback_e}") - return - - # Skip ping test during startup to avoid blocking - # The actual connection will be tested when Docker is first used - - # Skip image validation during startup to avoid blocking - # Images will be pulled when first needed - - except 
docker.errors.DockerException as e: - self.warnings.append(f"Docker connection error: {e}") - except Exception as e: - self.warnings.append(f"Docker validation error: {e}") - - def _validate_language_images(self, docker_client): - """Validate that required language images are available or can be pulled.""" - required_images = set() - for lang_config in settings.supported_languages.values(): - if "image" in lang_config: - required_images.add(lang_config["image"]) - - missing_images = [] - for image in required_images: - try: - docker_client.images.get(image) - except docker.errors.ImageNotFound: - missing_images.append(image) - - if missing_images: + def _validate_nsjail(self): + """Validate nsjail sandbox availability.""" + nsjail_path = shutil.which("nsjail") + if not nsjail_path: self.warnings.append( - f"Docker images not found locally (will be pulled on first use): {', '.join(missing_images)}" + "nsjail binary not found in PATH - sandboxed execution will not work" ) + else: + logger.info(f"nsjail found at: {nsjail_path}") def validate_configuration() -> bool: diff --git a/src/utils/logging.py b/src/utils/logging.py index 901d76d..36c5be5 100644 --- a/src/utils/logging.py +++ b/src/utils/logging.py @@ -94,14 +94,16 @@ def setup_file_logging() -> None: def configure_third_party_loggers() -> None: """Configure logging levels for third-party libraries.""" # Reduce noise from third-party libraries - logging.getLogger("uvicorn.access").setLevel(logging.WARNING) - logging.getLogger("docker").setLevel(logging.WARNING) logging.getLogger("urllib3").setLevel(logging.WARNING) logging.getLogger("minio").setLevel(logging.WARNING) - # Enable access logs if configured + # Suppress uvicorn access logs - RequestLoggingMiddleware handles this + # with status-aware levels (DEBUG for 2xx, WARNING for 4xx, ERROR for 5xx). + # Set ENABLE_ACCESS_LOGS=true to re-enable uvicorn's native access logs. 
if settings.enable_access_logs: logging.getLogger("uvicorn.access").setLevel(logging.INFO) + else: + logging.getLogger("uvicorn.access").setLevel(logging.WARNING) # Enable security logs if configured if settings.enable_security_logs: diff --git a/src/utils/request_helpers.py b/src/utils/request_helpers.py index 98c2c6b..069a02d 100644 --- a/src/utils/request_helpers.py +++ b/src/utils/request_helpers.py @@ -11,10 +11,7 @@ def extract_api_key(request: Request) -> Optional[str]: """Extract API key from request headers. - Checks in order: - 1. x-api-key header (preferred) - 2. Authorization header with Bearer token - 3. Authorization header with ApiKey token + Only checks the x-api-key header. Args: request: FastAPI Request object @@ -22,20 +19,7 @@ def extract_api_key(request: Request) -> Optional[str]: Returns: API key string or None if not found """ - # Check x-api-key header first (preferred method) - api_key = request.headers.get("x-api-key") - if api_key: - return api_key - - # Check Authorization header as fallback - auth_header = request.headers.get("authorization") - if auth_header: - if auth_header.startswith("Bearer "): - return auth_header[7:] - elif auth_header.startswith("ApiKey "): - return auth_header[7:] - - return None + return request.headers.get("x-api-key") def get_client_ip(request: Request) -> str: diff --git a/src/utils/shutdown.py b/src/utils/shutdown.py index ff6bd2d..fd0ba96 100644 --- a/src/utils/shutdown.py +++ b/src/utils/shutdown.py @@ -5,7 +5,7 @@ import structlog from ..services.health import health_service -from ..services.metrics import metrics_collector +from ..services.metrics import metrics_service logger = structlog.get_logger(__name__) @@ -73,14 +73,14 @@ async def cleanup_services() -> None: except Exception as e: logger.error("Error stopping session service", error=str(e)) - # Stop metrics collector with timeout + # Stop metrics service with timeout try: - await asyncio.wait_for(metrics_collector.stop(), timeout=5.0) - 
logger.info("Metrics collector stopped") + await asyncio.wait_for(metrics_service.stop(), timeout=5.0) + logger.info("Metrics service stopped") except asyncio.TimeoutError: - logger.warning("Metrics collector stop timed out") + logger.warning("Metrics service stop timed out") except Exception as e: - logger.error("Error stopping metrics collector", error=str(e)) + logger.error("Error stopping metrics service", error=str(e)) # Close health service with timeout try: @@ -92,9 +92,9 @@ async def cleanup_services() -> None: logger.error("Error closing health service", error=str(e)) -async def cleanup_active_containers() -> None: - """Cleanup active containers during shutdown.""" - logger.info("Cleaning up active containers") +async def cleanup_active_sandboxes() -> None: + """Cleanup active sandboxes during shutdown.""" + logger.info("Cleaning up active sandboxes") try: # Import here to avoid circular imports and handle import errors @@ -104,14 +104,14 @@ async def cleanup_active_containers() -> None: execution_service = get_execution_service() # Stop all active executions with shorter timeout to prevent hanging - await asyncio.wait_for(execution_service.cleanup_all_containers(), timeout=8.0) - logger.info("Container cleanup completed") + await asyncio.wait_for(execution_service.cleanup_all_sandboxes(), timeout=8.0) + logger.info("Sandbox cleanup completed") except asyncio.TimeoutError: - logger.warning("Container cleanup timed out after 8 seconds - forcing shutdown") + logger.warning("Sandbox cleanup timed out after 8 seconds - forcing shutdown") except ImportError as e: logger.warning(f"Could not import execution service during shutdown: {e}") except Exception as e: - logger.error("Error cleaning up containers", error=str(e)) + logger.error("Error cleaning up sandboxes", error=str(e)) async def flush_logs_and_metrics() -> None: @@ -130,7 +130,7 @@ def setup_graceful_shutdown() -> None: """Setup graceful shutdown handling.""" # Add shutdown callbacks in order of 
execution (reversed during shutdown) shutdown_handler.add_shutdown_callback(flush_logs_and_metrics) - shutdown_handler.add_shutdown_callback(cleanup_active_containers) + shutdown_handler.add_shutdown_callback(cleanup_active_sandboxes) shutdown_handler.add_shutdown_callback(cleanup_services) logger.info("Graceful shutdown handling configured") diff --git a/tests/conftest.py b/tests/conftest.py index a1384fa..04e60b4 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -7,8 +7,8 @@ from typing import AsyncGenerator, Generator import redis.asyncio as redis from minio import Minio -from docker import DockerClient from datetime import datetime, timezone +from pathlib import Path import os # Set test environment before importing config @@ -73,23 +73,34 @@ def mock_minio(): @pytest.fixture -def mock_docker(): - """Mock Docker client for testing.""" - mock_client = MagicMock(spec=DockerClient) - mock_container = MagicMock() - - # Mock container operations - mock_container.id = "test_container_id" - mock_container.status = "running" - mock_container.reload.return_value = None - mock_container.exec_run.return_value = MagicMock(exit_code=0, output=b"test output") - - mock_client.containers.create.return_value = mock_container - mock_client.containers.get.return_value = mock_container - mock_client.images.pull.return_value = None - mock_client.images.get.return_value = MagicMock() +def mock_sandbox_manager(): + """Mock SandboxManager for testing.""" + from src.services.sandbox.nsjail import SandboxInfo + + manager = AsyncMock() + manager.is_available.return_value = True + manager.get_initialization_error.return_value = None + + # Create a mock SandboxInfo + mock_sandbox = SandboxInfo( + sandbox_id="test-sandbox-123", + sandbox_dir=Path("/tmp/test-sandbox"), + data_dir=Path("/tmp/test-sandbox/data"), + language="py", + session_id="test-session", + created_at=datetime.utcnow(), + repl_mode=False, + ) - return mock_client + manager.create_sandbox.return_value = mock_sandbox 
+ manager.destroy_sandbox.return_value = True + manager.copy_content_to_sandbox.return_value = True + manager.get_file_content_from_sandbox.return_value = b"test content" + manager.execute_command.return_value = (0, "output", "") + manager.get_user_id_for_language.return_value = 1001 + manager.close.return_value = None + + return manager @pytest.fixture @@ -101,27 +112,12 @@ async def session_service(mock_redis): @pytest.fixture -def execution_service(): +def execution_service(mock_sandbox_manager): """Create CodeExecutionService instance with mocked dependencies.""" with patch( - "src.services.execution.runner.ContainerManager" - ) as mock_container_manager: - mock_manager = MagicMock() - mock_container_manager.return_value = mock_manager - - # Mock container manager methods - mock_manager.get_image_for_language.return_value = "python:3.11" - mock_manager.pull_image_if_needed = AsyncMock() - mock_manager.create_container.return_value = MagicMock(id="test_container") - mock_manager.start_container = AsyncMock() - mock_manager.execute_command = AsyncMock(return_value=(0, "output", "")) - mock_manager.get_container_stats = AsyncMock( - return_value={"memory_usage_mb": 50} - ) - mock_manager.stop_container = AsyncMock() - mock_manager.remove_container = AsyncMock() - mock_manager.close.return_value = None - + "src.services.execution.runner.SandboxManager", + return_value=mock_sandbox_manager, + ): service = CodeExecutionService() yield service @@ -173,8 +169,6 @@ def mock_settings(): mock_settings.redis_url = None mock_settings.session_ttl_hours = 24 mock_settings.session_cleanup_interval_minutes = 60 - mock_settings.container_ttl_minutes = 5 - mock_settings.container_cleanup_interval_minutes = 5 mock_settings.minio_endpoint = "localhost:9000" mock_settings.minio_access_key = "test_key" mock_settings.minio_secret_key = "test_secret" @@ -185,13 +179,10 @@ def mock_settings(): mock_settings.max_file_size_mb = 10 mock_settings.max_output_files = 10 - # Add helper methods 
for backward compatibility + # Add helper methods mock_settings.get_session_ttl_minutes = ( lambda: mock_settings.session_ttl_hours * 60 ) - mock_settings.get_container_ttl_minutes = ( - lambda: mock_settings.container_ttl_minutes - ) yield mock_settings diff --git a/tests/functional/test_exec_workflow.py b/tests/functional/test_exec_workflow.py index 543b8d6..63b5ed4 100644 --- a/tests/functional/test_exec_workflow.py +++ b/tests/functional/test_exec_workflow.py @@ -184,13 +184,6 @@ async def test_exec_response_includes_state_fields( assert r.status_code == 200 data = r.json() - # State fields should be present for Python - assert "has_state" in data - # If state was captured, additional fields should be present - if data.get("has_state"): - assert "state_size" in data or data["state_size"] is None - assert "state_hash" in data or data["state_hash"] is None - @pytest.mark.asyncio async def test_dataframe_persists_across_executions( self, async_client, auth_headers, unique_entity_id diff --git a/tests/functional/test_files.py b/tests/functional/test_files.py index 8ff3782..1ad5c0e 100644 --- a/tests/functional/test_files.py +++ b/tests/functional/test_files.py @@ -225,38 +225,64 @@ async def test_download_nonexistent_returns_404( assert response.status_code == 404 -class TestFileDelete: - """Test DELETE /files/{session_id}/{file_id}.""" +class TestFileExecutionIntegration: + """Test the full upload → execute (read file) → generate output → download flow.""" @pytest.mark.asyncio - async def test_delete_file(self, async_client, auth_headers, unique_entity_id): - """Delete uploaded file returns 200.""" - files = {"files": ("delete-test.txt", b"Delete me", "text/plain")} + async def test_uploaded_file_readable_at_mnt_data( + self, async_client, auth_headers, unique_entity_id + ): + """Uploaded file is readable at /mnt/data/ inside execution sandbox.""" + csv_content = b"name,age,city\nAlice,30,NYC\nBob,25,LA\n" + files = {"files": ("people.csv", csv_content, 
"text/csv")} + # Upload upload = await async_client.post( "/upload", headers={"x-api-key": auth_headers["x-api-key"]}, files=files, data={"entity_id": unique_entity_id}, ) - - session_id = upload.json()["session_id"] - file_id = upload.json()["files"][0]["fileId"] - - # Delete - response = await async_client.delete( - f"/files/{session_id}/{file_id}", + assert upload.status_code == 200 + upload_data = upload.json() + session_id = upload_data["session_id"] + file_id = upload_data["files"][0]["fileId"] + filename = upload_data["files"][0]["filename"] + + # Execute code that reads the file via /mnt/data/ path + exec_response = await async_client.post( + "/exec", headers=auth_headers, + json={ + "code": ( + "import csv\n" + f"with open('/mnt/data/{filename}') as f:\n" + " reader = csv.DictReader(f)\n" + " rows = list(reader)\n" + "print(len(rows))\n" + "print(rows[0]['name'])\n" + ), + "lang": "py", + "session_id": session_id, + "files": [ + {"id": file_id, "session_id": session_id, "name": filename} + ], + }, ) - assert response.status_code == 200 + assert exec_response.status_code == 200 + result = exec_response.json() + assert "2" in result["stdout"] + assert "Alice" in result["stdout"] + assert result["stderr"] == "" @pytest.mark.asyncio - async def test_file_not_in_list_after_delete( + async def test_uploaded_file_readable_via_relative_path( self, async_client, auth_headers, unique_entity_id ): - """Deleted file no longer appears in file list.""" - files = {"files": ("delete-verify.txt", b"To be deleted", "text/plain")} + """Uploaded file is also readable via relative path (CWD = /mnt/data).""" + content = b"hello from uploaded file" + files = {"files": ("greeting.txt", content, "text/plain")} upload = await async_client.post( "/upload", @@ -264,28 +290,90 @@ async def test_file_not_in_list_after_delete( files=files, data={"entity_id": unique_entity_id}, ) + upload_data = upload.json() + session_id = upload_data["session_id"] + file_id = 
upload_data["files"][0]["fileId"] + filename = upload_data["files"][0]["filename"] - session_id = upload.json()["session_id"] - file_id = upload.json()["files"][0]["fileId"] - - # Delete - await async_client.delete( - f"/files/{session_id}/{file_id}", + exec_response = await async_client.post( + "/exec", headers=auth_headers, + json={ + "code": f"print(open('{filename}').read())", + "lang": "py", + "session_id": session_id, + "files": [ + {"id": file_id, "session_id": session_id, "name": filename} + ], + }, ) - # Verify deleted - list should be empty or not contain the file - list_response = await async_client.get( - f"/files/{session_id}", + result = exec_response.json() + assert "hello from uploaded file" in result["stdout"] + + @pytest.mark.asyncio + async def test_upload_execute_generate_download( + self, async_client, auth_headers, unique_entity_id + ): + """Full round-trip: upload CSV → process with pandas → download result.""" + csv_data = b"product,price\nWidget,9.99\nGadget,19.99\n" + files = {"files": ("input.csv", csv_data, "text/csv")} + + # Upload + upload = await async_client.post( + "/upload", + headers={"x-api-key": auth_headers["x-api-key"]}, + files=files, + data={"entity_id": unique_entity_id}, + ) + upload_data = upload.json() + session_id = upload_data["session_id"] + file_id = upload_data["files"][0]["fileId"] + filename = upload_data["files"][0]["filename"] + + # Execute: read input, transform, write output + exec_response = await async_client.post( + "/exec", headers=auth_headers, + json={ + "code": ( + "import csv\n" + f"with open('/mnt/data/{filename}') as f:\n" + " reader = csv.DictReader(f)\n" + " rows = list(reader)\n" + "with open('/mnt/data/output.csv', 'w', newline='') as f:\n" + " writer = csv.DictWriter(f, fieldnames=['product', 'price', 'tax'])\n" + " writer.writeheader()\n" + " for row in rows:\n" + " row['tax'] = f\"{float(row['price']) * 0.1:.2f}\"\n" + " writer.writerow(row)\n" + "print('done')\n" + ), + "lang": "py", + 
"session_id": session_id, + "files": [ + {"id": file_id, "session_id": session_id, "name": filename} + ], + }, ) - files_list = list_response.json() - file_ids = [] - for f in files_list: - # Handle different response formats - fid = f.get("id") or f.get("fileId") or f.get("file_id") - if fid: - file_ids.append(fid) + result = exec_response.json() + assert "done" in result["stdout"] + assert len(result["files"]) >= 1 + + # Find the generated output file + output_file = next( + (f for f in result["files"] if f["name"] == "output.csv"), None + ) + assert output_file is not None, f"output.csv not in files: {result['files']}" - assert file_id not in file_ids + # Download and verify content + download = await async_client.get( + f"/download/{session_id}/{output_file['id']}", + headers=auth_headers, + ) + assert download.status_code == 200 + downloaded_text = download.content.decode() + assert "product,price,tax" in downloaded_text + assert "Widget" in downloaded_text + assert "1.00" in downloaded_text # 9.99 * 0.1 = 1.00 diff --git a/tests/functional/test_state.py b/tests/functional/test_state.py deleted file mode 100644 index f1286b6..0000000 --- a/tests/functional/test_state.py +++ /dev/null @@ -1,257 +0,0 @@ -"""Functional tests for state persistence API endpoints. - -These are extended functionality beyond LibreChat's current usage. -LibreChat currently only supports file session persistence, not Python state. 
-""" - -import pytest - - -class TestStateInfo: - """Test GET /state/{session_id}/info.""" - - @pytest.mark.asyncio - async def test_info_nonexistent_state( - self, async_client, auth_headers, unique_session_id - ): - """Info for non-existent state returns exists=false.""" - response = await async_client.get( - f"/state/{unique_session_id}/info", - headers=auth_headers, - ) - - assert response.status_code == 200 - data = response.json() - assert data["exists"] is False - - @pytest.mark.asyncio - async def test_info_after_execution( - self, async_client, auth_headers, unique_entity_id - ): - """State info after Python execution shows state exists.""" - # Create state via execution - exec_response = await async_client.post( - "/exec", - headers=auth_headers, - json={ - "code": "state_test_var = {'key': 'value', 'number': 42}", - "lang": "py", - "entity_id": unique_entity_id, - }, - ) - assert exec_response.status_code == 200 - session_id = exec_response.json()["session_id"] - - # Check state info - info_response = await async_client.get( - f"/state/{session_id}/info", - headers=auth_headers, - ) - - assert info_response.status_code == 200 - data = info_response.json() - # State should exist after Python execution - assert "exists" in data - if data["exists"]: - assert "size_bytes" in data - assert "hash" in data - - -class TestStateDownload: - """Test GET /state/{session_id}.""" - - @pytest.mark.asyncio - async def test_download_nonexistent_state( - self, async_client, auth_headers, unique_session_id - ): - """Download state for non-existent session returns 404.""" - response = await async_client.get( - f"/state/{unique_session_id}", - headers=auth_headers, - ) - - assert response.status_code == 404 - - @pytest.mark.asyncio - async def test_download_state_after_execution( - self, async_client, auth_headers, unique_entity_id - ): - """Download state after Python execution returns binary data.""" - # Create state via execution - exec_response = await async_client.post( 
- "/exec", - headers=auth_headers, - json={ - "code": "download_test_data = {'key': 'value'}", - "lang": "py", - "entity_id": unique_entity_id, - }, - ) - assert exec_response.status_code == 200 - session_id = exec_response.json()["session_id"] - - # Try to download state - state_response = await async_client.get( - f"/state/{session_id}", - headers=auth_headers, - ) - - # May be 200 (state exists) or 404 (no state captured) - assert state_response.status_code in [200, 404] - - if state_response.status_code == 200: - # Should have ETag header - assert "etag" in state_response.headers - # Should have binary content - assert len(state_response.content) > 0 - - @pytest.mark.asyncio - async def test_state_etag_conditional_request( - self, async_client, auth_headers, unique_entity_id - ): - """State download supports ETag conditional requests.""" - # Create state - exec_response = await async_client.post( - "/exec", - headers=auth_headers, - json={ - "code": "etag_test_data = [1, 2, 3, 4, 5]", - "lang": "py", - "entity_id": unique_entity_id, - }, - ) - session_id = exec_response.json()["session_id"] - - # First download to get ETag - first_response = await async_client.get( - f"/state/{session_id}", - headers=auth_headers, - ) - - if first_response.status_code == 200: - etag = first_response.headers.get("etag") - if etag: - # Second request with If-None-Match should return 304 - second_response = await async_client.get( - f"/state/{session_id}", - headers={**auth_headers, "If-None-Match": etag}, - ) - # Should return 304 Not Modified - assert second_response.status_code in [200, 304] - - -class TestStateUpload: - """Test POST /state/{session_id}.""" - - @pytest.mark.asyncio - async def test_upload_valid_state( - self, async_client, auth_headers, unique_session_id - ): - """Upload valid state returns 201.""" - # Create minimal valid state (version 2 + lz4 compressed data) - # Version byte 0x02 indicates state format version 2 - state_bytes = b"\x02" + b"x" * 100 # 
Version byte + dummy data - - response = await async_client.post( - f"/state/{unique_session_id}", - headers={**auth_headers, "Content-Type": "application/octet-stream"}, - content=state_bytes, - ) - - assert response.status_code == 201 - data = response.json() - assert data["message"] == "state_uploaded" - assert data["size"] == len(state_bytes) - - @pytest.mark.asyncio - async def test_upload_invalid_version( - self, async_client, auth_headers, unique_session_id - ): - """Upload state with invalid version returns 400.""" - # Invalid version byte (0x99 is not valid) - state_bytes = b"\x99invalid_version_data" - - response = await async_client.post( - f"/state/{unique_session_id}", - headers={**auth_headers, "Content-Type": "application/octet-stream"}, - content=state_bytes, - ) - - assert response.status_code == 400 - - @pytest.mark.asyncio - async def test_upload_empty_state( - self, async_client, auth_headers, unique_session_id - ): - """Upload empty state returns 400.""" - response = await async_client.post( - f"/state/{unique_session_id}", - headers={**auth_headers, "Content-Type": "application/octet-stream"}, - content=b"", - ) - - assert response.status_code == 400 - - -class TestStateDelete: - """Test DELETE /state/{session_id}.""" - - @pytest.mark.asyncio - async def test_delete_state(self, async_client, auth_headers, unique_session_id): - """Delete state returns 204.""" - response = await async_client.delete( - f"/state/{unique_session_id}", - headers=auth_headers, - ) - - assert response.status_code == 204 - - @pytest.mark.asyncio - async def test_delete_nonexistent_state( - self, async_client, auth_headers, unique_session_id - ): - """Delete non-existent state still returns 204.""" - response = await async_client.delete( - f"/state/{unique_session_id}", - headers=auth_headers, - ) - - assert response.status_code == 204 - - @pytest.mark.asyncio - async def test_state_not_found_after_delete( - self, async_client, auth_headers, unique_entity_id - ): - 
"""State returns 404 after deletion.""" - # Create state - exec_response = await async_client.post( - "/exec", - headers=auth_headers, - json={ - "code": "delete_test_data = 'to be deleted'", - "lang": "py", - "entity_id": unique_entity_id, - }, - ) - session_id = exec_response.json()["session_id"] - - # Verify state exists (or might not if state capture didn't happen) - check_response = await async_client.get( - f"/state/{session_id}/info", - headers=auth_headers, - ) - - if check_response.json().get("exists"): - # Delete state - delete_response = await async_client.delete( - f"/state/{session_id}", - headers=auth_headers, - ) - assert delete_response.status_code == 204 - - # Verify state no longer exists - info_response = await async_client.get( - f"/state/{session_id}/info", - headers=auth_headers, - ) - assert info_response.json()["exists"] is False diff --git a/tests/functional/test_timing.py b/tests/functional/test_timing.py index c34d025..f6906cc 100644 --- a/tests/functional/test_timing.py +++ b/tests/functional/test_timing.py @@ -115,20 +115,3 @@ async def test_download_under_5s( assert latency < 5.0, f"Download took {latency:.1f}s, expected < 5s" -class TestStateTiming: - """Test state operation timing.""" - - @pytest.mark.asyncio - async def test_state_info_under_2s( - self, async_client, auth_headers, unique_session_id - ): - """State info check responds within 2 seconds.""" - start = time.perf_counter() - response = await async_client.get( - f"/state/{unique_session_id}/info", - headers=auth_headers, - ) - latency = time.perf_counter() - start - - assert response.status_code == 200 - assert latency < 2.0, f"State info took {latency:.1f}s, expected < 2s" diff --git a/tests/integration/test_api_contracts.py b/tests/integration/test_api_contracts.py index 0661c0b..169341f 100644 --- a/tests/integration/test_api_contracts.py +++ b/tests/integration/test_api_contracts.py @@ -133,7 +133,6 @@ def mock_file_service(): path="/test.txt", ) 
service.download_file.return_value = "https://minio.example.com/download-url" - service.delete_file.return_value = True return service @@ -473,29 +472,6 @@ def test_download_not_found(self, client, auth_headers, mock_file_service): assert response.status_code == 404 -class TestFileDeleteContract: - """Test file deletion endpoint contract.""" - - def test_delete_success(self, client, auth_headers, mock_file_service): - """Test successful file deletion.""" - response = client.delete( - "/files/test-session/test-file-id-123", headers=auth_headers - ) - - # API returns 200 with empty body for LibreChat compatibility - assert response.status_code == 200 - - def test_delete_not_found(self, client, auth_headers, mock_file_service): - """Test deletion of non-existent file.""" - mock_file_service.get_file_info.return_value = None - - response = client.delete( - "/files/test-session/nonexistent", headers=auth_headers - ) - - assert response.status_code == 404 - - # ============================================================================= # HEALTH ENDPOINTS # ============================================================================= @@ -585,23 +561,23 @@ def test_x_api_key_header(self, client): assert response.status_code != 401 - def test_authorization_bearer(self, client): - """Test Authorization Bearer authentication.""" + def test_authorization_bearer_rejected(self, client): + """Test Authorization Bearer header is not accepted for authentication.""" headers = {"Authorization": "Bearer test-api-key-for-testing-12345"} response = client.post( "/exec", json={"code": "print('test')", "lang": "py"}, headers=headers ) - assert response.status_code != 401 + assert response.status_code == 401 - def test_authorization_apikey(self, client): - """Test Authorization ApiKey authentication.""" + def test_authorization_apikey_rejected(self, client): + """Test Authorization ApiKey header is not accepted for authentication.""" headers = {"Authorization": "ApiKey 
test-api-key-for-testing-12345"} response = client.post( "/exec", json={"code": "print('test')", "lang": "py"}, headers=headers ) - assert response.status_code != 401 + assert response.status_code == 401 def test_no_auth_rejected(self, client): """Test requests without auth are rejected.""" diff --git a/tests/integration/test_auth_integration.py b/tests/integration/test_auth_integration.py index 32de11f..58fa285 100644 --- a/tests/integration/test_auth_integration.py +++ b/tests/integration/test_auth_integration.py @@ -57,29 +57,23 @@ def test_valid_api_key_x_api_key_header(self, client, mock_services): # Should not fail with authentication error assert response.status_code != 401 - def test_valid_api_key_authorization_bearer(self, client, mock_services): - """Test authentication with valid API key in Authorization Bearer header.""" + def test_authorization_bearer_rejected(self, client, mock_services): + """Test that Authorization Bearer header is not accepted.""" headers = {"Authorization": "Bearer test-api-key-for-testing-12345"} - with patch("src.services.auth.settings") as mock_settings: - mock_settings.api_key = "test-api-key-for-testing-12345" - - response = client.get("/sessions", headers=headers) + response = client.get("/sessions", headers=headers) - # Should not fail with authentication error - assert response.status_code != 401 + # Bearer auth is not supported; only x-api-key header is accepted + assert response.status_code == 401 - def test_valid_api_key_authorization_apikey(self, client, mock_services): - """Test authentication with valid API key in Authorization ApiKey header.""" + def test_authorization_apikey_rejected(self, client, mock_services): + """Test that Authorization ApiKey header is not accepted.""" headers = {"Authorization": "ApiKey test-api-key-for-testing-12345"} - with patch("src.services.auth.settings") as mock_settings: - mock_settings.api_key = "test-api-key-for-testing-12345" - - response = client.get("/sessions", headers=headers) 
+ response = client.get("/sessions", headers=headers) - # Should not fail with authentication error - assert response.status_code != 401 + # ApiKey auth is not supported; only x-api-key header is accepted + assert response.status_code == 401 def test_invalid_api_key(self, client, mock_services): """Test authentication with invalid API key.""" @@ -265,8 +259,8 @@ def test_auth_with_whitespace_in_key(self, client, mock_services): # Should either trim whitespace or reject assert response.status_code in [401, 200] # Depends on implementation - def test_multiple_auth_headers(self, client, mock_services): - """Test request with multiple authentication headers.""" + def test_multiple_auth_headers_only_x_api_key_used(self, client, mock_services): + """Test that only x-api-key header is used, Authorization header is ignored.""" with patch("src.services.auth.settings") as mock_settings: mock_settings.api_key = "test-api-key-for-testing-12345" @@ -277,7 +271,7 @@ def test_multiple_auth_headers(self, client, mock_services): response = client.get("/sessions", headers=headers) - # Should use one of the headers (typically x-api-key takes precedence) + # Only x-api-key is used, Authorization header is ignored assert response.status_code != 401 def test_auth_header_injection_attempt(self, client, mock_services): diff --git a/tests/integration/test_container_hardening.py b/tests/integration/test_container_hardening.py index d9bbc31..bb146be 100644 --- a/tests/integration/test_container_hardening.py +++ b/tests/integration/test_container_hardening.py @@ -43,11 +43,12 @@ class TestContainerHardening: """Test container hardening against information leakage.""" def test_hardening_config_defaults_enabled(self): - """Test that hardening configuration defaults are enabled.""" + """Test that sandbox hardening is enabled by default via nsjail.""" from src.config import settings - assert settings.container_mask_host_info is True - assert settings.container_generic_hostname == "sandbox" + # 
nsjail handles host info masking and hostname isolation natively + assert settings.enable_network_isolation is True + assert settings.enable_filesystem_isolation is True def test_hostname_is_generic(self, client, auth_headers): """Verify hostname is 'sandbox' instead of revealing host info.""" @@ -250,42 +251,29 @@ def test_machine_id_masked(self, client, auth_headers): class TestContainerHardeningConfig: """Test container hardening configuration integration.""" - def test_hardening_config_applied_to_container(self): - """Test that hardening config is used in container creation.""" - from src.services.container.manager import ContainerManager + def test_hardening_config_applied_to_sandbox(self): + """Test that hardening config is used in sandbox creation.""" + from src.services.sandbox.manager import SandboxManager from src.config import settings - # Verify settings are correctly configured - assert hasattr(settings, "container_mask_host_info") - assert hasattr(settings, "container_generic_hostname") + # Verify sandbox settings are correctly configured + assert hasattr(settings, "nsjail_binary") + assert hasattr(settings, "sandbox_base_dir") def test_masked_paths_list_complete(self): - """Test that all expected paths are in the masked paths list.""" + """Test that nsjail masks sensitive paths by default.""" from src.config import settings - # These are the paths that should be masked when hardening is enabled - expected_masked = [ - "/proc/version", - "/etc/machine-id", - ] + # nsjail handles path masking natively through its mount configuration + # Verify sandbox isolation settings are enabled + assert settings.enable_filesystem_isolation is True - # The actual paths are defined in manager.py when container_mask_host_info is True - # This test verifies the setting exists - assert settings.container_mask_host_info is True - - def test_dns_search_sanitized_for_wan(self): - """Test that dns_search is empty for WAN containers.""" + def 
test_network_isolation_enabled(self): + """Test that network isolation is enabled by default.""" from src.config import settings - # Verify WAN DNS configuration exists - assert hasattr(settings, "wan_dns_servers") - assert len(settings.wan_dns_servers) > 0 - # DNS servers should be public (e.g., 8.8.8.8, 1.1.1.1) - for dns in settings.wan_dns_servers: - # Should not be internal/private DNS - assert not dns.startswith("10.") - assert not dns.startswith("192.168.") - assert not dns.startswith("172.") + # nsjail sandboxes run without network access by default + assert settings.enable_network_isolation is True class TestContainerHardeningWAN: @@ -441,32 +429,9 @@ def test_ptrace_blocked_by_seccomp(self, client, auth_headers): finally: app.dependency_overrides.clear() - def test_seccomp_profile_config_exists(self): - """Verify seccomp profile configuration is set.""" + def test_sandbox_config_exists(self): + """Verify sandbox configuration is set.""" from src.config import settings - assert settings.docker_seccomp_profile == "docker/seccomp-sandbox.json" - - def test_seccomp_profile_file_exists(self): - """Verify seccomp profile file exists and is valid JSON.""" - import json - from pathlib import Path - - profile_path = Path("docker/seccomp-sandbox.json") - assert profile_path.exists(), "Seccomp profile file should exist" - - with open(profile_path) as f: - profile = json.load(f) - - # Verify structure - assert "defaultAction" in profile - assert "syscalls" in profile - assert isinstance(profile["syscalls"], list) - - # Verify ptrace is blocked - blocked_syscalls = [] - for rule in profile["syscalls"]: - if rule.get("action") == "SCMP_ACT_ERRNO": - blocked_syscalls.extend(rule.get("names", [])) - - assert "ptrace" in blocked_syscalls, "ptrace should be blocked by seccomp" + assert hasattr(settings, "nsjail_binary") + assert settings.nsjail_binary == "nsjail" diff --git a/tests/integration/test_librechat_compat.py b/tests/integration/test_librechat_compat.py index 
3a474a0..1b9f502 100644 --- a/tests/integration/test_librechat_compat.py +++ b/tests/integration/test_librechat_compat.py @@ -17,14 +17,16 @@ import pytest from fastapi.testclient import TestClient -from unittest.mock import AsyncMock, patch +from unittest.mock import AsyncMock, MagicMock, patch from datetime import datetime, timezone, timedelta +import concurrent.futures import io import json from src.main import app from src.models.exec import ExecResponse, FileRef from src.models.files import FileInfo +from src.models.session import Session, SessionStatus @pytest.fixture @@ -201,8 +203,6 @@ class TestLibreChatExecResponse: - stdout: string (required) - stderr: string (required) - files?: Array<{id, name, path?}> - - Additional fields (has_state, state_size, state_hash) are allowed and ignored. """ @patch("src.services.orchestrator.ExecutionOrchestrator.execute") @@ -211,8 +211,6 @@ def test_response_has_required_fields(self, mock_execute, client, auth_headers): Test LibreChat response has required fields: session_id, files, stdout, stderr. LibreChat reads these 4 fields from the response (from @librechat/agents ExecuteResult type). - Additional fields (like has_state, state_size, state_hash for Python) are allowed - and will be ignored by LibreChat. 
""" mock_execute.return_value = ExecResponse( session_id="resp-session-123", stdout="test output\n", stderr="", files=[] @@ -353,10 +351,22 @@ def setup_mocks(self): """Set up mocks.""" mock_file_service = AsyncMock() mock_file_service.store_uploaded_file.return_value = "lc-file-123" + mock_file_service.validate_uploads = MagicMock(return_value=None) + + mock_session_service = AsyncMock() + mock_session_service.create_session.return_value = Session( + session_id="upload-session-123", + status=SessionStatus.ACTIVE, + created_at=datetime.now(timezone.utc), + last_activity=datetime.now(timezone.utc), + expires_at=datetime.now(timezone.utc) + timedelta(hours=24), + metadata={}, + ) - from src.dependencies.services import get_file_service + from src.dependencies.services import get_file_service, get_session_service app.dependency_overrides[get_file_service] = lambda: mock_file_service + app.dependency_overrides[get_session_service] = lambda: mock_session_service yield @@ -481,6 +491,16 @@ def test_files_endpoint_with_detail_summary(self, client, auth_headers): assert response.status_code == 200 data = response.json() assert isinstance(data, list) + assert len(data) == 1 + item = data[0] + assert "name" in item, "Summary must have 'name' field" + assert "lastModified" in item, "Summary must have 'lastModified' field" + # LibreChat parses name with: file.name.startsWith(path) where path = "session_id/fileId" + assert item["name"] == "test-session-123/file-123", \ + f"name must be 'session_id/fileId' format, got: {item['name']}" + # lastModified must be ISO 8601 with Z suffix for LibreChat's Date parsing + assert item["lastModified"].endswith("Z"), \ + f"lastModified must end with 'Z', got: {item['lastModified']}" def test_files_endpoint_with_detail_full(self, client, auth_headers): """ @@ -513,22 +533,25 @@ def test_download_endpoint(self, client, auth_headers): Test GET /download/{session_id}/{fileId} endpoint. LibreChat downloads generated files using this endpoint. 
- From crud.js: GET /download/{session_id}/{fileId} + From crud.js: GET /download/{session_id}/{fileId} with responseType: 'arraybuffer' """ - # Mock file service to return file content - self.mock_file_service.get_file.return_value = ( - io.BytesIO(b"file content here"), - "output.txt", - "text/plain", + self.mock_file_service.get_file_info.return_value = FileInfo( + file_id="file-abc", + filename="output.txt", + size=17, + content_type="text/plain", + created_at=datetime.now(timezone.utc), + path="/output.txt", ) + self.mock_file_service.get_file_content.return_value = b"file content here" response = client.get( "/download/test-session-789/file-abc", headers=auth_headers ) - # Should return file content or appropriate response - # Note: Actual status depends on whether file exists in mock - assert response.status_code in [200, 404] + assert response.status_code == 200 + assert response.content == b"file content here" + assert "content-disposition" in response.headers # ============================================================================= @@ -543,16 +566,21 @@ class TestLibreChatAuthentication: From CodeExecutor.ts: headers: { 'X-API-Key': apiKey } """ - def test_x_api_key_header(self, client): + @patch("src.services.orchestrator.ExecutionOrchestrator.execute") + def test_x_api_key_header(self, mock_execute, client): """ - Test x-api-key header authentication. + Test x-api-key header authentication on protected endpoint. 
LibreChat sends: headers: { 'X-API-Key': apiKey } """ + mock_execute.return_value = ExecResponse( + session_id="auth-test", stdout="ok\n", stderr="", files=[] + ) headers = {"x-api-key": "test-api-key-for-testing-12345"} - # Just check auth doesn't fail - response = client.get("/health", headers=headers) + response = client.post( + "/exec", json={"code": "print('ok')", "lang": "py"}, headers=headers + ) assert response.status_code == 200 @@ -635,3 +663,571 @@ def test_timeout_returns_200(self, mock_execute, client, auth_headers): # Should return 200 even for timeout assert response.status_code == 200 + + +# ============================================================================= +# LIBRECHAT FILE LIFECYCLE +# ============================================================================= + + +class TestLibreChatFileLifecycle: + """Test the complete file lifecycle as LibreChat performs it. + + Full flow: + 1. Upload file via POST /upload (with 'file' singular field) + 2. Execute code referencing the uploaded file + 3. Check output files via GET /files/{session_id}?detail=summary + 4. 
Download output file via GET /download/{session_id}/{fileId} + """ + + @pytest.fixture(autouse=True) + def setup_mocks(self): + """Set up mocks for full lifecycle tests.""" + self.mock_file_service = AsyncMock() + self.mock_file_service.store_uploaded_file.return_value = "uploaded-file-001" + self.mock_file_service.validate_uploads = MagicMock(return_value=None) + + self.mock_session_service = AsyncMock() + self.mock_session_service.create_session.return_value = Session( + session_id="lifecycle-session-123", + status=SessionStatus.ACTIVE, + created_at=datetime.now(timezone.utc), + last_activity=datetime.now(timezone.utc), + expires_at=datetime.now(timezone.utc) + timedelta(hours=24), + metadata={}, + ) + + from src.dependencies.services import get_file_service, get_session_service + + app.dependency_overrides[get_file_service] = lambda: self.mock_file_service + app.dependency_overrides[get_session_service] = lambda: self.mock_session_service + + yield + + app.dependency_overrides.clear() + + def test_upload_then_check_summary(self, client, auth_headers): + """ + Test upload a file, then verify it appears in session file summary. 
+ + This is the primeFiles check: upload -> GET /files/{session_id}?detail=summary + """ + # Step 1: Upload file (LibreChat uses 'file' singular) + upload_files = {"file": ("data.csv", io.BytesIO(b"col1,col2\n1,2\n"), "text/csv")} + upload_data = {"entity_id": "asst_test_agent"} + + upload_response = client.post( + "/upload", files=upload_files, data=upload_data, headers=auth_headers + ) + assert upload_response.status_code == 200 + upload_result = upload_response.json() + assert upload_result["message"] == "success" + session_id = upload_result["session_id"] + file_id = upload_result["files"][0]["fileId"] + + # Step 2: Check summary endpoint + self.mock_file_service.list_files.return_value = [ + FileInfo( + file_id=file_id, + filename="data.csv", + size=14, + content_type="text/csv", + created_at=datetime.now(timezone.utc), + path="/data.csv", + ) + ] + + summary_response = client.get( + f"/files/{session_id}?detail=summary", headers=auth_headers + ) + assert summary_response.status_code == 200 + summary_data = summary_response.json() + assert isinstance(summary_data, list) + assert len(summary_data) >= 1 + + # Verify format matches what LibreChat's process.js parses + item = summary_data[0] + assert "name" in item + assert "lastModified" in item + # name must be in "session_id/fileId" format + assert "/" in item["name"], "name must contain '/' separator" + + @patch("src.services.orchestrator.ExecutionOrchestrator.execute") + def test_upload_then_exec_with_file_ref(self, mock_execute, client, auth_headers): + """ + Test upload a file, then execute code that references it. + + LibreChat sends the session_id and fileId from upload response in exec request. 
+ """ + # Step 1: Upload + upload_files = {"file": ("input.txt", io.BytesIO(b"hello world"), "text/plain")} + upload_response = client.post( + "/upload", files=upload_files, data={"entity_id": "asst_test"}, headers=auth_headers + ) + assert upload_response.status_code == 200 + upload_result = upload_response.json() + session_id = upload_result["session_id"] + file_id = upload_result["files"][0]["fileId"] + + # Step 2: Execute with file reference + mock_execute.return_value = ExecResponse( + session_id=session_id, + stdout="hello world\n", + stderr="", + files=[], + ) + + exec_response = client.post( + "/exec", + json={ + "code": "with open('/mnt/data/input.txt') as f: print(f.read())", + "lang": "py", + "files": [{"id": file_id, "session_id": session_id, "name": "input.txt"}], + }, + headers=auth_headers, + ) + assert exec_response.status_code == 200 + exec_data = exec_response.json() + assert exec_data["session_id"] == session_id + assert exec_data["stdout"] == "hello world\n" + + def test_download_output_file(self, client, auth_headers): + """ + Test downloading an output file as LibreChat does. 
+ + LibreChat calls: GET /download/{session_id}/{fileId} with responseType: 'arraybuffer' + From crud.js: axios({ method: 'get', url, responseType: 'arraybuffer' }) + """ + session_id = "lifecycle-session-123" + file_id = "output-file-456" + file_content = b"\x89PNG\r\n\x1a\n fake image content" + + self.mock_file_service.get_file_info.return_value = FileInfo( + file_id=file_id, + filename="chart.png", + size=len(file_content), + content_type="image/png", + created_at=datetime.now(timezone.utc), + path="/chart.png", + ) + self.mock_file_service.get_file_content.return_value = file_content + + response = client.get( + f"/download/{session_id}/{file_id}", headers=auth_headers + ) + + assert response.status_code == 200 + assert response.content == file_content + assert "content-disposition" in response.headers + + def test_librechat_user_agent_header(self, client, auth_headers): + """ + Test that User-Agent: LibreChat/1.0 header works correctly. + + LibreChat always sends this header. Verify it doesn't cause issues. + """ + headers = { + **auth_headers, + "User-Agent": "LibreChat/1.0", + "User-Id": "user_abc123", + } + upload_files = {"file": ("test.txt", io.BytesIO(b"test"), "text/plain")} + + response = client.post("/upload", files=upload_files, headers=headers) + assert response.status_code == 200 + + +# ============================================================================= +# LIBRECHAT PRIME FILES FLOW +# ============================================================================= + + +class TestLibreChatPrimeFiles: + """Test the primeFiles() flow from LibreChat's process.js. + + primeFiles() checks if previously uploaded files still exist in the + code interpreter session, and re-uploads them if they've expired. + + Flow: + 1. GET /files/{session_id}?detail=summary + 2. Check response for file by matching name.startsWith("session_id/fileId") + 3. Check if lastModified is less than 23 hours old + 4. 
If missing or expired, re-upload via POST /upload + """ + + @pytest.fixture(autouse=True) + def setup_mocks(self): + """Set up mocks for primeFiles tests.""" + self.mock_file_service = AsyncMock() + self.mock_file_service.validate_uploads = MagicMock(return_value=None) + self.mock_file_service.store_uploaded_file.return_value = "reuploaded-file-001" + + self.mock_session_service = AsyncMock() + self.mock_session_service.create_session.return_value = Session( + session_id="prime-session-123", + status=SessionStatus.ACTIVE, + created_at=datetime.now(timezone.utc), + last_activity=datetime.now(timezone.utc), + expires_at=datetime.now(timezone.utc) + timedelta(hours=24), + metadata={}, + ) + + from src.dependencies.services import get_file_service, get_session_service + + app.dependency_overrides[get_file_service] = lambda: self.mock_file_service + app.dependency_overrides[get_session_service] = lambda: self.mock_session_service + + yield + + app.dependency_overrides.clear() + + def test_prime_files_check_existing(self, client, auth_headers): + """ + Test checking if a file exists via summary endpoint. 
+ + LibreChat calls: GET /files/{session_id}?detail=summary + Then checks: response.data.find(file => file.name.startsWith(path)) + """ + session_id = "prime-session-123" + file_id = "prime-file-456" + + self.mock_file_service.list_files.return_value = [ + FileInfo( + file_id=file_id, + filename="data.csv", + size=100, + content_type="text/csv", + created_at=datetime.now(timezone.utc), + path="/data.csv", + ) + ] + + response = client.get( + f"/files/{session_id}?detail=summary", headers=auth_headers + ) + + assert response.status_code == 200 + data = response.json() + assert isinstance(data, list) + assert len(data) == 1 + + # Simulate LibreChat's client-side parsing: + # file.name.startsWith("session_id/fileId") + file_identifier = f"{session_id}/{file_id}" + matching = [f for f in data if f["name"].startswith(file_identifier)] + assert len(matching) == 1, \ + f"LibreChat expects to find file by name.startsWith('{file_identifier}')" + + def test_prime_files_reupload_flow(self, client, auth_headers): + """ + Test the re-upload flow when file is expired. + + After checking summary, LibreChat re-uploads via POST /upload + if the file is missing or expired (>23 hours old). 
+ """ + session_id = "expired-session-123" + + # Step 1: Summary returns empty (file expired/cleaned up) + self.mock_file_service.list_files.return_value = [] + + response = client.get( + f"/files/{session_id}?detail=summary", headers=auth_headers + ) + assert response.status_code == 200 + data = response.json() + assert data == [], "Empty session should return empty array" + + # Step 2: Re-upload the file (LibreChat uses 'file' singular) + upload_files = {"file": ("data.csv", io.BytesIO(b"col1,col2\n"), "text/csv")} + upload_data = {"entity_id": "asst_reupload_test"} + + upload_response = client.post( + "/upload", files=upload_files, data=upload_data, headers=auth_headers + ) + assert upload_response.status_code == 200 + result = upload_response.json() + assert result["message"] == "success" + assert "session_id" in result + assert len(result["files"]) == 1 + + def test_prime_files_empty_session_returns_empty_array(self, client, auth_headers): + """ + Test that non-existent session returns empty array, not 404. + + LibreChat expects an empty array for sessions with no files. + A 404 would cause an error in primeFiles(). + """ + self.mock_file_service.list_files.return_value = [] + + response = client.get( + "/files/nonexistent-session-xyz?detail=summary", headers=auth_headers + ) + + assert response.status_code == 200 + data = response.json() + assert data == [], "Non-existent session must return [], not 404" + + def test_prime_files_name_format_matches_client_parsing(self, client, auth_headers): + """ + Test that the name field format can be parsed by LibreChat. + + LibreChat splits the fileIdentifier as: + const [path, queryString] = fileIdentifier.split('?') + const [session_id, id] = path.split('/') + + So the name in summary must be "session_id/fileId" format. 
+ """ + session_id = "parse-test-session" + file_id = "parse-test-file" + + self.mock_file_service.list_files.return_value = [ + FileInfo( + file_id=file_id, + filename="result.json", + size=50, + content_type="application/json", + created_at=datetime.now(timezone.utc), + path="/result.json", + ) + ] + + response = client.get( + f"/files/{session_id}?detail=summary", headers=auth_headers + ) + + assert response.status_code == 200 + data = response.json() + assert len(data) == 1 + + name = data[0]["name"] + # Simulate LibreChat's parsing + parts = name.split("/") + assert len(parts) == 2, f"name must have exactly 2 parts split by '/', got: {name}" + parsed_session_id, parsed_file_id = parts + assert parsed_session_id == session_id, \ + f"First part must be session_id '{session_id}', got: '{parsed_session_id}'" + assert parsed_file_id == file_id, \ + f"Second part must be file_id '{file_id}', got: '{parsed_file_id}'" + + def test_prime_files_last_modified_is_parseable_date(self, client, auth_headers): + """ + Test that lastModified is a parseable date string. + + LibreChat uses: checkIfActive(dateString) which creates new Date(dateString). + The date must be valid JavaScript Date-parseable ISO 8601 format. 
+ """ + self.mock_file_service.list_files.return_value = [ + FileInfo( + file_id="date-test-file", + filename="test.txt", + size=10, + content_type="text/plain", + created_at=datetime.now(timezone.utc), + path="/test.txt", + ) + ] + + response = client.get( + "/files/date-test-session?detail=summary", headers=auth_headers + ) + + assert response.status_code == 200 + data = response.json() + last_modified = data[0]["lastModified"] + + # Must be parseable as ISO 8601 datetime + parsed = datetime.fromisoformat(last_modified.replace("Z", "+00:00")) + assert parsed is not None, "lastModified must be parseable ISO 8601" + # Must end with Z (UTC) for JavaScript Date compatibility + assert last_modified.endswith("Z"), \ + f"lastModified must end with 'Z' for JS Date parsing, got: {last_modified}" + + +# ============================================================================= +# LIBRECHAT CONCURRENCY AND HEADERS +# ============================================================================= + + +class TestLibreChatConcurrency: + """Test rapid sequential access patterns that LibreChat may produce. + + LibreChat can fire multiple tool calls in parallel, leading to + multiple exec requests that reference the same session or files. + TestClient is not thread-safe, so we test rapid sequential requests. + """ + + @patch("src.services.orchestrator.ExecutionOrchestrator.execute") + def test_rapid_exec_requests(self, mock_execute, client, auth_headers): + """ + Test multiple rapid exec requests can be processed without errors. + + LibreChat may send parallel tool calls that execute code simultaneously. + Each should get a valid response. 
+ """ + mock_execute.return_value = ExecResponse( + session_id="concurrent-session", stdout="ok\n", stderr="", files=[] + ) + + responses = [] + for i in range(5): + resp = client.post( + "/exec", + json={"code": f"print({i})", "lang": "py"}, + headers=auth_headers, + ) + responses.append(resp) + + # All requests should succeed + for resp in responses: + assert resp.status_code == 200 + data = resp.json() + assert "session_id" in data + assert "stdout" in data + assert "stderr" in data + + @patch("src.services.orchestrator.ExecutionOrchestrator.execute") + def test_rapid_exec_same_session(self, mock_execute, client, auth_headers): + """ + Test rapid exec requests referencing the same session_id. + + LibreChat may have multiple requests accessing the same session. + """ + session_id = "shared-session-123" + mock_execute.return_value = ExecResponse( + session_id=session_id, stdout="result\n", stderr="", files=[] + ) + + responses = [] + for i in range(3): + resp = client.post( + "/exec", + json={ + "code": f"x = {i}", + "lang": "py", + "session_id": session_id, + }, + headers=auth_headers, + ) + responses.append(resp) + + for resp in responses: + assert resp.status_code == 200 + + +class TestLibreChatFullHeaders: + """Test that the full set of headers LibreChat sends work correctly. + + LibreChat sends various headers depending on the operation. + These tests verify none of them cause issues. + """ + + @patch("src.services.orchestrator.ExecutionOrchestrator.execute") + def test_exec_with_full_librechat_headers(self, mock_execute, client): + """ + Test exec request with all headers LibreChat sends. 
+ + From CodeExecutor.ts and crud.js, LibreChat sends: + - X-API-Key: apiKey + - User-Agent: LibreChat/1.0 + - Content-Type: application/json + """ + mock_execute.return_value = ExecResponse( + session_id="header-test", stdout="ok\n", stderr="", files=[] + ) + + headers = { + "X-API-Key": "test-api-key-for-testing-12345", + "User-Agent": "LibreChat/1.0", + "Content-Type": "application/json", + } + + response = client.post( + "/exec", + json={"code": "print('ok')", "lang": "py"}, + headers=headers, + ) + assert response.status_code == 200 + + def test_upload_with_full_librechat_headers(self, client): + """ + Test upload request with all headers LibreChat sends. + + From crud.js, LibreChat sends: + - X-API-Key: apiKey + - User-Agent: LibreChat/1.0 + - User-Id: req.user.id + - Content-Type: multipart/form-data (set by form) + """ + mock_file_service = AsyncMock() + mock_file_service.store_uploaded_file.return_value = "header-test-file" + mock_file_service.validate_uploads = MagicMock(return_value=None) + + mock_session_service = AsyncMock() + mock_session_service.create_session.return_value = Session( + session_id="header-test-session", + status=SessionStatus.ACTIVE, + created_at=datetime.now(timezone.utc), + last_activity=datetime.now(timezone.utc), + expires_at=datetime.now(timezone.utc) + timedelta(hours=24), + metadata={}, + ) + + from src.dependencies.services import get_file_service, get_session_service + + app.dependency_overrides[get_file_service] = lambda: mock_file_service + app.dependency_overrides[get_session_service] = lambda: mock_session_service + + try: + headers = { + "X-API-Key": "test-api-key-for-testing-12345", + "User-Agent": "LibreChat/1.0", + "User-Id": "user_header_test", + } + + upload_files = {"file": ("test.txt", io.BytesIO(b"content"), "text/plain")} + upload_data = {"entity_id": "asst_header_test"} + + response = client.post( + "/upload", files=upload_files, data=upload_data, headers=headers + ) + assert response.status_code == 200 + 
result = response.json() + assert result["message"] == "success" + finally: + app.dependency_overrides.clear() + + def test_download_with_full_librechat_headers(self, client): + """ + Test download request with LibreChat headers. + + From crud.js: headers: { 'User-Agent': 'LibreChat/1.0', 'X-API-Key': apiKey } + Timeout: 15000ms + """ + mock_file_service = AsyncMock() + mock_file_service.get_file_info.return_value = FileInfo( + file_id="dl-file", + filename="output.txt", + size=5, + content_type="text/plain", + created_at=datetime.now(timezone.utc), + path="/output.txt", + ) + mock_file_service.get_file_content.return_value = b"hello" + + from src.dependencies.services import get_file_service + + app.dependency_overrides[get_file_service] = lambda: mock_file_service + + try: + headers = { + "X-API-Key": "test-api-key-for-testing-12345", + "User-Agent": "LibreChat/1.0", + } + + response = client.get( + "/download/dl-session/dl-file", headers=headers + ) + assert response.status_code == 200 + assert response.content == b"hello" + finally: + app.dependency_overrides.clear() diff --git a/tests/integration/test_new_features.py b/tests/integration/test_new_features.py index b22bda1..9a52fc9 100644 --- a/tests/integration/test_new_features.py +++ b/tests/integration/test_new_features.py @@ -64,33 +64,6 @@ def test_file_info_state_fields_optional(self): assert file_info.last_used_at is None -class TestRequestFileRestoreState: - """Tests for Issue 3: RequestFile model includes restore_state field.""" - - def test_request_file_has_restore_state_field(self): - """Test that RequestFile model includes restore_state field.""" - from src.models.exec import RequestFile - - file_ref = RequestFile( - id="file-123", - session_id="session-456", - name="data.txt", - restore_state=True, - ) - assert file_ref.restore_state is True - - def test_request_file_restore_state_defaults_false(self): - """Test that restore_state defaults to False.""" - from src.models.exec import RequestFile - - 
file_ref = RequestFile( - id="file-123", - session_id="session-456", - name="data.txt", - ) - assert file_ref.restore_state is False - - class TestExecuteCodeRequestArgs: """Tests for Issue 2: ExecuteCodeRequest model includes args field.""" @@ -345,7 +318,6 @@ class TestUploadedFileStateRestoration: Uploaded files should share the same behavior as generated files: - After first use in execution, they get a state_hash - - On subsequent use with restore_state=true, that state is restored """ def test_uploaded_file_no_initial_state_hash(self): @@ -428,51 +400,3 @@ async def test_update_file_state_hash_works_for_uploaded_files(self): assert mapping["execution_id"] == "exec-abc" assert "last_used_at" in mapping - def test_restore_state_flag_works_with_state_hash(self): - """Test that RequestFile with restore_state=True works when file has state_hash.""" - from src.models.exec import RequestFile - - # Uploaded file reference with restore_state flag - file_ref = RequestFile( - id="uploaded-file-123", - session_id="session-456", - name="data.csv", - restore_state=True, # Request state restoration - ) - assert file_ref.restore_state is True - - def test_restore_state_requires_state_hash_to_be_set(self): - """Test that state restoration requires file to have state_hash. - - This documents expected behavior: if an uploaded file hasn't been used - yet (no state_hash), restore_state=True is effectively ignored until - the file is used in an execution. 
- """ - # File with no state_hash (never used in execution) - file_info_no_state = FileInfo( - file_id="uploaded-file-123", - filename="data.csv", - size=1024, - content_type="text/csv", - created_at=datetime.now(timezone.utc), - path="/data.csv", - ) - - # The mount logic checks: file_info.state_hash is truthy - # For uploaded files that haven't been used, this will be None/False - can_restore = bool(file_info_no_state.state_hash) - assert can_restore is False - - # After first use, file has state_hash - file_info_with_state = FileInfo( - file_id="uploaded-file-123", - filename="data.csv", - size=1024, - content_type="text/csv", - created_at=datetime.now(timezone.utc), - path="/data.csv", - state_hash="abc123def456", - ) - - can_restore_now = bool(file_info_with_state.state_hash) - assert can_restore_now is True diff --git a/tests/integration/test_security_integration.py b/tests/integration/test_security_integration.py index eccbe21..a62ee74 100644 --- a/tests/integration/test_security_integration.py +++ b/tests/integration/test_security_integration.py @@ -81,13 +81,12 @@ def test_cors_headers_present(self, client): 405, ] # Either allowed or method not allowed - def test_authorization_header_fallback(self, client): - """Test that Authorization header works as fallback for API key.""" - # Use the test API key from conftest + def test_authorization_header_not_accepted(self, client): + """Test that Authorization header is not accepted for authentication.""" headers = {"Authorization": "Bearer test-api-key-for-testing-12345"} response = client.get("/sessions", headers=headers) - # Should not be 401 (auth failure) - assert response.status_code != 401 + # Only x-api-key header is accepted + assert response.status_code == 401 def test_request_size_limit(self, client): """Test request size limiting.""" @@ -111,21 +110,21 @@ def test_invalid_content_type(self, client): response = client.post("/sessions", data="", headers=headers) assert response.status_code == 415 # 
Unsupported Media Type - def test_multiple_auth_methods(self, client): - """Test that multiple authentication methods work.""" + def test_only_x_api_key_accepted(self, client): + """Test that only x-api-key header is accepted for authentication.""" test_key = "test-api-key-for-testing-12345" - # Test x-api-key header + # Test x-api-key header - should work response1 = client.get("/sessions", headers={"x-api-key": test_key}) - # Test Authorization Bearer header + # Test Authorization Bearer header - should be rejected response2 = client.get( "/sessions", headers={"Authorization": f"Bearer {test_key}"} ) - # Both should have same result (not 401) - assert response1.status_code == response2.status_code + # x-api-key should work, Bearer should be rejected assert response1.status_code != 401 + assert response2.status_code == 401 def test_case_insensitive_headers(self, client): """Test that header names are case insensitive.""" diff --git a/tests/integration/test_state_api.py b/tests/integration/test_state_api.py index 74c0af8..a9b7068 100644 --- a/tests/integration/test_state_api.py +++ b/tests/integration/test_state_api.py @@ -222,34 +222,3 @@ def test_delete_nonexistent_still_returns_204( assert response.status_code == 204 -class TestExecResponseStateFields: - """Tests for state fields in /exec response.""" - - def test_exec_response_includes_state_fields_for_python(self, client, auth_headers): - """Test that Python execution response includes state fields.""" - # This is a more complex integration test that requires full stack - # For now, we test the model structure - from src.models.exec import ExecResponse - - response = ExecResponse( - session_id="test-session", - stdout="output", - stderr="", - has_state=True, - state_size=1024, - state_hash="abc123", - ) - - assert response.has_state is True - assert response.state_size == 1024 - assert response.state_hash == "abc123" - - def test_exec_response_defaults_state_fields(self): - """Test that state fields have 
correct defaults.""" - from src.models.exec import ExecResponse - - response = ExecResponse(session_id="test-session", stdout="", stderr="") - - assert response.has_state is False - assert response.state_size is None - assert response.state_hash is None diff --git a/tests/unit/test_nsjail_config.py b/tests/unit/test_nsjail_config.py new file mode 100644 index 0000000..4108858 --- /dev/null +++ b/tests/unit/test_nsjail_config.py @@ -0,0 +1,271 @@ +"""Unit tests for NsjailConfig builder and SandboxInfo dataclass.""" + +import pytest +from pathlib import Path +from datetime import datetime + +from src.services.sandbox.nsjail import NsjailConfig, SandboxInfo + + +class TestNsjailConfigBuildArgs: + """Test NsjailConfig.build_args() generates correct nsjail CLI arguments.""" + + def test_basic_python_args(self): + """Test basic argument generation for Python.""" + config = NsjailConfig() + args = config.build_args( + sandbox_dir="/tmp/sandbox/data", + command=["python3", "code.py"], + language="py", + timeout=30, + ) + assert "--mode" in args + assert "o" in args + assert "--cwd" in args + assert "/mnt/data" in args + assert "python3" in args + assert "code.py" in args + + def test_network_disabled_by_default(self): + """Test that network namespace is created by default (no network access).""" + config = NsjailConfig() + args = config.build_args( + sandbox_dir="/tmp/sandbox/data", + command=["python3", "code.py"], + language="py", + ) + # Network isolation is on by default (iface_no_lo disables loopback) + assert "--iface_no_lo" in args + # Should NOT have --disable_clone_newnet + assert "--disable_clone_newnet" not in args + + def test_network_enabled(self): + """Test network access when enabled (disable network namespace).""" + config = NsjailConfig() + args = config.build_args( + sandbox_dir="/tmp/sandbox/data", + command=["python3", "code.py"], + language="py", + network=True, + ) + # When network=True, network namespace is disabled + assert "--disable_clone_newnet" 
in args + assert "--iface_no_lo" not in args + + def test_timeout_set(self): + """Test timeout is passed correctly.""" + config = NsjailConfig() + args = config.build_args( + sandbox_dir="/tmp/sandbox/data", + command=["python3", "code.py"], + language="py", + timeout=60, + ) + assert "--time_limit" in args + idx = args.index("--time_limit") + assert args[idx + 1] == "60" + + def test_repl_mode_timeout_zero(self): + """Test REPL mode sets timeout to 0 and enables skip_setsid.""" + config = NsjailConfig() + args = config.build_args( + sandbox_dir="/tmp/sandbox/data", + command=["python3", "/opt/repl_server.py"], + language="py", + repl_mode=True, + ) + assert "--time_limit" in args + idx = args.index("--time_limit") + assert args[idx + 1] == "0" + assert "--skip_setsid" in args + + def test_different_languages(self): + """Test args generation for different languages.""" + config = NsjailConfig() + for lang in ["py", "js", "go", "java", "c", "cpp", "rs"]: + args = config.build_args( + sandbox_dir="/tmp/sandbox/data", + command=["echo", "test"], + language=lang, + ) + assert len(args) > 0 + assert "--mode" in args + assert "echo" in args + assert "test" in args + + def test_capabilities_dropped_by_default(self): + """Test capabilities are dropped (no --keep_caps flag).""" + config = NsjailConfig() + args = config.build_args( + sandbox_dir="/tmp/sandbox/data", + command=["echo", "test"], + language="py", + ) + # nsjail drops all caps by default. --keep_caps would KEEP them. 
+ assert "--keep_caps" not in args + + def test_user_namespace_disabled(self): + """Test user namespace is disabled (Docker compatibility).""" + config = NsjailConfig() + args = config.build_args( + sandbox_dir="/tmp/sandbox/data", + command=["echo", "test"], + language="py", + ) + assert "--disable_clone_newuser" in args + + def test_mount_namespace_disabled(self): + """Test mount namespace is disabled (executor handles /mnt/data via unshare).""" + config = NsjailConfig() + args = config.build_args( + sandbox_dir="/tmp/sandbox/data", + command=["echo", "test"], + language="py", + ) + assert "--disable_clone_newns" in args + + def test_hostname_set_to_sandbox(self): + """Test hostname is set to 'sandbox'.""" + config = NsjailConfig() + args = config.build_args( + sandbox_dir="/tmp/sandbox/data", + command=["echo", "test"], + language="py", + ) + assert "--hostname" in args + idx = args.index("--hostname") + assert args[idx + 1] == "sandbox" + + def test_proc_disabled(self): + """Test proc is disabled.""" + config = NsjailConfig() + args = config.build_args( + sandbox_dir="/tmp/sandbox/data", + command=["echo", "test"], + language="py", + ) + assert "--disable_proc" in args + + def test_command_separator(self): + """Test command separator '--' is present before the command.""" + config = NsjailConfig() + args = config.build_args( + sandbox_dir="/tmp/sandbox/data", + command=["python3", "code.py"], + language="py", + ) + assert "--" in args + separator_idx = args.index("--") + assert args[separator_idx + 1] == "python3" + assert args[separator_idx + 2] == "code.py" + + def test_env_vars_passed(self): + """Test environment variables are passed correctly.""" + config = NsjailConfig() + args = config.build_args( + sandbox_dir="/tmp/sandbox/data", + command=["echo", "test"], + language="py", + env={"MY_VAR": "my_value", "ANOTHER": "val2"}, + ) + assert "--env" in args + env_indices = [i for i, a in enumerate(args) if a == "--env"] + env_values = [args[i + 1] for i in 
env_indices] + assert "MY_VAR=my_value" in env_values + assert "ANOTHER=val2" in env_values + + def test_user_id_set(self): + """Test user and group IDs are set.""" + config = NsjailConfig() + args = config.build_args( + sandbox_dir="/tmp/sandbox/data", + command=["echo", "test"], + language="py", + ) + assert "--user" in args + assert "--group" in args + + def test_cwd_is_mnt_data(self): + """Test working directory is /mnt/data.""" + config = NsjailConfig() + args = config.build_args( + sandbox_dir="/tmp/sandbox/data", + command=["echo", "test"], + language="py", + ) + idx = args.index("--cwd") + assert args[idx + 1] == "/mnt/data" + + +class TestSandboxInfo: + """Test SandboxInfo dataclass.""" + + def test_id_property(self): + """Test id property returns sandbox_id.""" + info = SandboxInfo( + sandbox_id="abc123", + sandbox_dir=Path("/tmp/abc"), + data_dir=Path("/tmp/abc/data"), + language="py", + session_id="sess1", + created_at=datetime.utcnow(), + ) + assert info.id == "abc123" + + def test_default_values(self): + """Test default values are set correctly.""" + info = SandboxInfo( + sandbox_id="abc", + sandbox_dir=Path("/tmp/abc"), + data_dir=Path("/tmp/abc/data"), + language="py", + session_id="s1", + created_at=datetime.utcnow(), + ) + assert info.repl_mode is False + assert info.labels == {} + + def test_repl_mode_set(self): + """Test repl_mode can be set.""" + info = SandboxInfo( + sandbox_id="abc", + sandbox_dir=Path("/tmp/abc"), + data_dir=Path("/tmp/abc/data"), + language="py", + session_id="s1", + created_at=datetime.utcnow(), + repl_mode=True, + ) + assert info.repl_mode is True + + def test_labels_set(self): + """Test labels can be set.""" + labels = {"key1": "val1", "key2": "val2"} + info = SandboxInfo( + sandbox_id="abc", + sandbox_dir=Path("/tmp/abc"), + data_dir=Path("/tmp/abc/data"), + language="py", + session_id="s1", + created_at=datetime.utcnow(), + labels=labels, + ) + assert info.labels == labels + + def test_fields_stored(self): + """Test 
all fields are stored correctly.""" + now = datetime.utcnow() + info = SandboxInfo( + sandbox_id="sandbox-xyz", + sandbox_dir=Path("/var/sandboxes/xyz"), + data_dir=Path("/var/sandboxes/xyz/data"), + language="go", + session_id="session-456", + created_at=now, + ) + assert info.sandbox_id == "sandbox-xyz" + assert info.sandbox_dir == Path("/var/sandboxes/xyz") + assert info.data_dir == Path("/var/sandboxes/xyz/data") + assert info.language == "go" + assert info.session_id == "session-456" + assert info.created_at == now diff --git a/tests/unit/test_orchestrator.py b/tests/unit/test_orchestrator.py index f364f8d..e6de994 100644 --- a/tests/unit/test_orchestrator.py +++ b/tests/unit/test_orchestrator.py @@ -280,10 +280,10 @@ class TestExplicitFileMounting: """Tests for explicit file mounting behavior.""" @pytest.mark.asyncio - async def test_explicit_mount_with_restore_state( + async def test_explicit_mount_files( self, orchestrator, mock_file_service ): - """Explicit mount should handle restore_state flag.""" + """Explicit mount should mount requested files.""" from src.models.exec import RequestFile mock_file_service.get_file_info = AsyncMock( @@ -294,7 +294,6 @@ async def test_explicit_mount_with_restore_state( content_type="text/csv", created_at=datetime.now(), path="/mnt/data/data.csv", - state_hash="abc123", ) ) @@ -306,7 +305,6 @@ async def test_explicit_mount_with_restore_state( id="file-1", session_id="test-session", name="data.csv", - restore_state=True, ), ], ) @@ -316,19 +314,11 @@ async def test_explicit_mount_with_restore_state( session_id="test-session", ) - # Mock the state loading - with patch.object( - orchestrator, "_load_state_by_hash", new_callable=AsyncMock - ) as mock_load: - with patch("src.services.orchestrator.settings") as mock_settings: - mock_settings.state_persistence_enabled = True - - result = await orchestrator._mount_explicit_files(ctx) - - # Verify state loading was triggered - mock_load.assert_called_once_with(ctx, "abc123") + 
result = await orchestrator._mount_explicit_files(ctx) assert len(result) == 1 + assert result[0]["file_id"] == "file-1" + assert result[0]["filename"] == "data.csv" @pytest.mark.asyncio async def test_explicit_mount_fallback_to_name_lookup( diff --git a/tests/unit/test_sandbox_executor.py b/tests/unit/test_sandbox_executor.py new file mode 100644 index 0000000..cabe9f1 --- /dev/null +++ b/tests/unit/test_sandbox_executor.py @@ -0,0 +1,223 @@ +"""Unit tests for SandboxExecutor.""" + +import pytest +from unittest.mock import patch, AsyncMock, MagicMock + +from src.services.sandbox.executor import SandboxExecutor +from src.services.sandbox.nsjail import NsjailConfig + + +class TestBuildSanitizedEnv: + """Test _build_sanitized_env method.""" + + def test_python_env(self): + """Test sanitized env for Python.""" + config = NsjailConfig() + executor = SandboxExecutor(config) + env = executor._build_sanitized_env("py") + assert "PYTHONUNBUFFERED" in env + assert env["PYTHONUNBUFFERED"] == "1" + assert "PATH" in env + assert "PYTHONDONTWRITEBYTECODE" in env + assert "PYTHONPATH" in env + assert "MPLBACKEND" in env + + def test_javascript_env(self): + """Test sanitized env for JavaScript.""" + config = NsjailConfig() + executor = SandboxExecutor(config) + env = executor._build_sanitized_env("js") + assert "NODE_PATH" in env + assert "PATH" in env + + def test_typescript_env(self): + """Test sanitized env for TypeScript.""" + config = NsjailConfig() + executor = SandboxExecutor(config) + env = executor._build_sanitized_env("ts") + assert "NODE_PATH" in env + + def test_go_env(self): + """Test sanitized env for Go.""" + config = NsjailConfig() + executor = SandboxExecutor(config) + env = executor._build_sanitized_env("go") + assert "GO111MODULE" in env + assert "GOCACHE" in env + + def test_java_env(self): + """Test sanitized env for Java.""" + config = NsjailConfig() + executor = SandboxExecutor(config) + env = executor._build_sanitized_env("java") + assert "CLASSPATH" in 
env + assert "JAVA_OPTS" in env + + def test_c_env(self): + """Test sanitized env for C.""" + config = NsjailConfig() + executor = SandboxExecutor(config) + env = executor._build_sanitized_env("c") + assert "CC" in env + assert env["CC"] == "gcc" + + def test_cpp_env(self): + """Test sanitized env for C++.""" + config = NsjailConfig() + executor = SandboxExecutor(config) + env = executor._build_sanitized_env("cpp") + assert "CXX" in env + assert env["CXX"] == "g++" + + def test_rust_env(self): + """Test sanitized env for Rust.""" + config = NsjailConfig() + executor = SandboxExecutor(config) + env = executor._build_sanitized_env("rs") + assert "CARGO_HOME" in env + assert "RUSTUP_HOME" in env + + def test_php_env(self): + """Test sanitized env for PHP.""" + config = NsjailConfig() + executor = SandboxExecutor(config) + env = executor._build_sanitized_env("php") + assert "PHP_INI_SCAN_DIR" in env + + def test_r_env(self): + """Test sanitized env for R.""" + config = NsjailConfig() + executor = SandboxExecutor(config) + env = executor._build_sanitized_env("r") + assert "R_LIBS_USER" in env + + def test_fortran_env(self): + """Test sanitized env for Fortran.""" + config = NsjailConfig() + executor = SandboxExecutor(config) + env = executor._build_sanitized_env("f90") + assert "FC" in env + assert env["FC"] == "gfortran" + + def test_unknown_language_has_base_env(self): + """Test unknown language gets base env only.""" + config = NsjailConfig() + executor = SandboxExecutor(config) + env = executor._build_sanitized_env("unknown") + assert "PATH" in env + assert "HOME" in env + assert "TMPDIR" in env + # Should not have language-specific vars + assert "PYTHONUNBUFFERED" not in env + assert "NODE_PATH" not in env + + def test_none_language_has_base_env(self): + """Test None language gets base env only.""" + config = NsjailConfig() + executor = SandboxExecutor(config) + env = executor._build_sanitized_env(None) + assert "PATH" in env + assert "HOME" in env + + def 
test_base_env_always_present(self): + """Test base env vars are always present.""" + config = NsjailConfig() + executor = SandboxExecutor(config) + for lang in ["py", "js", "go", "java", "c", "cpp", "rs", "r", "f90"]: + env = executor._build_sanitized_env(lang) + assert "HOME" in env + assert env["HOME"] == "/tmp" + assert "TMPDIR" in env + assert env["TMPDIR"] == "/tmp" + + +class TestSanitizeOutput: + """Test _sanitize_output method.""" + + def test_normal_output(self): + """Test normal output is preserved.""" + config = NsjailConfig() + executor = SandboxExecutor(config) + result = executor._sanitize_output(b"hello world\n") + assert result == "hello world\n" + + def test_unicode_output(self): + """Test unicode output is handled.""" + config = NsjailConfig() + executor = SandboxExecutor(config) + result = executor._sanitize_output("hello 世界\n".encode("utf-8")) + assert "hello 世界" in result + + def test_truncates_large_output(self): + """Test large output is truncated.""" + config = NsjailConfig() + executor = SandboxExecutor(config) + large_output = b"x" * (1024 * 1024 + 100) + result = executor._sanitize_output(large_output) + assert "[Output truncated" in result + + def test_strips_control_chars(self): + """Test control characters are stripped.""" + config = NsjailConfig() + executor = SandboxExecutor(config) + result = executor._sanitize_output(b"hello\x00\x01world") + assert "\x00" not in result + assert "\x01" not in result + assert "hello" in result + assert "world" in result + + def test_preserves_newlines(self): + """Test newlines are preserved.""" + config = NsjailConfig() + executor = SandboxExecutor(config) + result = executor._sanitize_output(b"line1\nline2\n") + assert result == "line1\nline2\n" + + def test_preserves_tabs(self): + """Test tabs are preserved.""" + config = NsjailConfig() + executor = SandboxExecutor(config) + result = executor._sanitize_output(b"col1\tcol2\n") + assert "\t" in result + + def test_empty_output(self): + """Test empty 
output.""" + config = NsjailConfig() + executor = SandboxExecutor(config) + result = executor._sanitize_output(b"") + assert result == "" + + def test_invalid_utf8_replaced(self): + """Test invalid UTF-8 bytes are replaced.""" + config = NsjailConfig() + executor = SandboxExecutor(config) + result = executor._sanitize_output(b"hello \xff\xfe world") + # Should not raise, invalid bytes replaced + assert "hello" in result + assert "world" in result + + +class TestEscapeEnvValue: + """Test _escape_env_value method.""" + + def test_simple_value(self): + """Test simple value escaping.""" + config = NsjailConfig() + executor = SandboxExecutor(config) + result = executor._escape_env_value("simple") + assert result == "'simple'" + + def test_value_with_single_quotes(self): + """Test value with single quotes.""" + config = NsjailConfig() + executor = SandboxExecutor(config) + result = executor._escape_env_value("it's") + assert "it" in result + assert "s" in result + + def test_empty_value(self): + """Test empty value.""" + config = NsjailConfig() + executor = SandboxExecutor(config) + result = executor._escape_env_value("") + assert result == "''" diff --git a/tests/unit/test_sandbox_manager.py b/tests/unit/test_sandbox_manager.py new file mode 100644 index 0000000..ef27e36 --- /dev/null +++ b/tests/unit/test_sandbox_manager.py @@ -0,0 +1,322 @@ +"""Unit tests for SandboxManager.""" + +import pytest +from unittest.mock import patch, MagicMock +from pathlib import Path + +from src.services.sandbox.manager import SandboxManager + + +class TestSandboxManagerAvailability: + """Test SandboxManager availability checks.""" + + def test_is_available_when_nsjail_exists(self): + """Test is_available returns True when nsjail binary is found.""" + with patch("shutil.which", return_value="/usr/bin/nsjail"): + with patch.object(SandboxManager, "__init__", lambda self: None): + manager = SandboxManager() + manager._nsjail_config = MagicMock() + manager._executor = MagicMock() + 
manager._base_dir = Path("/tmp/test-sandboxes") + manager._initialization_error = None + assert manager.is_available() is True + + def test_is_not_available_when_nsjail_missing(self): + """Test is_available returns False when nsjail binary is not found.""" + with patch("shutil.which", return_value=None): + with patch.object(SandboxManager, "__init__", lambda self: None): + manager = SandboxManager() + manager._nsjail_config = MagicMock() + manager._executor = MagicMock() + manager._base_dir = Path("/tmp/test-sandboxes") + manager._initialization_error = None + assert manager.is_available() is False + + def test_get_initialization_error_nsjail_missing(self): + """Test error message when nsjail is not available.""" + with patch("shutil.which", return_value=None): + with patch.object(SandboxManager, "__init__", lambda self: None): + manager = SandboxManager() + manager._nsjail_config = MagicMock() + manager._executor = MagicMock() + manager._base_dir = Path("/tmp/test-sandboxes") + manager._initialization_error = None + error = manager.get_initialization_error() + assert error is not None + assert "nsjail" in error.lower() + + def test_get_initialization_error_none_when_available(self): + """Test no error when nsjail is available and init succeeded.""" + with patch("shutil.which", return_value="/usr/bin/nsjail"): + with patch.object(SandboxManager, "__init__", lambda self: None): + manager = SandboxManager() + manager._nsjail_config = MagicMock() + manager._executor = MagicMock() + manager._base_dir = Path("/tmp/test-sandboxes") + manager._initialization_error = None + assert manager.get_initialization_error() is None + + def test_get_initialization_error_from_init(self): + """Test initialization error is reported.""" + with patch("shutil.which", return_value="/usr/bin/nsjail"): + with patch.object(SandboxManager, "__init__", lambda self: None): + manager = SandboxManager() + manager._nsjail_config = MagicMock() + manager._executor = MagicMock() + manager._base_dir = 
Path("/tmp/test-sandboxes") + manager._initialization_error = "Failed to create directory" + assert manager.get_initialization_error() == "Failed to create directory" + + +class TestSandboxLifecycle: + """Test sandbox creation and destruction.""" + + def test_create_sandbox_creates_directory(self, tmp_path): + """Test create_sandbox creates the data directory.""" + with patch("shutil.which", return_value="/usr/bin/nsjail"), \ + patch("os.chown"): + with patch.object(SandboxManager, "__init__", lambda self: None): + manager = SandboxManager() + manager._nsjail_config = MagicMock() + manager._executor = MagicMock() + manager._base_dir = tmp_path + manager._initialization_error = None + + info = manager.create_sandbox("session1", "py") + + assert info.data_dir.exists() + assert info.language == "py" + assert info.session_id == "session1" + + def test_create_sandbox_sets_repl_mode(self, tmp_path): + """Test create_sandbox sets repl_mode correctly.""" + with patch("shutil.which", return_value="/usr/bin/nsjail"), \ + patch("os.chown"): + with patch.object(SandboxManager, "__init__", lambda self: None): + manager = SandboxManager() + manager._nsjail_config = MagicMock() + manager._executor = MagicMock() + manager._base_dir = tmp_path + manager._initialization_error = None + + info = manager.create_sandbox("session1", "py", repl_mode=True) + + assert info.repl_mode is True + + def test_create_sandbox_sets_labels(self, tmp_path): + """Test create_sandbox sets appropriate labels.""" + with patch("shutil.which", return_value="/usr/bin/nsjail"), \ + patch("os.chown"): + with patch.object(SandboxManager, "__init__", lambda self: None): + manager = SandboxManager() + manager._nsjail_config = MagicMock() + manager._executor = MagicMock() + manager._base_dir = tmp_path + manager._initialization_error = None + + info = manager.create_sandbox("session1", "py") + + assert info.labels["com.code-interpreter.managed"] == "true" + assert info.labels["com.code-interpreter.session-id"] == 
"session1" + assert info.labels["com.code-interpreter.language"] == "py" + + def test_create_sandbox_generates_unique_ids(self, tmp_path): + """Test create_sandbox generates unique sandbox IDs.""" + with patch("shutil.which", return_value="/usr/bin/nsjail"), \ + patch("os.chown"): + with patch.object(SandboxManager, "__init__", lambda self: None): + manager = SandboxManager() + manager._nsjail_config = MagicMock() + manager._executor = MagicMock() + manager._base_dir = tmp_path + manager._initialization_error = None + + info1 = manager.create_sandbox("session1", "py") + info2 = manager.create_sandbox("session2", "py") + + assert info1.sandbox_id != info2.sandbox_id + + def test_destroy_sandbox_removes_directory(self, tmp_path): + """Test destroy_sandbox removes the sandbox directory.""" + with patch("shutil.which", return_value="/usr/bin/nsjail"), \ + patch("os.chown"): + with patch.object(SandboxManager, "__init__", lambda self: None): + manager = SandboxManager() + manager._nsjail_config = MagicMock() + manager._executor = MagicMock() + manager._base_dir = tmp_path + manager._initialization_error = None + + info = manager.create_sandbox("session1", "py") + assert info.sandbox_dir.exists() + + result = manager.destroy_sandbox(info) + assert result is True + assert not info.sandbox_dir.exists() + + def test_destroy_sandbox_nonexistent_returns_true(self, tmp_path): + """Test destroying a non-existent sandbox returns True.""" + with patch("shutil.which", return_value="/usr/bin/nsjail"), \ + patch("os.chown"): + with patch.object(SandboxManager, "__init__", lambda self: None): + manager = SandboxManager() + manager._nsjail_config = MagicMock() + manager._executor = MagicMock() + manager._base_dir = tmp_path + manager._initialization_error = None + + from src.services.sandbox.nsjail import SandboxInfo + from datetime import datetime + + info = SandboxInfo( + sandbox_id="nonexistent", + sandbox_dir=tmp_path / "nonexistent", + data_dir=tmp_path / "nonexistent" / "data", 
+ language="py", + session_id="session1", + created_at=datetime.utcnow(), + ) + + result = manager.destroy_sandbox(info) + assert result is True + + +class TestFileOperations: + """Test file copy and retrieval operations.""" + + def test_copy_content_to_sandbox(self, tmp_path): + """Test writing content to a sandbox.""" + with patch("shutil.which", return_value="/usr/bin/nsjail"), \ + patch("os.chown"), \ + patch("os.chmod"): + with patch.object(SandboxManager, "__init__", lambda self: None): + manager = SandboxManager() + manager._nsjail_config = MagicMock() + manager._executor = MagicMock() + manager._base_dir = tmp_path + manager._initialization_error = None + + info = manager.create_sandbox("session1", "py") + result = manager.copy_content_to_sandbox( + info, b"hello world", "/mnt/data/test.txt", "py" + ) + assert result is True + assert (info.data_dir / "test.txt").read_bytes() == b"hello world" + + def test_copy_content_extracts_filename(self, tmp_path): + """Test that copy extracts filename from full path.""" + with patch("shutil.which", return_value="/usr/bin/nsjail"), \ + patch("os.chown"), \ + patch("os.chmod"): + with patch.object(SandboxManager, "__init__", lambda self: None): + manager = SandboxManager() + manager._nsjail_config = MagicMock() + manager._executor = MagicMock() + manager._base_dir = tmp_path + manager._initialization_error = None + + info = manager.create_sandbox("session1", "py") + result = manager.copy_content_to_sandbox( + info, b"data", "/mnt/data/subdir/file.txt", "py" + ) + assert result is True + # Should extract just the filename + assert (info.data_dir / "file.txt").read_bytes() == b"data" + + def test_get_file_content_from_sandbox(self, tmp_path): + """Test reading content from a sandbox.""" + with patch("shutil.which", return_value="/usr/bin/nsjail"), \ + patch("os.chown"), \ + patch("os.chmod"): + with patch.object(SandboxManager, "__init__", lambda self: None): + manager = SandboxManager() + manager._nsjail_config = 
MagicMock() + manager._executor = MagicMock() + manager._base_dir = tmp_path + manager._initialization_error = None + + info = manager.create_sandbox("session1", "py") + (info.data_dir / "output.txt").write_bytes(b"result data") + content = manager.get_file_content_from_sandbox( + info, "/mnt/data/output.txt" + ) + assert content == b"result data" + + def test_get_file_content_not_found(self, tmp_path): + """Test reading non-existent file returns None.""" + with patch("shutil.which", return_value="/usr/bin/nsjail"), \ + patch("os.chown"): + with patch.object(SandboxManager, "__init__", lambda self: None): + manager = SandboxManager() + manager._nsjail_config = MagicMock() + manager._executor = MagicMock() + manager._base_dir = tmp_path + manager._initialization_error = None + + info = manager.create_sandbox("session1", "py") + content = manager.get_file_content_from_sandbox( + info, "/mnt/data/nonexistent.txt" + ) + assert content is None + + def test_get_file_content_mnt_data_prefix(self, tmp_path): + """Test reading file with /mnt/data/ prefix.""" + with patch("shutil.which", return_value="/usr/bin/nsjail"), \ + patch("os.chown"): + with patch.object(SandboxManager, "__init__", lambda self: None): + manager = SandboxManager() + manager._nsjail_config = MagicMock() + manager._executor = MagicMock() + manager._base_dir = tmp_path + manager._initialization_error = None + + info = manager.create_sandbox("session1", "py") + (info.data_dir / "test.py").write_bytes(b"print('hi')") + content = manager.get_file_content_from_sandbox( + info, "/mnt/data/test.py" + ) + assert content == b"print('hi')" + + +class TestManagerUtility: + """Test utility methods.""" + + def test_get_user_id_for_language(self): + """Test get_user_id_for_language returns correct IDs.""" + with patch("shutil.which", return_value="/usr/bin/nsjail"): + with patch.object(SandboxManager, "__init__", lambda self: None): + manager = SandboxManager() + manager._nsjail_config = MagicMock() + 
manager._executor = MagicMock() + manager._base_dir = Path("/tmp/test") + manager._initialization_error = None + + # Python user ID is 999 + assert manager.get_user_id_for_language("py") == 999 + # JS user ID is 1001 + assert manager.get_user_id_for_language("js") == 1001 + + def test_close_is_noop(self): + """Test close method is a no-op.""" + with patch("shutil.which", return_value="/usr/bin/nsjail"): + with patch.object(SandboxManager, "__init__", lambda self: None): + manager = SandboxManager() + manager._nsjail_config = MagicMock() + manager._executor = MagicMock() + manager._base_dir = Path("/tmp/test") + manager._initialization_error = None + # Should not raise + manager.close() + + def test_executor_property(self): + """Test executor property returns the executor.""" + with patch("shutil.which", return_value="/usr/bin/nsjail"): + with patch.object(SandboxManager, "__init__", lambda self: None): + manager = SandboxManager() + mock_executor = MagicMock() + manager._nsjail_config = MagicMock() + manager._executor = mock_executor + manager._base_dir = Path("/tmp/test") + manager._initialization_error = None + + assert manager.executor is mock_executor