diff --git a/.github/workflows/docker-build.yml b/.github/workflows/docker-build.yml
new file mode 100644
index 00000000..c17f905d
--- /dev/null
+++ b/.github/workflows/docker-build.yml
@@ -0,0 +1,107 @@
+name: Build and Push Advanced Backend Docker Images
+
+on:
+  push:
+    branches:
+      - main
+      - develop
+    paths:
+      - 'backends/advanced-backend/**'
+  pull_request:
+    branches:
+      - main
+    paths:
+      - 'backends/advanced-backend/**'
+  workflow_dispatch:
+
+env:
+  REGISTRY: ghcr.io
+  IMAGE_NAME_BACKEND: ${{ github.repository }}/friend-backend
+  IMAGE_NAME_WEBUI: ${{ github.repository }}/friend-webui
+
+jobs:
+  build-backend:
+    runs-on: ubuntu-latest
+    permissions:
+      contents: read
+      packages: write
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+
+      - name: Log in to Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ${{ env.REGISTRY }}
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Extract metadata for backend
+        id: meta-backend
+        uses: docker/metadata-action@v5
+        with:
+          images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME_BACKEND }}
+          tags: |
+            type=ref,event=branch
+            type=ref,event=pr
+            type=sha,prefix={{branch}}-
+            type=raw,value=latest,enable={{is_default_branch}}
+
+      - name: Build and push backend Docker image
+        uses: docker/build-push-action@v5
+        with:
+          context: ./backends/advanced-backend
+          file: ./backends/advanced-backend/Dockerfile
+          push: true
+          tags: ${{ steps.meta-backend.outputs.tags }}
+          labels: ${{ steps.meta-backend.outputs.labels }}
+          cache-from: type=gha
+          cache-to: type=gha,mode=max
+          platforms: linux/amd64,linux/arm64
+
+  build-webui:
+    runs-on: ubuntu-latest
+    permissions:
+      contents: read
+      packages: write
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+
+      - name: Log in to Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ${{ env.REGISTRY }}
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Extract metadata for webui
+        id: meta-webui
+        uses: docker/metadata-action@v5
+        with:
+          images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME_WEBUI }}
+          tags: |
+            type=ref,event=branch
+            type=ref,event=pr
+            type=sha,prefix={{branch}}-
+            type=raw,value=latest,enable={{is_default_branch}}
+
+      - name: Build and push webui Docker image
+        uses: docker/build-push-action@v5
+        with:
+          context: ./backends/advanced-backend
+          file: ./backends/advanced-backend/Dockerfile.webui
+          push: true
+          tags: ${{ steps.meta-webui.outputs.tags }}
+          labels: ${{ steps.meta-webui.outputs.labels }}
+          cache-from: type=gha
+          cache-to: type=gha,mode=max
+          platforms: linux/amd64,linux/arm64
\ No newline at end of file
diff --git a/backends/advanced-backend/README.md b/backends/advanced-backend/README.md
index 506eb9a7..ff730706 100644
--- a/backends/advanced-backend/README.md
+++ b/backends/advanced-backend/README.md
@@ -1,161 +1 @@
-# Advanced Omi Backend
-
-## Transcription Configuration
-
-This backend supports conditional transcription methods:
-
-### 1. Deepgram API (Not Yet Implemented)
-When `DEEPGRAM_API_KEY` is provided, the system is designed to use Deepgram's cloud API for transcription. However, this feature is not yet implemented and will fall back to offline ASR with a warning.
-
-### 2. Offline ASR (Current Implementation)
-The system uses the offline ASR service specified by `OFFLINE_ASR_TCP_URI`.
-
-```bash
-export OFFLINE_ASR_TCP_URI="tcp://192.168.0.110:8765/"
-```
-
-## Environment Variables
-
-```bash
-# For future Deepgram implementation (currently not implemented)
-DEEPGRAM_API_KEY="your_api_key"
-
-# Required for offline ASR (current implementation)
-OFFLINE_ASR_TCP_URI="tcp://192.168.0.110:8765/"
-```
-
-The system automatically detects which transcription method to use based on the availability of `DEEPGRAM_API_KEY`, but currently always falls back to offline ASR.
-
-# Setup
-
-To setup the backend, you need to do the following:
-0. Clone the repository
-1. Change the directory to the backend,
-`cd backends/advanced-backend`
-2. Fill out the .env variables as you require (check the .env.template for the required variables)
-3. Run the backend with `docker compose up --build -d`. This will take a couple minutes, be patient.
-
-
-# Backend Walkthrough
-
-## Architecture Overview
-
-This is a real-time audio processing backend built with FastAPI that handles continuous audio streams, transcription, memory storage, and conversation management. The system is being designed for 24/7 operation with robust recovery mechanisms.
-
-## Core Services (Docker Compose)
-
-- **friend-backend**: Main FastAPI application serving the audio processing pipeline
-- **streamlit**: Web UI for conversation management, speaker enrollment, and system monitoring
-- **proxy**: Nginx reverse proxy handling external requests
-- **qdrant**: Vector database for semantic memory storage and retrieval
-- **mongo**: Document database for conversations, users, and speakers
-- **Optional services**: speaker-recognition (GPU-based), ollama (LLM inference)
-
-## Audio Processing Flow
-
-### 1. Audio Ingestion
-- Clients connect via WebSocket endpoints:
-  - `/ws`: Opus-encoded audio streams (from mobile apps)
-  - `/ws_pcm`: Raw PCM audio streams (from desktop clients)
-- Each client gets a `ClientState` managing their processing pipeline
-- Audio data flows into central queues to decouple ingestion from processing
-
-### 2. Parallel Processing Pipeline
-The system runs multiple async consumers processing audio in parallel:
-
-**Audio Saver Consumer** (`_audio_saver`):
-- Buffers incoming PCM audio data
-- Writes 60-second WAV chunks to `./data/audio_chunks/` directory
-- Tracks speech segments for audio cropping
-- Generates unique audio UUIDs for each chunk
-
-**Transcription Consumer** (`_transcription_processor`):
-- Sends audio chunks to Wyoming ASR service via TCP
-- Supports fallback to Deepgram API (not yet implemented)
-- Handles real-time transcription with segment timing
-- Processes voice activity detection (VAD) events
-
-**Memory Consumer** (`_memory_processor`):
-- Stores completed transcripts in mem0 vector database
-- Creates semantic memories for long-term retrieval
-- Manages conversation context and user associations
-- Handles background memory processing
-
-### 3. Advanced Features
-
-**Speaker Recognition**:
-- Voice enrollment via audio samples
-- Real-time speaker identification during conversations
-- Speaker diarization and transcript attribution
-
-**Audio Cropping**:
-- Removes silence using speech segment detection
-- Preserves only voice activity with configurable padding
-- Reduces storage requirements and improves processing efficiency
-
-**Action Items Extraction**:
-- Uses LLM (Ollama) to extract tasks from conversations
-- Provides API for task management
-
-**Conversation Management**:
-- Automatic conversation segmentation based on silence timeouts
-- Session state management across client connections
-- Conversation closing and archival
-
-### 4. Data Storage
-
-**MongoDB Collections**:
-- `audio_chunks`: Audio file metadata, transcripts, timing, speakers
-- `users`: User profiles and settings
-- `speakers`: Voice enrollment data and models
-
-**File System**:
-- `./data/audio_chunks/`: Raw and cropped WAV files
-- `./data/qdrant_data/`: Vector database storage
-- `./data/mongo_data/`: Document database storage
-- `./data/neo4j_data/`: Graph database storage
-- `./data/debug_dir/`: Debug and system tracking data
-
-### 5. Health & Monitoring
-
-Current health checks verify:
-- MongoDB connectivity (critical service)
-- ASR service availability (Wyoming protocol)
-- Memory service (mem0 + Qdrant + Ollama)
-- Speaker recognition service
-- File system access
-
-## Key Classes & Components
-
-- `ClientState`: Per-client audio processing state and queues
-- `TranscriptionManager`: ASR service management and reconnection logic
-- `ChunkRepo`: MongoDB operations for audio chunks and metadata
-- `MemoryService`: mem0 integration for semantic memory
-- `SpeakerService`: speaker recognition and enrollment
-- `ActionItemsService`: LLM-based task extraction and management
-
-## Recovery & Reliability
-TODO
-
-## Metrics & Monitoring Plan
-
-### Target: 24 Hours Uninterrupted Audio Processing
-
-The primary goal is to achieve at least 24 hours of continuous audio recording and processing without interruptions. The metrics system will track:
-
-### Core Metrics to Implement
-
-**System Uptime Metrics**:
-- Total system uptime vs. total recording time
-- Service-level uptime for each component (friend-backend, mongo, qdrant, ASR, etc.)
-- Connection uptime per client
-- WebSocket connection stability and reconnection events
-
-**Audio Processing Metrics**:
-- Total audio recorded (duration in hours/minutes)
-- Total voice activity detected vs. silence
-- Audio chunks successfully processed vs. failed
-- Transcription success rate and latency
-- Memory storage success rate
-
-On the happy path, you could do `sudo rm -rf ./data/` to reset all system data.
\ No newline at end of file
+[QuickStart](https://github.com/AnkushMalaker/friend-lite/blob/main/backends/advanced-backend/Docs/quickstart.md)
\ No newline at end of file
diff --git a/backends/advanced-backend/docker-compose.yml b/backends/advanced-backend/docker-compose.yml
index b45739f8..29175a5c 100644
--- a/backends/advanced-backend/docker-compose.yml
+++ b/backends/advanced-backend/docker-compose.yml
@@ -13,24 +13,56 @@ services:
       - ./data/debug_dir:/app/debug_dir
       - ./data:/app/data
     environment:
+      # Core backend settings
+      - HOST=${HOST:-0.0.0.0}
+      - PORT=${PORT:-8000}
+      - DEBUG=${DEBUG:-false}
+      - DEBUG_DUMP_DIR=${DEBUG_DUMP_DIR:-debug_dumps}
+
+      # Database connections
+      - MONGODB_URI=${MONGODB_URI:-mongodb://mongo:27017}
+      - QDRANT_BASE_URL=${QDRANT_BASE_URL:-qdrant}
+
+      # Audio processing
+      - NEW_CONVERSATION_TIMEOUT_MINUTES=${NEW_CONVERSATION_TIMEOUT_MINUTES:-1.5}
+      - AUDIO_CROPPING_ENABLED=${AUDIO_CROPPING_ENABLED:-true}
+      - MIN_SPEECH_SEGMENT_DURATION=${MIN_SPEECH_SEGMENT_DURATION:-1.0}
+      - CROPPING_CONTEXT_PADDING=${CROPPING_CONTEXT_PADDING:-0.1}
+
+      # Authentication (required)
+      - AUTH_SECRET_KEY=${AUTH_SECRET_KEY}
+      - ADMIN_PASSWORD=${ADMIN_PASSWORD}
+      - ADMIN_EMAIL=${ADMIN_EMAIL:-admin@example.com}
+      - COOKIE_SECURE=${COOKIE_SECURE:-false}
+
+      # Transcription providers
+      - TRANSCRIPTION_PROVIDER=${TRANSCRIPTION_PROVIDER}
       - DEEPGRAM_API_KEY=${DEEPGRAM_API_KEY}
       - MISTRAL_API_KEY=${MISTRAL_API_KEY}
-      - MISTRAL_MODEL=${MISTRAL_MODEL}
-      - TRANSCRIPTION_PROVIDER=${TRANSCRIPTION_PROVIDER}
-      - OFFLINE_ASR_TCP_URI=${OFFLINE_ASR_TCP_URI}
+      - MISTRAL_MODEL=${MISTRAL_MODEL:-voxtral-mini-2507}
+      - OFFLINE_ASR_TCP_URI=${OFFLINE_ASR_TCP_URI:-tcp://localhost:8765}
+
+      # LLM providers
+      - LLM_PROVIDER=${LLM_PROVIDER:-openai}
+      - OPENAI_API_KEY=${OPENAI_API_KEY}
+      - OPENAI_BASE_URL=${OPENAI_BASE_URL:-https://api.openai.com/v1}
+      - OPENAI_MODEL=${OPENAI_MODEL:-gpt-4o}
+      - OPENAI_EMBEDDER_MODEL=${OPENAI_EMBEDDER_MODEL:-text-embedding-3-small}
       - OLLAMA_BASE_URL=${OLLAMA_BASE_URL}
-      - HF_TOKEN=${HF_TOKEN}
+      - OLLAMA_EMBEDDER_MODEL=${OLLAMA_EMBEDDER_MODEL:-nomic-embed-text:latest}
+
+      # Memory services
+      - NEO4J_HOST=${NEO4J_HOST:-neo4j-mem0}
+      - NEO4J_USER=${NEO4J_USER:-neo4j}
+      - NEO4J_PASSWORD=${NEO4J_PASSWORD:-password}
+      - MEM0_TELEMETRY=${MEM0_TELEMETRY:-false}
+      - MEM0_ORGANIZATION_ID=${MEM0_ORGANIZATION_ID:-friend-lite-org}
+      - MEM0_PROJECT_ID=${MEM0_PROJECT_ID:-audio-conversations}
+      - MEM0_APP_ID=${MEM0_APP_ID:-omi-backend}
+
+      # Additional services
       - SPEAKER_SERVICE_URL=${SPEAKER_SERVICE_URL}
-      - ADMIN_PASSWORD=${ADMIN_PASSWORD}
-      - ADMIN_EMAIL=${ADMIN_EMAIL}
-      - AUTH_SECRET_KEY=${AUTH_SECRET_KEY}
-      - LLM_PROVIDER=${LLM_PROVIDER}
-      - OPENAI_API_KEY=${OPENAI_API_KEY}
-      - OPENAI_BASE_URL=${OPENAI_BASE_URL}
-      - OPENAI_MODEL=${OPENAI_MODEL}
-      - NEO4J_HOST=${NEO4J_HOST}
-      - NEO4J_USER=${NEO4J_USER}
-      - NEO4J_PASSWORD=${NEO4J_PASSWORD}
+      - HF_TOKEN=${HF_TOKEN}
     depends_on:
       qdrant:
         condition: service_started
@@ -53,9 +85,10 @@ services:
     ports:
       - "8501:8501"
     environment:
-      - BACKEND_API_URL=http://friend-backend:8000
-      - BACKEND_PUBLIC_URL=http://100.99.62.5:8000 # Your BROWSER should be able to access this (Only for displaying audio)
-      - STREAMLIT_SERVER_ENABLE_CORS=false
+      - BACKEND_API_URL=${BACKEND_API_URL:-http://friend-backend:8000}
+      - BACKEND_PUBLIC_URL=${BACKEND_PUBLIC_URL:-http://localhost:8000}
+      - STREAMLIT_SERVER_ENABLE_CORS=${STREAMLIT_SERVER_ENABLE_CORS:-false}
+      - DEBUG=${DEBUG:-false}
     depends_on:
       friend-backend:
         condition: service_healthy
diff --git a/docker-compose.yml b/docker-compose.yml
new file mode 100644
index 00000000..4909e573
--- /dev/null
+++ b/docker-compose.yml
@@ -0,0 +1,149 @@
+services:
+  friend-backend:
+    image: ghcr.io/ankushmalaker/friend-lite/friend-backend:latest
+    ports:
+      - "8000:8000"
+    env_file:
+      - backends/advanced-backend/.env
+    volumes:
+      - ./data/audio_chunks:/app/audio_chunks
+      - ./data/debug_dir:/app/debug_dir
+      - ./data:/app/data
+    environment:
+      # Core backend settings
+      - HOST=${HOST:-0.0.0.0}
+      - PORT=${PORT:-8000}
+      - DEBUG=${DEBUG:-false}
+      - DEBUG_DUMP_DIR=${DEBUG_DUMP_DIR:-debug_dumps}
+
+      # Database connections
+      - MONGODB_URI=${MONGODB_URI:-mongodb://mongo:27017}
+      - QDRANT_BASE_URL=${QDRANT_BASE_URL:-qdrant}
+
+      # Audio processing
+      - NEW_CONVERSATION_TIMEOUT_MINUTES=${NEW_CONVERSATION_TIMEOUT_MINUTES:-1.5}
+      - AUDIO_CROPPING_ENABLED=${AUDIO_CROPPING_ENABLED:-true}
+      - MIN_SPEECH_SEGMENT_DURATION=${MIN_SPEECH_SEGMENT_DURATION:-1.0}
+      - CROPPING_CONTEXT_PADDING=${CROPPING_CONTEXT_PADDING:-0.1}
+
+      # Authentication (required)
+      - AUTH_SECRET_KEY=${AUTH_SECRET_KEY}
+      - ADMIN_PASSWORD=${ADMIN_PASSWORD}
+      - ADMIN_EMAIL=${ADMIN_EMAIL:-admin@example.com}
+      - COOKIE_SECURE=${COOKIE_SECURE:-false}
+
+      # Transcription providers
+      - TRANSCRIPTION_PROVIDER=${TRANSCRIPTION_PROVIDER}
+      - DEEPGRAM_API_KEY=${DEEPGRAM_API_KEY}
+      - MISTRAL_API_KEY=${MISTRAL_API_KEY}
+      - MISTRAL_MODEL=${MISTRAL_MODEL:-voxtral-mini-2507}
+      - OFFLINE_ASR_TCP_URI=${OFFLINE_ASR_TCP_URI:-tcp://localhost:8765}
+
+      # LLM providers
+      - LLM_PROVIDER=${LLM_PROVIDER:-openai}
+      - OPENAI_API_KEY=${OPENAI_API_KEY}
+      - OPENAI_BASE_URL=${OPENAI_BASE_URL:-https://api.openai.com/v1}
+      - OPENAI_MODEL=${OPENAI_MODEL:-gpt-4o}
+      - OPENAI_EMBEDDER_MODEL=${OPENAI_EMBEDDER_MODEL:-text-embedding-3-small}
+      - OLLAMA_BASE_URL=${OLLAMA_BASE_URL}
+      - OLLAMA_EMBEDDER_MODEL=${OLLAMA_EMBEDDER_MODEL:-nomic-embed-text:latest}
+
+      # Memory services
+      - NEO4J_HOST=${NEO4J_HOST:-neo4j-mem0}
+      - NEO4J_USER=${NEO4J_USER:-neo4j}
+      - NEO4J_PASSWORD=${NEO4J_PASSWORD:-password}
+      - MEM0_TELEMETRY=${MEM0_TELEMETRY:-false}
+      - MEM0_ORGANIZATION_ID=${MEM0_ORGANIZATION_ID:-friend-lite-org}
+      - MEM0_PROJECT_ID=${MEM0_PROJECT_ID:-audio-conversations}
+      - MEM0_APP_ID=${MEM0_APP_ID:-omi-backend}
+
+      # Additional services
+      - SPEAKER_SERVICE_URL=${SPEAKER_SERVICE_URL}
+      - HF_TOKEN=${HF_TOKEN}
+    depends_on:
+      qdrant:
+        condition: service_started
+      mongo:
+        condition: service_started
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://localhost:8000/readiness"]
+      interval: 10s
+      timeout: 5s
+      retries: 5
+      start_period: 5s
+    restart: unless-stopped
+
+  streamlit:
+    image: ghcr.io/ankushmalaker/friend-lite/friend-webui:latest
+    ports:
+      - "8501:8501"
+    environment:
+      - BACKEND_API_URL=http://friend-backend:8000
+      - BACKEND_PUBLIC_URL=${BACKEND_PUBLIC_URL:-http://localhost:8000}
+      - STREAMLIT_SERVER_ENABLE_CORS=false
+    depends_on:
+      friend-backend:
+        condition: service_healthy
+      mongo:
+        condition: service_started
+      qdrant:
+        condition: service_started
+
+  qdrant:
+    image: qdrant/qdrant:latest
+    ports:
+      - "6333:6333" # HTTP/REST
+      - "6334:6334" # gRPC
+    volumes:
+      - ./data/qdrant_data:/qdrant/storage
+
+  mongo:
+    image: mongo:4.4.18
+    ports:
+      - "27017:27017"
+    volumes:
+      - ./data/mongo_data:/data/db
+
+  # Optional services (uncomment to enable)
+
+  # neo4j-mem0:
+  #   image: neo4j:5.15-community
+  #   ports:
+  #     - "7474:7474" # HTTP
+  #     - "7687:7687" # Bolt
+  #   environment:
+  #     - NEO4J_AUTH=neo4j/${NEO4J_PASSWORD:-password}
+  #     - NEO4J_PLUGINS=["apoc"]
+  #     - NEO4J_dbms_security_procedures_unrestricted=apoc.*
+  #     - NEO4J_dbms_security_procedures_allowlist=apoc.*
+  #   volumes:
+  #     - ./data/neo4j_data:/data
+  #     - ./data/neo4j_logs:/logs
+  #   restart: unless-stopped
+
+  # ollama:
+  #   image: ollama/ollama:latest
+  #   container_name: ollama
+  #   ports:
+  #     - "11434:11434"
+  #   volumes:
+  #     - ollama_data:/root/.ollama
+  #   deploy:
+  #     resources:
+  #       reservations:
+  #         devices:
+  #           - driver: nvidia
+  #             count: all
+  #             capabilities: [gpu]
+
+  # nginx:
+  #   image: nginx:alpine
+  #   depends_on: [friend-backend, streamlit]
+  #   volumes:
+  #     - ./backends/advanced-backend/nginx.conf:/etc/nginx/nginx.conf:ro
+  #   ports:
+  #     - "80:80"
+
+# volumes:
+#   ollama_data:
+#     driver: local
\ No newline at end of file
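
The new top-level `docker-compose.yml` pulls the prebuilt images published by the workflow above and loads secrets from `backends/advanced-backend/.env` via `env_file`. Below is a minimal sketch of that file, limited to variable names that the compose services above actually declare; all values are placeholders, and the Deepgram plus OpenAI pairing is just one of the provider combinations the compose file supports (the old README pointed to `.env.template` for the full list):

```bash
# backends/advanced-backend/.env (minimal sketch; every value is a placeholder)
AUTH_SECRET_KEY=replace-with-a-long-random-string  # required, no compose default
ADMIN_PASSWORD=replace-me                          # required, no compose default
ADMIN_EMAIL=admin@example.com

TRANSCRIPTION_PROVIDER=deepgram
DEEPGRAM_API_KEY=your-deepgram-key

LLM_PROVIDER=openai
OPENAI_API_KEY=your-openai-key
```

With that file in place, `docker compose up -d` from the repository root should start friend-backend, streamlit, qdrant, and mongo; the neo4j-mem0, ollama, and nginx services stay inactive until their blocks are uncommented.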