diff --git a/.env.template b/.env.template index c2a4d8a2..388edbf5 100644 --- a/.env.template +++ b/.env.template @@ -90,16 +90,12 @@ CHAT_TEMPERATURE=0.7 # SPEECH-TO-TEXT CONFIGURATION # ======================================== -# Primary transcription provider: deepgram, mistral, or parakeet +# Primary transcription provider: deepgram or parakeet TRANSCRIPTION_PROVIDER=deepgram # Deepgram configuration DEEPGRAM_API_KEY=your-deepgram-key-here -# Mistral configuration (when TRANSCRIPTION_PROVIDER=mistral) -MISTRAL_API_KEY=your-mistral-key-here -MISTRAL_MODEL=voxtral-mini-2507 - # Parakeet ASR configuration (when TRANSCRIPTION_PROVIDER=parakeet) PARAKEET_ASR_URL=http://host.docker.internal:8767 diff --git a/.github/workflows/README.md b/.github/workflows/README.md index 5e98cd18..0b8987c5 100644 --- a/.github/workflows/README.md +++ b/.github/workflows/README.md @@ -1,91 +1,408 @@ -# GitHub Actions CI/CD Setup for Friend Lite +# Chronicle GitHub Workflows -This sets up **automatic GitHub releases** with APK/IPA files whenever you push code. +Documentation for CI/CD workflows and test automation. -## πŸš€ How This Works +## Test Workflows Overview -1. You push code to GitHub -2. GitHub automatically builds **both Android APK and iOS IPA** -3. **Creates GitHub Releases** with both files attached -4. You download directly from the **Releases** tab! +Chronicle uses **three separate test workflows** to balance fast PR feedback with comprehensive testing: -## 🎯 Quick Setup (2 Steps) +| Workflow | Trigger | Test Coverage | API Keys | Purpose | +|----------|---------|---------------|----------|---------| +| `robot-tests.yml` | All PRs | ~70% (no-API tests) | ❌ Not required | Fast PR validation | +| `full-tests-with-api.yml` | Push to dev/main | 100% (full suite) | βœ… Required | Comprehensive validation | +| `pr-tests-with-api.yml` | PR label trigger | 100% (full suite) | βœ… Required | Pre-merge API testing | -### Step 1: Get Expo Token -1. 
Go to [expo.dev](https://expo.dev) and sign in/create account -2. Go to [Access Tokens](https://expo.dev/accounts/[account]/settings/access-tokens) -3. Create a new token and copy it +## Workflow Details -### Step 2: Add GitHub Secret -1. In your GitHub repo: **Settings** β†’ **Secrets and variables** β†’ **Actions** -2. Click **New repository secret** -3. Name: `EXPO_TOKEN` -4. Value: Paste your token from Step 1 -5. Click **Add secret** +### 1. `robot-tests.yml` - PR Tests (No API Keys) -## ⚑ That's It! -# GitHub Actions Workflows +**File**: `.github/workflows/robot-tests.yml` -## Integration Tests +**Trigger**: +```yaml +on: + pull_request: + paths: + - 'tests/**/*.robot' + - 'tests/**/*.py' + - 'backends/advanced/src/**' +``` + +**Characteristics**: +- **No secrets required** - Works for external contributors +- **Excludes**: Tests tagged with `requires-api-keys` +- **Config**: `tests/configs/mock-services.yml` +- **Test Script**: `./run-no-api-tests.sh` +- **Results**: `results-no-api/` +- **Time**: ~10-15 minutes +- **Coverage**: ~70% of test suite + +**Benefits**: +- Fast feedback on PRs +- No API costs for every PR +- External contributors can run full CI +- Most development workflows covered + +**What's Tested**: +- API endpoints (auth, CRUD, permissions) +- Infrastructure (workers, queues, health) +- Basic integration (non-transcription) + +**What's Skipped**: +- Audio upload with transcription +- Memory operations requiring LLM +- Audio streaming with STT +- Full E2E pipeline tests + +### 2. 
`full-tests-with-api.yml` - Dev/Main Tests (Full Suite) + +**File**: `.github/workflows/full-tests-with-api.yml` + +**Trigger**: +```yaml +on: + push: + branches: [dev, main] + paths: + - 'tests/**' + - 'backends/advanced/src/**' + workflow_dispatch: # Manual trigger available +``` + +**Characteristics**: +- **Requires secrets**: `DEEPGRAM_API_KEY`, `OPENAI_API_KEY`, `HF_TOKEN` +- **Includes**: All tests (including `requires-api-keys`) +- **Config**: `tests/configs/deepgram-openai.yml` +- **Test Script**: `./run-robot-tests.sh` +- **Results**: `results/` +- **Time**: ~20-30 minutes +- **Coverage**: 100% of test suite + +**Benefits**: +- Full validation before deployment +- Catches API integration issues +- Validates real transcription and memory processing +- Comprehensive E2E coverage + +**What's Tested**: +- Everything from `robot-tests.yml` PLUS: +- Audio upload with real transcription +- Memory extraction with LLM +- Audio streaming with STT +- Full E2E pipeline validation + +### 3. 
`pr-tests-with-api.yml` - Label-Triggered PR Tests + +**File**: `.github/workflows/pr-tests-with-api.yml` + +**Trigger**: +```yaml +on: + pull_request: + types: [labeled, synchronize] +``` + +**Condition**: +```yaml +if: contains(github.event.pull_request.labels.*.name, 'test-with-api-keys') +``` + +**Characteristics**: +- **Requires**: PR labeled with `test-with-api-keys` +- **Requires secrets**: `DEEPGRAM_API_KEY`, `OPENAI_API_KEY`, `HF_TOKEN` +- **Includes**: All tests (same as full-tests-with-api.yml) +- **Config**: `tests/configs/deepgram-openai.yml` +- **Time**: ~20-30 minutes +- **Re-runs**: On new commits while label present -### Automatic Integration Tests (`integration-tests.yml`) -- **Triggers**: Push/PR to `main` or `develop` branches affecting backend code -- **Timeout**: 15 minutes -- **Mode**: Cached mode (better for CI environment) -- **Dependencies**: Requires `DEEPGRAM_API_KEY` and `OPENAI_API_KEY` secrets +**Benefits**: +- Test API integrations before merging +- Useful for PRs modifying transcription/LLM code +- Maintainers can trigger on trusted PRs +- Catches issues before they reach dev/main + +**Use Cases**: +- PRs that modify transcription logic +- PRs that change memory extraction +- PRs that affect audio processing pipeline +- Before merging large feature branches + +## Usage Guide + +### For Contributors + +**Normal PR Workflow**: +1. Push your branch +2. Create PR +3. `robot-tests.yml` runs automatically (~70% coverage) +4. Fix any failures +5. Merge when tests pass + +**Testing API Integrations**: +1. Push your branch +2. Create PR +3. Ask maintainer to add `test-with-api-keys` label +4. `pr-tests-with-api.yml` runs (100% coverage) +5. Fix any failures +6. Merge when tests pass + +### For Maintainers + +**Adding the Label**: +```bash +# Via GitHub UI +1. Go to PR +2. Click "Labels" on right sidebar +3. 
Select "test-with-api-keys" + +# Via GitHub CLI +gh pr edit --add-label "test-with-api-keys" +``` + +**When to Use Label**: +- PR modifies audio processing or transcription +- PR changes memory extraction logic +- PR affects LLM integration +- Before merging large features +- When in doubt about API changes + +**Removing the Label**: +- Label is automatically retained on new commits +- Remove manually if no longer needed +- Saves API costs if changes don't affect APIs + +## Test Results + +### PR Comments + +All workflows post results as PR comments: + +```markdown +## πŸŽ‰ Robot Framework Test Results (No API Keys) + +**Status**: βœ… All tests passed! + +| Metric | Count | +|--------|-------| +| βœ… Passed | 76 | +| ❌ Failed | 0 | +| πŸ“Š Total | 76 | + +### πŸ“Š View Reports +- [Test Report](https://pages-url/report.html) +- [Detailed Log](https://pages-url/log.html) +``` + +### GitHub Pages + +Test reports are automatically deployed to GitHub Pages: +- **Live Reports**: Clickable links in PR comments +- **Persistence**: 30 days retention +- **Format**: HTML reports from Robot Framework + +### Artifacts + +Downloadable artifacts for deeper analysis: +- **HTML Reports**: `robot-test-reports-html-*` +- **XML Results**: `robot-test-results-xml-*` +- **Logs**: `robot-test-logs-*` (on failure only) +- **Retention**: 30 days for reports, 7 days for logs ## Required Secrets -Add these secrets in your GitHub repository settings: +### Repository Secrets +Must be configured in GitHub repository settings: + +```bash +DEEPGRAM_API_KEY # Required for full-tests-with-api.yml +OPENAI_API_KEY # Required for full-tests-with-api.yml +HF_TOKEN # Optional (speaker recognition) ``` -DEEPGRAM_API_KEY=your-deepgram-api-key -OPENAI_API_KEY=your-openai-api-key + +**Setting Secrets**: +1. Go to repository Settings +2. Navigate to Secrets and variables β†’ Actions +3. Click "New repository secret" +4. 
Add each secret + +### Secret Validation + +Workflows validate secrets before running tests: +```yaml +- name: Verify required secrets + env: + DEEPGRAM_API_KEY: ${{ secrets.DEEPGRAM_API_KEY }} + OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} + run: | + if [ -z "$DEEPGRAM_API_KEY" ]; then + echo "❌ ERROR: DEEPGRAM_API_KEY secret is not set" + exit 1 + fi ``` -## Test Environment +## Cost Management + +### API Cost Breakdown + +**No-API Tests** (`robot-tests.yml`): +- **Cost**: $0 per run +- **Frequency**: Every PR commit +- **Monthly**: Potentially hundreds of runs +- **Savings**: Significant with external contributors + +**Full Tests** (`full-tests-with-api.yml`, `pr-tests-with-api.yml`): +- **Transcription**: ~$0.10-0.30 per run (Deepgram) +- **LLM**: ~$0.05-0.15 per run (OpenAI) +- **Total**: ~$0.15-0.45 per run +- **Frequency**: dev/main pushes + labeled PRs +- **Monthly**: Typically 10-50 runs + +### Cost Optimization + +**Strategies**: +1. Most PRs use no-API tests (free) +2. Full tests only on protected branches +3. Label-triggered for selective full testing +4. 
No redundant API calls on every commit + +**Before This System**: +- Every PR: ~$0.45 cost +- 100 PRs/month: ~$45 + +**After This System**: +- Most PRs: $0 cost +- 10 dev/main pushes: ~$4.50 +- 5 labeled PRs: ~$2.25 +- Total: ~$6.75/month (85% savings) + +## Workflow Configuration + +### Common Settings -- **Runtime**: Ubuntu latest with Docker support -- **Python**: 3.12 with uv package manager -- **Services**: MongoDB (port 27018), Qdrant (ports 6335/6336), Backend (port 8001) -- **Test Data**: Isolated test directories and databases -- **Audio**: 4-minute glass blowing tutorial for end-to-end validation +All test workflows share: -## Modes +```yaml +# Performance +timeout-minutes: 30 +runs-on: ubuntu-latest -### Cached Mode (Recommended for CI) -- Reuses containers and data between test runs -- Faster startup time -- Better for containerized CI environments -- Used by default in automatic workflows +# Caching +- uses: actions/cache@v4 + with: + path: /tmp/.buildx-cache + key: ${{ runner.os }}-buildx-${{ hashFiles(...) }} -### Fresh Mode (Recommended for Local Development) -- Completely clean environment each run -- Removes all test data and containers -- Slower but more reliable for debugging -- Can be selected in manual workflow +# Python setup +- uses: actions/setup-python@v5 + with: + python-version: "3.12" + +# UV package manager +- uses: astral-sh/setup-uv@v4 + with: + version: "latest" +``` + +### Test Execution Pattern + +```yaml +- name: Run tests + env: + CLEANUP_CONTAINERS: "false" # Handled by workflow + # API keys if needed + run: | + ./run-{no-api|robot}-tests.sh + TEST_EXIT_CODE=$? 
+ echo "test_exit_code=$TEST_EXIT_CODE" >> $GITHUB_ENV + exit 0 # Don't fail yet + +- name: Fail workflow if tests failed + if: always() + run: | + if [ "${{ env.test_exit_code }}" != "0" ]; then + echo "❌ Tests failed" + exit 1 + fi +``` + +**Benefits**: +- Artifacts uploaded even on test failure +- Clean container teardown guaranteed +- Clear separation of test execution and reporting ## Troubleshooting -1. **Test Timeout**: Increase `timeout_minutes` in manual workflow -2. **Memory Issues**: Check container logs in failed run artifacts -3. **API Key Issues**: Verify secrets are set correctly in repository settings -4. **Fresh Mode Fails**: Try cached mode for comparison +### Workflow Not Triggering -## Local Testing +**Problem**: Workflow doesn't run on PR +**Solutions**: +- Check file paths in workflow trigger +- Verify workflow file syntax (YAML) +- Check repository permissions +- Look for disabled workflows in Settings -To run the same tests locally: +### Secret Errors -```bash -cd backends/advanced-backend +**Problem**: "ERROR: DEEPGRAM_API_KEY secret is not set" +**Solutions**: +- Verify secret is set in repository settings +- Check secret name matches exactly (case-sensitive) +- Ensure workflow has access to secrets +- Fork PRs cannot access secrets (expected) + +### Test Failures + +**Problem**: Tests fail in CI but pass locally +**Solutions**: +- Check environment differences (.env.test) +- Verify test isolation (database cleanup) +- Look for timing issues (increase timeouts) +- Check Docker resource limits in CI + +### Label Workflow Not Running + +**Problem**: Added label but workflow doesn't trigger +**Solutions**: +- Verify label name is exactly `test-with-api-keys` +- Check workflow trigger includes `types: [labeled]` +- Try removing and re-adding label +- Push new commit to trigger synchronize event + +## Maintenance + +### Updating Workflows + +**When to Update**: +- Adding new test categories +- Changing test execution scripts +- Modifying 
timeout values +- Updating artifact retention + +**Testing Changes**: +1. Create test branch +2. Modify workflow file +3. Push to trigger workflow +4. Verify execution +5. Merge if successful + +### Monitoring + +**Key Metrics**: +- Test pass rate (target: >95%) +- Workflow execution time (target: <30min) +- API costs (target: <$10/month) +- Artifact storage usage -# Install dependencies -uv sync --dev +**Tools**: +- GitHub Actions dashboard +- Workflow run history +- Cost tracking (GitHub billing) +- Test result trends -# Set up environment (copy from .env.template) -cp .env.template .env.test -# Add your API keys to .env.test +## Reference Links -# Run Robot Framework integration tests -uv run robot --outputdir test-results --loglevel INFO tests/integration/integration_test.robot -``` \ No newline at end of file +- **Test Suite README**: `tests/README.md` +- **Testing Guidelines**: `tests/TESTING_GUIDELINES.md` +- **Tag Documentation**: `tests/tags.md` +- **GitHub Actions Docs**: https://docs.github.com/en/actions diff --git a/.github/workflows/full-tests-with-api.yml b/.github/workflows/full-tests-with-api.yml new file mode 100644 index 00000000..b5881fcd --- /dev/null +++ b/.github/workflows/full-tests-with-api.yml @@ -0,0 +1,264 @@ +name: Robot Framework Tests (Full - With API Keys) + +on: + push: + branches: + - dev + - main + paths: + - 'tests/**/*.robot' + - 'tests/**/*.py' + - 'backends/advanced/src/**' + - '.github/workflows/full-tests-with-api.yml' + workflow_dispatch: # Allow manual triggering + +permissions: + contents: read + pull-requests: write + issues: write + pages: write + id-token: write + +jobs: + full-robot-tests: + runs-on: ubuntu-latest + timeout-minutes: 30 + + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Verify required secrets + env: + DEEPGRAM_API_KEY: ${{ secrets.DEEPGRAM_API_KEY }} + OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} + HF_TOKEN: ${{ secrets.HF_TOKEN }} + run: | + echo "Verifying required 
secrets..." + if [ -z "$DEEPGRAM_API_KEY" ]; then + echo "❌ ERROR: DEEPGRAM_API_KEY secret is not set" + exit 1 + fi + if [ -z "$OPENAI_API_KEY" ]; then + echo "❌ ERROR: OPENAI_API_KEY secret is not set" + exit 1 + fi + if [ -z "$HF_TOKEN" ]; then + echo "⚠️ WARNING: HF_TOKEN secret is not set (speaker recognition will be disabled)" + else + echo "βœ“ HF_TOKEN is set (length: ${#HF_TOKEN})" + fi + echo "βœ“ DEEPGRAM_API_KEY is set (length: ${#DEEPGRAM_API_KEY})" + echo "βœ“ OPENAI_API_KEY is set (length: ${#OPENAI_API_KEY})" + echo "βœ“ Required secrets verified" + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + with: + driver-opts: | + image=moby/buildkit:latest + network=host + + - name: Cache Docker layers + uses: actions/cache@v4 + with: + path: /tmp/.buildx-cache + key: ${{ runner.os }}-buildx-${{ hashFiles('backends/advanced/Dockerfile', 'backends/advanced/pyproject.toml') }} + restore-keys: | + ${{ runner.os }}-buildx- + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.12" + + - name: Install uv + uses: astral-sh/setup-uv@v4 + with: + version: "latest" + + - name: Install Robot Framework and dependencies + run: | + uv pip install --system robotframework robotframework-requests python-dotenv websockets + + - name: Create test config.yml + run: | + echo "Copying test configuration file..." + mkdir -p config + cp tests/configs/deepgram-openai.yml config/config.yml + echo "βœ“ Test config.yml created from tests/configs/deepgram-openai.yml" + ls -lh config/config.yml + + - name: Create plugins.yml from template + run: | + echo "Creating plugins.yml from template..." 
+ if [ -f "config/plugins.yml.template" ]; then + cp config/plugins.yml.template config/plugins.yml + echo "βœ“ plugins.yml created from template" + ls -lh config/plugins.yml + else + echo "❌ ERROR: config/plugins.yml.template not found" + exit 1 + fi + + - name: Run Full Robot Framework tests + working-directory: tests + env: + # Required for test runner script + DEEPGRAM_API_KEY: ${{ secrets.DEEPGRAM_API_KEY }} + OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} + HF_TOKEN: ${{ secrets.HF_TOKEN }} + CLEANUP_CONTAINERS: "false" # Don't cleanup in CI - handled by workflow + run: | + # Use the full test script (includes all tests with API keys) + ./run-robot-tests.sh + TEST_EXIT_CODE=$? + echo "test_exit_code=$TEST_EXIT_CODE" >> $GITHUB_ENV + exit 0 # Don't fail here, we'll fail at the end after uploading artifacts + + - name: Save service logs to files + if: always() + working-directory: backends/advanced + run: | + echo "Checking running containers..." + docker compose -f docker-compose-test.yml ps -a + echo "" + echo "Saving service logs to files..." 
+ mkdir -p logs + docker compose -f docker-compose-test.yml logs chronicle-backend-test > logs/backend.log 2>&1 || true + docker compose -f docker-compose-test.yml logs workers-test > logs/workers.log 2>&1 || true + docker compose -f docker-compose-test.yml logs mongo-test > logs/mongo.log 2>&1 || true + docker compose -f docker-compose-test.yml logs redis-test > logs/redis.log 2>&1 || true + docker compose -f docker-compose-test.yml logs qdrant-test > logs/qdrant.log 2>&1 || true + docker compose -f docker-compose-test.yml logs speaker-service-test > logs/speaker.log 2>&1 || true + echo "βœ“ Logs saved to backends/advanced/logs/" + ls -lh logs/ + + - name: Check if test results exist + if: always() + id: check_results + run: | + if [ -f tests/results/output.xml ]; then + echo "results_exist=true" >> $GITHUB_OUTPUT + else + echo "results_exist=false" >> $GITHUB_OUTPUT + echo "⚠️ No test results found in tests/results/" + ls -la tests/results/ || echo "Results directory doesn't exist" + fi + + - name: Upload Robot Framework HTML reports + if: always() && steps.check_results.outputs.results_exist == 'true' + uses: actions/upload-artifact@v4 + with: + name: robot-test-reports-html-full + path: | + tests/results/report.html + tests/results/log.html + retention-days: 30 + + - name: Publish HTML Report as GitHub Pages artifact + if: always() && steps.check_results.outputs.results_exist == 'true' + uses: actions/upload-pages-artifact@v3 + with: + path: tests/results + + - name: Deploy to GitHub Pages + if: always() && steps.check_results.outputs.results_exist == 'true' + uses: actions/deploy-pages@v4 + id: deployment + + - name: Generate test summary + if: always() && steps.check_results.outputs.results_exist == 'true' + id: test_summary + run: | + # Parse test results + python3 << 'PYTHON_SCRIPT' > test_summary.txt + import xml.etree.ElementTree as ET + tree = ET.parse('tests/results/output.xml') + root = tree.getroot() + stats = root.find('.//total/stat') + if stats is 
not None: + passed = stats.get("pass", "0") + failed = stats.get("fail", "0") + total = int(passed) + int(failed) + print(f"PASSED={passed}") + print(f"FAILED={failed}") + print(f"TOTAL={total}") + PYTHON_SCRIPT + + # Source the variables + source test_summary.txt + + # Set outputs + echo "passed=$PASSED" >> $GITHUB_OUTPUT + echo "failed=$FAILED" >> $GITHUB_OUTPUT + echo "total=$TOTAL" >> $GITHUB_OUTPUT + + - name: Upload Robot Framework XML output + if: always() && steps.check_results.outputs.results_exist == 'true' + uses: actions/upload-artifact@v4 + with: + name: robot-test-results-xml-full + path: tests/results/output.xml + retention-days: 30 + + - name: Upload logs on failure + if: failure() + uses: actions/upload-artifact@v4 + with: + name: robot-test-logs-full + path: | + backends/advanced/logs/*.log + backends/advanced/.env + tests/setup/.env.test + retention-days: 7 + + - name: Display test results summary + if: always() + run: | + if [ -f tests/results/output.xml ]; then + echo "Full test results generated successfully (With API Keys)" + echo "========================================" + python3 << 'PYTHON_SCRIPT' + import xml.etree.ElementTree as ET + tree = ET.parse('tests/results/output.xml') + root = tree.getroot() + stats = root.find('.//total/stat') + if stats is not None: + passed = stats.get("pass", "0") + failed = stats.get("fail", "0") + print(f'βœ… Passed: {passed}') + print(f'❌ Failed: {failed}') + print(f'πŸ“Š Total: {int(passed) + int(failed)}') + PYTHON_SCRIPT + echo "========================================" + echo "" + echo "ℹ️ Full test suite including API-dependent tests" + echo "" + echo "πŸ“Š FULL TEST REPORTS AVAILABLE:" + echo " 1. Go to the 'Summary' tab at the top of this page" + echo " 2. Scroll down to 'Artifacts' section" + echo " 3. Download 'robot-test-reports-html-full'" + echo " 4. 
Extract and open report.html or log.html in your browser" + echo "" + echo "The HTML reports provide:" + echo " - report.html: Executive summary with statistics" + echo " - log.html: Detailed step-by-step execution log" + echo "" + fi + + - name: Cleanup + if: always() + working-directory: backends/advanced + run: | + docker compose -f docker-compose-test.yml down -v + + - name: Fail workflow if tests failed + if: always() + run: | + if [ "${{ env.test_exit_code }}" != "0" ]; then + echo "❌ Tests failed with exit code ${{ env.test_exit_code }}" + exit 1 + else + echo "βœ… All tests passed" + fi diff --git a/.github/workflows/pr-tests-with-api.yml b/.github/workflows/pr-tests-with-api.yml new file mode 100644 index 00000000..aeb45b1c --- /dev/null +++ b/.github/workflows/pr-tests-with-api.yml @@ -0,0 +1,303 @@ +name: Robot Framework Tests (PR - Label Triggered) + +on: + pull_request: + types: [labeled, synchronize] + +permissions: + contents: read + pull-requests: write + issues: write + pages: write + id-token: write + +jobs: + pr-full-tests: + # Only run if PR has the 'test-with-api-keys' label + if: contains(github.event.pull_request.labels.*.name, 'test-with-api-keys') + runs-on: ubuntu-latest + timeout-minutes: 30 + + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Verify required secrets + env: + DEEPGRAM_API_KEY: ${{ secrets.DEEPGRAM_API_KEY }} + OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} + HF_TOKEN: ${{ secrets.HF_TOKEN }} + run: | + echo "Verifying required secrets for label-triggered full test run..." 
+ if [ -z "$DEEPGRAM_API_KEY" ]; then + echo "❌ ERROR: DEEPGRAM_API_KEY secret is not set" + exit 1 + fi + if [ -z "$OPENAI_API_KEY" ]; then + echo "❌ ERROR: OPENAI_API_KEY secret is not set" + exit 1 + fi + if [ -z "$HF_TOKEN" ]; then + echo "⚠️ WARNING: HF_TOKEN secret is not set (speaker recognition will be disabled)" + else + echo "βœ“ HF_TOKEN is set (length: ${#HF_TOKEN})" + fi + echo "βœ“ DEEPGRAM_API_KEY is set (length: ${#DEEPGRAM_API_KEY})" + echo "βœ“ OPENAI_API_KEY is set (length: ${#OPENAI_API_KEY})" + echo "βœ“ Required secrets verified" + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + with: + driver-opts: | + image=moby/buildkit:latest + network=host + + - name: Cache Docker layers + uses: actions/cache@v4 + with: + path: /tmp/.buildx-cache + key: ${{ runner.os }}-buildx-${{ hashFiles('backends/advanced/Dockerfile', 'backends/advanced/pyproject.toml') }} + restore-keys: | + ${{ runner.os }}-buildx- + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.12" + + - name: Install uv + uses: astral-sh/setup-uv@v4 + with: + version: "latest" + + - name: Install Robot Framework and dependencies + run: | + uv pip install --system robotframework robotframework-requests python-dotenv websockets + + - name: Create test config.yml + run: | + echo "Copying test configuration file..." + mkdir -p config + cp tests/configs/deepgram-openai.yml config/config.yml + echo "βœ“ Test config.yml created from tests/configs/deepgram-openai.yml" + ls -lh config/config.yml + + - name: Create plugins.yml from template + run: | + echo "Creating plugins.yml from template..." 
+ if [ -f "config/plugins.yml.template" ]; then + cp config/plugins.yml.template config/plugins.yml + echo "βœ“ plugins.yml created from template" + ls -lh config/plugins.yml + else + echo "❌ ERROR: config/plugins.yml.template not found" + exit 1 + fi + + - name: Run Full Robot Framework tests + working-directory: tests + env: + # Required for test runner script + DEEPGRAM_API_KEY: ${{ secrets.DEEPGRAM_API_KEY }} + OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} + HF_TOKEN: ${{ secrets.HF_TOKEN }} + CLEANUP_CONTAINERS: "false" # Don't cleanup in CI - handled by workflow + run: | + # Use the full test script (includes all tests with API keys) + ./run-robot-tests.sh + TEST_EXIT_CODE=$? + echo "test_exit_code=$TEST_EXIT_CODE" >> $GITHUB_ENV + exit 0 # Don't fail here, we'll fail at the end after uploading artifacts + + - name: Save service logs to files + if: always() + working-directory: backends/advanced + run: | + echo "Checking running containers..." + docker compose -f docker-compose-test.yml ps -a + echo "" + echo "Saving service logs to files..." 
+ mkdir -p logs + docker compose -f docker-compose-test.yml logs chronicle-backend-test > logs/backend.log 2>&1 || true + docker compose -f docker-compose-test.yml logs workers-test > logs/workers.log 2>&1 || true + docker compose -f docker-compose-test.yml logs mongo-test > logs/mongo.log 2>&1 || true + docker compose -f docker-compose-test.yml logs redis-test > logs/redis.log 2>&1 || true + docker compose -f docker-compose-test.yml logs qdrant-test > logs/qdrant.log 2>&1 || true + docker compose -f docker-compose-test.yml logs speaker-service-test > logs/speaker.log 2>&1 || true + echo "βœ“ Logs saved to backends/advanced/logs/" + ls -lh logs/ + + - name: Check if test results exist + if: always() + id: check_results + run: | + if [ -f tests/results/output.xml ]; then + echo "results_exist=true" >> $GITHUB_OUTPUT + else + echo "results_exist=false" >> $GITHUB_OUTPUT + echo "⚠️ No test results found in tests/results/" + ls -la tests/results/ || echo "Results directory doesn't exist" + fi + + - name: Upload Robot Framework HTML reports + if: always() && steps.check_results.outputs.results_exist == 'true' + uses: actions/upload-artifact@v4 + with: + name: robot-test-reports-html-pr-labeled + path: | + tests/results/report.html + tests/results/log.html + retention-days: 30 + + - name: Publish HTML Report as GitHub Pages artifact + if: always() && steps.check_results.outputs.results_exist == 'true' + uses: actions/upload-pages-artifact@v3 + with: + path: tests/results + + - name: Deploy to GitHub Pages + if: always() && steps.check_results.outputs.results_exist == 'true' + uses: actions/deploy-pages@v4 + id: deployment + + - name: Generate test summary + if: always() && steps.check_results.outputs.results_exist == 'true' + id: test_summary + run: | + # Parse test results + python3 << 'PYTHON_SCRIPT' > test_summary.txt + import xml.etree.ElementTree as ET + tree = ET.parse('tests/results/output.xml') + root = tree.getroot() + stats = root.find('.//total/stat') + if 
stats is not None: + passed = stats.get("pass", "0") + failed = stats.get("fail", "0") + total = int(passed) + int(failed) + print(f"PASSED={passed}") + print(f"FAILED={failed}") + print(f"TOTAL={total}") + PYTHON_SCRIPT + + # Source the variables + source test_summary.txt + + # Set outputs + echo "passed=$PASSED" >> $GITHUB_OUTPUT + echo "failed=$FAILED" >> $GITHUB_OUTPUT + echo "total=$TOTAL" >> $GITHUB_OUTPUT + + - name: Post PR comment with test results + if: always() && steps.check_results.outputs.results_exist == 'true' + uses: actions/github-script@v7 + with: + github-token: ${{ secrets.GITHUB_TOKEN }} + script: | + const passed = '${{ steps.test_summary.outputs.passed }}'; + const failed = '${{ steps.test_summary.outputs.failed }}'; + const total = '${{ steps.test_summary.outputs.total }}'; + const runUrl = `https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}`; + const pagesUrl = '${{ steps.deployment.outputs.page_url }}'; + + const status = failed === '0' ? 'βœ… All tests passed!' : '❌ Some tests failed'; + const emoji = failed === '0' ? 'πŸŽ‰' : '⚠️'; + + const comment = `## ${emoji} Robot Framework Test Results (Label-Triggered Full Suite) + + **Status**: ${status} + + 🏷️ **Note**: This run was triggered by the \`test-with-api-keys\` label. + All tests including API-dependent tests have been executed. 
+ + | Metric | Count | + |--------|-------| + | βœ… Passed | ${passed} | + | ❌ Failed | ${failed} | + | πŸ“Š Total | ${total} | + + ### πŸ“Š View Reports + + **GitHub Pages (Live Reports):** + - [πŸ“‹ Test Report](${pagesUrl}report.html) + - [πŸ“ Detailed Log](${pagesUrl}log.html) + + **Download Artifacts:** + - [robot-test-reports-html-pr-labeled](${runUrl}) - HTML reports + - [robot-test-results-xml-pr-labeled](${runUrl}) - XML output + + --- + *[View full workflow run](${runUrl})*`; + + github.rest.issues.createComment({ + issue_number: context.issue.number, + owner: context.repo.owner, + repo: context.repo.repo, + body: comment + }); + + - name: Upload Robot Framework XML output + if: always() && steps.check_results.outputs.results_exist == 'true' + uses: actions/upload-artifact@v4 + with: + name: robot-test-results-xml-pr-labeled + path: tests/results/output.xml + retention-days: 30 + + - name: Upload logs on failure + if: failure() + uses: actions/upload-artifact@v4 + with: + name: robot-test-logs-pr-labeled + path: | + backends/advanced/logs/*.log + backends/advanced/.env + tests/setup/.env.test + retention-days: 7 + + - name: Display test results summary + if: always() + run: | + if [ -f tests/results/output.xml ]; then + echo "Label-triggered full test results generated successfully" + echo "========================================" + python3 << 'PYTHON_SCRIPT' + import xml.etree.ElementTree as ET + tree = ET.parse('tests/results/output.xml') + root = tree.getroot() + stats = root.find('.//total/stat') + if stats is not None: + passed = stats.get("pass", "0") + failed = stats.get("fail", "0") + print(f'βœ… Passed: {passed}') + print(f'❌ Failed: {failed}') + print(f'πŸ“Š Total: {int(passed) + int(failed)}') + PYTHON_SCRIPT + echo "========================================" + echo "" + echo "🏷️ This run was triggered by the 'test-with-api-keys' label" + echo "ℹ️ Full test suite including API-dependent tests" + echo "" + echo "πŸ“Š FULL TEST REPORTS 
AVAILABLE:" + echo " 1. Go to the 'Summary' tab at the top of this page" + echo " 2. Scroll down to 'Artifacts' section" + echo " 3. Download 'robot-test-reports-html-pr-labeled'" + echo " 4. Extract and open report.html or log.html in your browser" + echo "" + fi + + - name: Cleanup + if: always() + working-directory: backends/advanced + run: | + docker compose -f docker-compose-test.yml down -v + + - name: Fail workflow if tests failed + if: always() + run: | + if [ "${{ env.test_exit_code }}" != "0" ]; then + echo "❌ Tests failed with exit code ${{ env.test_exit_code }}" + exit 1 + else + echo "βœ… All tests passed" + fi diff --git a/.github/workflows/robot-tests.yml b/.github/workflows/robot-tests.yml index 3333266d..35e4dffa 100644 --- a/.github/workflows/robot-tests.yml +++ b/.github/workflows/robot-tests.yml @@ -1,4 +1,4 @@ -name: Robot Framework Tests +name: Robot Framework Tests (No API Keys) on: pull_request: @@ -24,30 +24,6 @@ jobs: - name: Checkout code uses: actions/checkout@v4 - - name: Verify required secrets - env: - DEEPGRAM_API_KEY: ${{ secrets.DEEPGRAM_API_KEY }} - OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} - HF_TOKEN: ${{ secrets.HF_TOKEN }} - run: | - echo "Verifying required secrets..." - if [ -z "$DEEPGRAM_API_KEY" ]; then - echo "❌ ERROR: DEEPGRAM_API_KEY secret is not set" - exit 1 - fi - if [ -z "$OPENAI_API_KEY" ]; then - echo "❌ ERROR: OPENAI_API_KEY secret is not set" - exit 1 - fi - if [ -z "$HF_TOKEN" ]; then - echo "❌ ERROR: HF_TOKEN secret is not set" - exit 1 - fi - echo "βœ“ DEEPGRAM_API_KEY is set (length: ${#DEEPGRAM_API_KEY})" - echo "βœ“ OPENAI_API_KEY is set (length: ${#OPENAI_API_KEY})" - echo "βœ“ HF_TOKEN is set (length: ${#HF_TOKEN})" - echo "βœ“ All required secrets verified" - - name: Set up Docker Buildx uses: docker/setup-buildx-action@v3 with: @@ -79,64 +55,81 @@ jobs: - name: Create test config.yml run: | - echo "Copying test configuration file..." + echo "Copying mock services configuration file..." 
mkdir -p config - cp tests/configs/deepgram-openai.yml config/config.yml - echo "βœ“ Test config.yml created from tests/configs/deepgram-openai.yml" + cp tests/configs/mock-services.yml config/config.yml + echo "βœ“ Test config.yml created from tests/configs/mock-services.yml" + echo "ℹ️ This config disables external API dependencies (transcription, LLM)" ls -lh config/config.yml - - name: Run Robot Framework tests + - name: Create plugins.yml from template + run: | + echo "Creating plugins.yml from template..." + if [ -f "config/plugins.yml.template" ]; then + cp config/plugins.yml.template config/plugins.yml + echo "βœ“ plugins.yml created from template" + ls -lh config/plugins.yml + else + echo "❌ ERROR: config/plugins.yml.template not found" + exit 1 + fi + + - name: Run Robot Framework tests (No API Keys) working-directory: tests env: - # Required for test runner script - DEEPGRAM_API_KEY: ${{ secrets.DEEPGRAM_API_KEY }} - OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} - HF_TOKEN: ${{ secrets.HF_TOKEN }} CLEANUP_CONTAINERS: "false" # Don't cleanup in CI - handled by workflow run: | - # Use the unified test script that mirrors local development - ./run-robot-tests.sh + # Use the no-API test script (excludes tests tagged with requires-api-keys) + ./run-no-api-tests.sh TEST_EXIT_CODE=$? echo "test_exit_code=$TEST_EXIT_CODE" >> $GITHUB_ENV exit 0 # Don't fail here, we'll fail at the end after uploading artifacts - - name: Show service logs + - name: Save service logs to files if: always() working-directory: backends/advanced run: | - echo "=== Backend Logs (last 50 lines) ===" - docker compose -f docker-compose-test.yml logs --tail=50 chronicle-backend-test + echo "Checking running containers..." + docker compose -f docker-compose-test.yml ps -a echo "" - echo "=== Worker Logs (last 50 lines) ===" - docker compose -f docker-compose-test.yml logs --tail=50 workers-test + echo "Saving service logs to files..." 
+ mkdir -p logs + docker compose -f docker-compose-test.yml logs chronicle-backend-test > logs/backend.log 2>&1 || true + docker compose -f docker-compose-test.yml logs workers-test > logs/workers.log 2>&1 || true + docker compose -f docker-compose-test.yml logs mongo-test > logs/mongo.log 2>&1 || true + docker compose -f docker-compose-test.yml logs redis-test > logs/redis.log 2>&1 || true + docker compose -f docker-compose-test.yml logs qdrant-test > logs/qdrant.log 2>&1 || true + docker compose -f docker-compose-test.yml logs speaker-service-test > logs/speaker.log 2>&1 || true + echo "βœ“ Logs saved to backends/advanced/logs/" + ls -lh logs/ - name: Check if test results exist if: always() id: check_results run: | - if [ -f tests/results/output.xml ]; then + if [ -f tests/results-no-api/output.xml ]; then echo "results_exist=true" >> $GITHUB_OUTPUT else echo "results_exist=false" >> $GITHUB_OUTPUT - echo "⚠️ No test results found in tests/results/" - ls -la tests/results/ || echo "Results directory doesn't exist" + echo "⚠️ No test results found in tests/results-no-api/" + ls -la tests/results-no-api/ || echo "Results directory doesn't exist" fi - name: Upload Robot Framework HTML reports if: always() && steps.check_results.outputs.results_exist == 'true' uses: actions/upload-artifact@v4 with: - name: robot-test-reports-html + name: robot-test-reports-html-no-api path: | - tests/results/report.html - tests/results/log.html + tests/results-no-api/report.html + tests/results-no-api/log.html retention-days: 30 - name: Publish HTML Report as GitHub Pages artifact if: always() && steps.check_results.outputs.results_exist == 'true' uses: actions/upload-pages-artifact@v3 with: - path: tests/results + path: tests/results-no-api - name: Deploy to GitHub Pages if: always() && steps.check_results.outputs.results_exist == 'true' @@ -150,7 +143,7 @@ jobs: # Parse test results python3 << 'PYTHON_SCRIPT' > test_summary.txt import xml.etree.ElementTree as ET - tree = 
ET.parse('tests/results/output.xml') + tree = ET.parse('tests/results-no-api/output.xml') root = tree.getroot() stats = root.find('.//total/stat') if stats is not None: @@ -185,10 +178,13 @@ jobs: const status = failed === '0' ? 'βœ… All tests passed!' : '❌ Some tests failed'; const emoji = failed === '0' ? 'πŸŽ‰' : '⚠️'; - const comment = `## ${emoji} Robot Framework Test Results + const comment = `## ${emoji} Robot Framework Test Results (No API Keys) **Status**: ${status} + ℹ️ **Note**: This run excludes tests requiring external API keys (Deepgram, OpenAI). + Tests tagged with \`requires-api-keys\` will run on dev/main branches. + | Metric | Count | |--------|-------| | βœ… Passed | ${passed} | @@ -202,8 +198,8 @@ jobs: - [πŸ“ Detailed Log](${pagesUrl}log.html) **Download Artifacts:** - - [robot-test-reports-html](${runUrl}) - HTML reports - - [robot-test-results-xml](${runUrl}) - XML output + - [robot-test-reports-html-no-api](${runUrl}) - HTML reports + - [robot-test-results-xml-no-api](${runUrl}) - XML output --- *[View full workflow run](${runUrl})*`; @@ -219,16 +215,17 @@ jobs: if: always() && steps.check_results.outputs.results_exist == 'true' uses: actions/upload-artifact@v4 with: - name: robot-test-results-xml - path: tests/results/output.xml + name: robot-test-results-xml-no-api + path: tests/results-no-api/output.xml retention-days: 30 - name: Upload logs on failure if: failure() uses: actions/upload-artifact@v4 with: - name: robot-test-logs + name: robot-test-logs-no-api path: | + backends/advanced/logs/*.log backends/advanced/.env tests/setup/.env.test retention-days: 7 @@ -236,12 +233,12 @@ jobs: - name: Display test results summary if: always() run: | - if [ -f tests/results/output.xml ]; then - echo "Test results generated successfully" + if [ -f tests/results-no-api/output.xml ]; then + echo "Test results generated successfully (No API Keys mode)" echo "========================================" python3 << 'PYTHON_SCRIPT' import 
xml.etree.ElementTree as ET - tree = ET.parse('tests/results/output.xml') + tree = ET.parse('tests/results-no-api/output.xml') root = tree.getroot() stats = root.find('.//total/stat') if stats is not None: @@ -253,10 +250,12 @@ jobs: PYTHON_SCRIPT echo "========================================" echo "" + echo "ℹ️ Tests excluded: requires-api-keys (run on dev/main branches)" + echo "" echo "πŸ“Š FULL TEST REPORTS AVAILABLE:" echo " 1. Go to the 'Summary' tab at the top of this page" echo " 2. Scroll down to 'Artifacts' section" - echo " 3. Download 'robot-test-reports-html'" + echo " 3. Download 'robot-test-reports-html-no-api'" echo " 4. Extract and open report.html or log.html in your browser" echo "" echo "The HTML reports provide:" diff --git a/.gitignore b/.gitignore index 23141c6b..6fa02d7f 100644 --- a/.gitignore +++ b/.gitignore @@ -10,6 +10,10 @@ tests/setup/.env.test config/config.yml !config/config.yml.template +# Plugins config (contains secrets) +config/plugins.yml +!config/plugins.yml.template + # Config backups config/*.backup.* config/*.backup* diff --git a/CLAUDE.md b/CLAUDE.md index 7f5f5507..faed99c2 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -18,7 +18,7 @@ This supports a comprehensive web dashboard for management. Chronicle includes an **interactive setup wizard** for easy configuration. 
The wizard guides you through: - Service selection (backend + optional services) - Authentication setup (admin account, JWT secrets) -- Transcription provider configuration (Deepgram, Mistral, or offline ASR) +- Transcription provider configuration (Deepgram or offline ASR) - LLM provider setup (OpenAI or Ollama) - Memory provider selection (Chronicle Native with Qdrant or OpenMemory MCP) - Network configuration and HTTPS setup @@ -86,72 +86,54 @@ cp .env.template .env # Configure environment variables sudo rm -rf backends/advanced/data/ ``` -### Testing Infrastructure +### Running Tests -#### Local Test Scripts -The project includes simplified test scripts that mirror CI workflows: +#### Quick Commands +All test operations are managed through a simple Makefile interface: ```bash -# Run all tests from project root -./run-test.sh [advanced-backend|speaker-recognition|all] +cd tests -# Advanced backend tests only -./run-test.sh advanced-backend +# Full test workflow (recommended) +make test # Start containers + run all tests -# Speaker recognition tests only -./run-test.sh speaker-recognition +# Or step by step +make start # Start test containers (with health checks) +make test-all # Run all test suites +make stop # Stop containers (preserves volumes) -# Run all test suites (default) -./run-test.sh all -``` +# Run specific test suites +make test-endpoints # API endpoint tests (~40 tests, fast) +make test-integration # End-to-end workflows (~15 tests, slower) +make test-infra # Infrastructure resilience (~5 tests) -#### Advanced Backend Integration Tests -```bash -cd backends/advanced +# Quick iteration (reuse existing containers) +make test-quick # Run tests without restarting containers +``` -# Requires .env file with DEEPGRAM_API_KEY and OPENAI_API_KEY -cp .env.template .env # Configure API keys +#### Container Management +All container operations automatically preserve logs before cleanup: -# Run full integration test suite -./run-test.sh +```bash +make start # 
Start test containers +make stop # Stop containers (keep volumes) +make restart # Restart without rebuild +make rebuild # Rebuild images + restart (for code changes) +make containers-clean # SAVES LOGS β†’ removes everything +make status # Show container health +make logs SERVICE= # View specific service logs +``` -# Manual test execution (for debugging) -source .env && export DEEPGRAM_API_KEY && export OPENAI_API_KEY -uv run robot --outputdir test-results --loglevel INFO ../../tests/integration/integration_test.robot +**Log Preservation:** All cleanup operations save container logs to `tests/logs/YYYY-MM-DD_HH-MM-SS/` -# Leave test containers running for debugging (don't auto-cleanup) -CLEANUP_CONTAINERS=false source .env && export DEEPGRAM_API_KEY && export OPENAI_API_KEY -uv run robot --outputdir test-results --loglevel INFO ../../tests/integration/integration_test.robot +#### Test Environment -# Manual cleanup when needed -docker compose -f docker-compose-test.yml down -v -``` +Test services use isolated ports and database: +- **Ports:** Backend (8001), MongoDB (27018), Redis (6380), Qdrant (6337/6338) +- **Database:** `test_db` (separate from production) +- **Credentials:** `test-admin@example.com` / `test-admin-password-123` -#### Test Configuration Flags -- **CLEANUP_CONTAINERS** (default: true): Automatically stop and remove test containers after test completion - - Set to `false` for debugging: `CLEANUP_CONTAINERS=false ./run-test.sh` -- **REBUILD** (default: true): Force rebuild containers with latest code changes -- **FRESH_RUN** (default: true): Start with clean database and fresh containers -- **TRANSCRIPTION_PROVIDER** (default: deepgram): Choose transcription provider (deepgram or parakeet) - -#### Test Environment Variables -Tests use isolated test environment with overridden credentials: -- **Test Database**: `test_db` (MongoDB on port 27018, separate from production) -- **Test Ports**: Backend (8001), Qdrant (6337/6338), WebUI (3001) -- **Test 
Credentials**: - - `AUTH_SECRET_KEY`: test-jwt-signing-key-for-integration-tests - - `ADMIN_EMAIL`: test-admin@example.com - - `ADMIN_PASSWORD`: test-admin-password-123 -- **API Keys**: Loaded from `.env` file (DEEPGRAM_API_KEY, OPENAI_API_KEY) -- **Test Settings**: `DISABLE_SPEAKER_RECOGNITION=true` to prevent segment duplication - -#### Test Script Features -- **Environment Compatibility**: Works with both local .env files and CI environment variables -- **Isolated Test Environment**: Separate ports and database prevent conflicts with running services -- **Automatic Cleanup**: Configurable via CLEANUP_CONTAINERS flag (default: true) -- **Colored Output**: Clear progress indicators and error reporting -- **Timeout Protection**: 15-minute timeout for advanced backend, 30-minute for speaker recognition -- **Fresh Testing**: Clean database and containers for each test run +**For complete test documentation, see `tests/README.md`** ### Mobile App Development ```bash @@ -185,12 +167,12 @@ docker compose up --build ## Architecture Overview ### Key Components -- **Audio Pipeline**: Real-time Opus/PCM β†’ Application-level processing β†’ Deepgram/Mistral transcription β†’ memory extraction +- **Audio Pipeline**: Real-time Opus/PCM β†’ Application-level processing β†’ Deepgram transcription β†’ memory extraction - **Wyoming Protocol**: WebSocket communication uses Wyoming protocol (JSONL + binary) for structured audio sessions - **Unified Pipeline**: Job-based tracking system for all audio processing (WebSocket and file uploads) - **Job Tracker**: Tracks pipeline jobs with stage events (audio β†’ transcription β†’ memory) and completion status - **Task Management**: BackgroundTaskManager tracks all async tasks to prevent orphaned processes -- **Unified Transcription**: Deepgram/Mistral transcription with fallback to offline ASR services +- **Unified Transcription**: Deepgram transcription with fallback to offline ASR services - **Memory System**: Pluggable providers 
(Chronicle native or OpenMemory MCP) - **Authentication**: Email-based login with MongoDB ObjectId user system - **Client Management**: Auto-generated client IDs as `{user_id_suffix}-{device_name}`, centralized ClientManager @@ -206,7 +188,7 @@ Required: Recommended: - Vector Storage: Qdrant (Chronicle provider) or OpenMemory MCP server - - Transcription: Deepgram, Mistral, or offline ASR services + - Transcription: Deepgram or offline ASR services Optional: - Parakeet ASR: Offline transcription service @@ -330,12 +312,7 @@ Chronicle supports multiple transcription services: TRANSCRIPTION_PROVIDER=deepgram DEEPGRAM_API_KEY=your-deepgram-key-here -# Option 2: Mistral (Voxtral models) -TRANSCRIPTION_PROVIDER=mistral -MISTRAL_API_KEY=your-mistral-key-here -MISTRAL_MODEL=voxtral-mini-2507 - -# Option 3: Local ASR (Parakeet) +# Option 2: Local ASR (Parakeet) PARAKEET_ASR_URL=http://host.docker.internal:8767 ``` @@ -348,12 +325,37 @@ OLLAMA_BASE_URL=http://ollama:11434 SPEAKER_SERVICE_URL=http://speaker-recognition:8085 ``` +### Plugin Security Architecture + +**Three-File Separation**: + +1. **backends/advanced/.env** - Secrets (gitignored) + ```bash + SMTP_PASSWORD=abcdefghijklmnop + OPENAI_API_KEY=sk-proj-... + ``` + +2. **config/plugins.yml** - Orchestration (uses env var references) + ```yaml + plugins: + email_summarizer: + enabled: true + smtp_password: ${SMTP_PASSWORD} # Reference, not actual value! + ``` + +3. **plugins/{plugin_id}/config.yml** - Non-secret defaults + ```yaml + subject_prefix: "Conversation Summary" + ``` + +**CRITICAL**: Never hardcode secrets in `config/plugins.yml`. Always use `${ENV_VAR}` syntax. 
+ ## Quick API Reference ### Common Endpoints - **GET /health**: Basic application health check - **GET /readiness**: Service dependency validation -- **WS /ws_pcm**: Primary audio streaming endpoint (Wyoming protocol + raw PCM fallback) +- **WS /ws**: Audio streaming endpoint with codec parameter (Wyoming protocol, supports pcm and opus codecs) - **GET /api/conversations**: User's conversations with transcripts - **GET /api/memories/search**: Semantic memory search with relevance scoring - **POST /auth/jwt/login**: Email-based login (returns JWT token) @@ -518,12 +520,11 @@ tailscale ip -4 - **Docker**: Primary deployment method with docker-compose ### Testing Strategy -- **Local Test Scripts**: Simplified scripts (`./run-test.sh`) mirror CI workflows for local development -- **End-to-End Integration**: Robot Framework tests (`tests/integration/integration_test.robot`) validate complete audio processing pipeline -- **Speaker Recognition Tests**: `test_speaker_service_integration.py` validates speaker identification +- **Makefile-Based**: All test operations through simple `make` commands (`make test`, `make start`, `make stop`) +- **Log Preservation**: Container logs always saved before cleanup (never lose debugging info) +- **End-to-End Integration**: Robot Framework validates complete audio processing pipeline - **Environment Flexibility**: Tests work with both local .env files and CI environment variables -- **Automated Cleanup**: Test containers are automatically removed after execution -- **CI/CD Integration**: GitHub Actions use the same local test scripts for consistency +- **CI/CD Integration**: Same test logic locally and in GitHub Actions ### Code Style - **Python**: Black formatter with 100-character line length, isort for imports @@ -550,14 +551,10 @@ The system includes comprehensive health checks: - Memory debug system for transcript processing monitoring ### Integration Test Infrastructure -- **Unified Test Scripts**: Local `./run-test.sh` scripts 
mirror GitHub Actions workflows -- **Test Environment**: `docker-compose-test.yml` provides isolated services on separate ports -- **Test Database**: Uses `test_db` database with isolated collections -- **Service Ports**: Backend (8001), MongoDB (27018), Qdrant (6335/6336), WebUI (5174) -- **Test Credentials**: Auto-generated `.env.test` files with secure test configurations -- **Ground Truth**: Expected transcript established via `scripts/test_deepgram_direct.py` -- **AI Validation**: OpenAI-powered transcript similarity comparison -- **Test Audio**: 4-minute glass blowing tutorial (`extras/test-audios/DIY*mono*.wav`) +- **Makefile Interface**: Simple `make` commands for all operations (see `tests/README.md`) +- **Test Environment**: `docker-compose-test.yml` with isolated services on separate ports +- **Test Database**: Uses `test_db` database (separate from production) +- **Log Preservation**: All cleanup operations save logs to `tests/logs/` automatically - **CI Compatibility**: Same test logic runs locally and in GitHub Actions ### Cursor Rule Integration diff --git a/Docs/audio-pipeline-architecture.md b/Docs/audio-pipeline-architecture.md new file mode 100644 index 00000000..afba52db --- /dev/null +++ b/Docs/audio-pipeline-architecture.md @@ -0,0 +1,1241 @@ +# Audio Pipeline Architecture + +This document explains how audio flows through the Chronicle system from initial capture to final storage, including all intermediate processing stages, Redis streams, and data storage locations. 
+ +## Table of Contents + +- [Overview](#overview) +- [Architecture Diagram](#architecture-diagram) +- [Data Sources](#data-sources) +- [Redis Streams: The Central Pipeline](#redis-streams-the-central-pipeline) +- [Producer: AudioStreamProducer](#producer-audiostreamproducer) +- [Dual-Consumer Architecture](#dual-consumer-architecture) +- [Transcription Results Aggregator](#transcription-results-aggregator) +- [Job Queue Orchestration (RQ)](#job-queue-orchestration-rq) +- [Data Storage](#data-storage) +- [Complete End-to-End Flow](#complete-end-to-end-flow) +- [Key Design Patterns](#key-design-patterns) +- [Failure Handling](#failure-handling) + +## Overview + +Chronicle's audio pipeline is built on three core technologies: + +- **Redis Streams**: Distributed message queues for audio chunks and transcription results +- **Background Tasks**: Async consumers that process streams independently +- **RQ Job Queue**: Orchestrates session-level and conversation-level workflows + +**Key Insight**: Multiple workers can independently consume the **same audio stream** using Redis Consumer Groups, enabling parallel processing paths (transcription + disk persistence) without duplication. 
+ +## Architecture Diagram + +``` +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ AUDIO INPUT β”‚ +β”‚ WebSocket (/ws) β”‚ File Upload (/audio/upload) β”‚ Google Drive β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + ↓ + β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” + β”‚ AudioStreamProducer β”‚ + β”‚ - Chunk audio (0.25s) β”‚ + β”‚ - Session metadata β”‚ + β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + ↓ + β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” + β”‚ Redis Stream (Per Client) β”‚ + β”‚ audio:stream:{client_id} β”‚ + β””β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”˜ + ↓ ↓ + β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” + β”‚ Transcription Consumerβ”‚ β”‚ Audio Persistence β”‚ + β”‚ Group (streaming/batch)β”‚ β”‚ Consumer Group β”‚ + β”‚ β”‚ β”‚ β”‚ + β”‚ β†’ Deepgram WebSocket β”‚ β”‚ β†’ Writes WAV files β”‚ + β”‚ β†’ Batch buffering β”‚ β”‚ β†’ Monitors rotation β”‚ + β”‚ β†’ Publish results β”‚ β”‚ β†’ Stores file paths β”‚ + β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + ↓ ↓ + β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” + β”‚ transcription:results β”‚ β”‚ Disk Storage β”‚ + β”‚ :{session_id} β”‚ β”‚ data/chunks/*.wav β”‚ + 
β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + ↓ + β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” + β”‚ TranscriptionResults β”‚ + β”‚ Aggregator β”‚ + β”‚ - Combines chunks β”‚ + β”‚ - Merges timestamps β”‚ + β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + ↓ + β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” + β”‚ RQ Job Pipeline β”‚ + β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€ + β”‚ speech_detection_job β”‚ ← Session-level + β”‚ ↓ β”‚ + β”‚ open_conversation_job β”‚ ← Conversation-level + β”‚ ↓ β”‚ + β”‚ Post-Conversation: β”‚ + β”‚ β€’ transcribe_full β”‚ + β”‚ β€’ speaker_recognition β”‚ + β”‚ β€’ memory_extraction β”‚ + β”‚ β€’ title_generation β”‚ + β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + ↓ + β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” + β”‚ Final Storage β”‚ + β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€ + β”‚ MongoDB: conversationsβ”‚ + β”‚ Disk: WAV files β”‚ + β”‚ Qdrant: Memories β”‚ + β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ +``` + +## Data Sources + +### 1. WebSocket Streaming (`/ws`) + +**Endpoint**: `/ws?codec=pcm|opus&token=xxx&device_name=xxx` + +**Handlers**: +- `handle_pcm_websocket()` - Raw PCM audio +- `handle_omi_websocket()` - Opus-encoded audio (compressed, used by OMI devices) + +**Protocol**: Wyoming Protocol (JSON lines + binary frames) + +**Authentication**: JWT token required + +**Location**: `backends/advanced/src/advanced_omi_backend/routers/websocket_routes.py` + +**Container**: `chronicle-backend` + +### 2. 
File Upload (`/audio/upload`) + +**Endpoint**: `POST /api/audio/upload` + +**Accepts**: Multiple WAV files (multipart form data) + +**Authentication**: Admin only + +**Device ID**: Auto-generated as `{user_id_suffix}-upload` or custom `device_name` + +**Location**: `backends/advanced/src/advanced_omi_backend/routers/api_router.py` + +**Container**: `chronicle-backend` + +### 3. Google Drive Upload + +**Endpoint**: `POST /api/audio/upload_audio_from_gdrive` + +**Source**: Google Drive folder ID + +**Processing**: Downloads files and enqueues for processing + +**Container**: `chronicle-backend` + +## Redis Streams: The Central Pipeline + +### Stream Naming Convention + +``` +audio:stream:{client_id} +``` + +**Examples**: +- `audio:stream:user01-phone` +- `audio:stream:user01-omi-device` +- `audio:stream:user01-upload` + +**Characteristics**: +- **Client-specific isolation**: Each device has its own stream +- **Fan-out pattern**: Multiple consumer groups read the same stream +- **MAXLEN constraint**: Keeps last 25,000 entries (auto-trimming) +- **No TTL**: Streams persist until manually deleted +- **Container**: `redis` service + +### Session Metadata Storage + +``` +audio:session:{session_id} +``` + +**Type**: Redis Hash + +**Fields**: +- `user_id`: MongoDB ObjectId +- `client_id`: Device identifier +- `connection_id`: WebSocket connection ID +- `stream_name`: `audio:stream:{client_id}` +- `status`: `"active"` β†’ `"finalizing"` β†’ `"complete"` +- `chunks_published`: Integer count +- `speech_detection_job_id`: RQ job ID +- `audio_persistence_job_id`: RQ job ID +- `websocket_connected`: `true|false` +- `transcription_error`: Error message (if any) + +**TTL**: 1 hour + +**Container**: `redis` + +### Transcription Results Stream + +``` +transcription:results:{session_id} +``` + +**Type**: Redis Stream + +**Written by**: Transcription consumers (streaming or batch) + +**Read by**: `TranscriptionResultsAggregator` + +**Message Fields**: +- `text`: Transcribed text for 
this chunk +- `chunk_id`: Redis message ID from audio stream +- `provider`: `"deepgram"` or `"parakeet"` +- `confidence`: Float (0.0-1.0) +- `words`: JSON array of word-level timestamps +- `segments`: JSON array of speaker segments + +**Lifecycle**: Deleted when conversation completes + +**Container**: `redis` + +### Conversation Tracking + +``` +conversation:current:{session_id} +``` + +**Type**: Redis String + +**Value**: Current `conversation_id` (UUID) + +**Purpose**: Signals audio persistence job to rotate WAV file + +**TTL**: 24 hours + +**Container**: `redis` + +### Audio File Path Mapping + +``` +audio:file:{conversation_id} +``` + +**Type**: Redis String + +**Value**: File path (e.g., `1704067200000_user01-phone_convid.wav`) + +**Purpose**: Links conversation to its audio file on disk + +**TTL**: 24 hours + +**Container**: `redis` + +## Producer: AudioStreamProducer + +**File**: `backends/advanced/src/advanced_omi_backend/services/audio_stream/producer.py` + +**Container**: `chronicle-backend` (in-memory, no persistence) + +### Responsibilities + +#### 1. Session Initialization + +```python +async def init_session( + session_id: str, + user_id: str, + client_id: str, + provider: str, + mode: str +) -> None +``` + +**Actions**: +- Creates `audio:session:{session_id}` hash in Redis +- Initializes in-memory buffer for chunking +- Stores session metadata (user, client, provider) + +#### 2. Audio Chunking + +```python +async def add_audio_chunk( + session_id: str, + audio_data: bytes +) -> list[str] +``` + +**Process**: +1. Buffers incoming audio (arbitrary size from WebSocket) +2. Creates **fixed-size chunks**: 0.25 seconds = 8,000 bytes + - Assumes: 16kHz sample rate, 16-bit mono PCM +3. Prevents cutting audio mid-word (aligned chunks) +4. Publishes each chunk to `audio:stream:{client_id}` via `XADD` +5. Returns Redis message IDs for tracking + +**In-Memory Storage**: Session buffers stored in `AudioStreamProducer._session_buffers` dict + +#### 3. 
Session End Signal + +```python +async def send_session_end_signal(session_id: str) -> None +``` + +**Actions**: +- Publishes special `{"type": "END"}` message to stream +- Signals all consumers to flush buffers and finalize +- Updates session status to `"finalizing"` + +### Data Location + +**Memory**: `chronicle-backend` container (in-memory buffers) + +**Redis**: Published chunks in `audio:stream:{client_id}` (redis container) + +## Dual-Consumer Architecture + +Chronicle uses **Redis Consumer Groups** to enable multiple independent consumers to read the **same audio stream** without message duplication. + +### Consumer Group 1: Transcription + +Two implementations available: + +#### A. Streaming Transcription Consumer + +**File**: `backends/advanced/src/advanced_omi_backend/services/transcription/streaming_consumer.py` + +**Class**: `StreamingTranscriptionConsumer` + +**Consumer Group**: `streaming-transcription` + +**Provider**: Deepgram (WebSocket-based) + +**Process**: +1. Discovers `audio:stream:*` streams dynamically using `SCAN` +2. Opens persistent WebSocket connection to Deepgram per stream +3. Sends audio chunks **immediately** (no buffering) +4. Publishes **interim results** to `transcription:interim:{session_id}` (Redis Pub/Sub) +5. Publishes **final results** to `transcription:results:{session_id}` (Redis Stream) +6. Triggers plugins on final results only +7. ACKs messages with `XACK` to prevent reprocessing +8. Handles END signal: closes WebSocket, cleans up + +**Container**: `chronicle-backend` (Background Task via `BackgroundTaskManager`) + +**Real-time Updates**: Interim results pushed to WebSocket clients via Pub/Sub + +#### B. 
Batch Transcription Consumer + +**File**: `backends/advanced/src/advanced_omi_backend/services/audio_stream/consumer.py` + +**Class**: `BaseAudioStreamConsumer` + +**Consumer Group**: `{provider_name}_workers` (e.g., `deepgram_workers`, `parakeet_workers`) + +**Providers**: Deepgram (batch), Parakeet ASR (offline) + +**Process**: +1. Reads from `audio:stream:{client_id}` using `XREADGROUP` +2. Buffers chunks per session (default: 30 chunks = ~7.5 seconds) +3. When buffer full: + - Combines chunks into single audio buffer + - Transcribes using provider API + - Adjusts word/segment timestamps relative to session start + - Publishes result to `transcription:results:{session_id}` +4. Flushes remaining buffer on END signal +5. ACKs all buffered messages with `XACK` +6. Trims stream to keep only last 1,000 entries (`XTRIM MAXLEN`) + +**Container**: `chronicle-backend` (Background Task) + +**Batching Benefits**: Reduces API calls, improves transcription accuracy (more context) + +### Consumer Group 2: Audio Persistence + +**File**: `backends/advanced/src/advanced_omi_backend/workers/audio_jobs.py` + +**Function**: `audio_streaming_persistence_job()` + +**Consumer Group**: `audio_persistence` + +**Consumer Name**: `persistence-worker-{session_id}` + +**Process**: +1. Reads audio chunks from `audio:stream:{client_id}` using `XREADGROUP` +2. Monitors `conversation:current:{session_id}` for rotation signals +3. On conversation rotation: + - Closes current WAV file + - Opens new WAV file with new conversation ID +4. Writes chunks immediately to disk (real-time persistence) +5. Stores file path in `audio:file:{conversation_id}` (Redis) +6. Handles END signal: closes file, returns statistics +7. 
ACKs messages after writing to disk + +**Container**: `chronicle-backend` (RQ Worker) + +**Output Location**: `backends/advanced/data/chunks/` (volume-mounted) + +**File Format**: `{timestamp_ms}_{client_id}_{conversation_id}.wav` + +### Fan-Out Pattern Visualization + +``` +audio:stream:user01-phone + ↓ + β”œβ”€ Consumer Group: "streaming-transcription" + β”‚ └─ Worker: streaming-worker-12345 + β”‚ β†’ Reads: chunks β†’ Deepgram WS β†’ Results stream + β”‚ + β”œβ”€ Consumer Group: "deepgram_workers" + β”‚ β”œβ”€ Worker: deepgram-worker-67890 + β”‚ β”œβ”€ Worker: deepgram-worker-67891 + β”‚ └─ Reads: chunks β†’ Buffer (30) β†’ Batch API β†’ Results stream + β”‚ + └─ Consumer Group: "audio_persistence" + └─ Worker: persistence-worker-sessionXYZ + β†’ Reads: chunks β†’ WAV file (disk) +``` + +**Key Benefits**: +- **Horizontal scaling**: Multiple workers per group +- **Independent processing**: Each group processes all messages +- **No message loss**: Messages ACKed only after processing +- **Decoupled**: Producer doesn't know about consumers + +## Transcription Results Aggregator + +**File**: `backends/advanced/src/advanced_omi_backend/services/audio_stream/aggregator.py` + +**Class**: `TranscriptionResultsAggregator` + +**Container**: `chronicle-backend` (in-memory, stateless) + +### Methods + +#### Get Combined Results + +```python +async def get_combined_results(session_id: str) -> dict +``` + +**Returns**: +```python +{ + "text": "Full transcript...", + "segments": [SpeakerSegment, ...], + "words": [Word, ...], + "provider": "deepgram", + "chunk_count": 42 +} +``` + +**Process**: +- Reads all entries from `transcription:results:{session_id}` +- For **streaming mode**: Uses latest final result only (supersedes interim) +- For **batch mode**: Combines all chunks sequentially +- Adjusts timestamps across chunks (adds audio offset) +- Merges speaker segments, words + +#### Get Session Results (Raw) + +```python +async def get_session_results(session_id: str) -> 
list[dict] +``` + +**Returns**: Raw list of transcription result messages + +#### Get Real-time Results + +```python +async def get_realtime_results( + session_id: str, + last_id: str = "0-0" +) -> tuple[list[dict], str] +``` + +**Returns**: `(new_results, new_last_id)` + +**Purpose**: Incremental polling for live UI updates + +### Data Location + +**Input**: `transcription:results:{session_id}` stream (redis container) + +**Processing**: In-memory (chronicle-backend container) + +**Output**: Returned to caller (no persistence) + +## Job Queue Orchestration (RQ) + +**Library**: Python RQ (Redis Queue) + +**File**: `backends/advanced/src/advanced_omi_backend/controllers/queue_controller.py` + +**Containers**: +- `chronicle-backend` (enqueues jobs) +- `rq-worker` (executes jobs) + +### Job Pipeline + +``` +Session Starts + ↓ +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ stream_speech_detection_job β”‚ ← Session-level (long-running) +β”‚ - Polls transcription results β”‚ +β”‚ - Analyzes speech content β”‚ +β”‚ - Checks speaker filters β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + ↓ (when speech detected) +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ open_conversation_job β”‚ ← Conversation-level (long-running) +β”‚ - Creates conversation β”‚ +β”‚ - Signals file rotation β”‚ +β”‚ - Monitors activity β”‚ +β”‚ - Detects end conditions β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + ↓ (when conversation ends) +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ Post-Conversation Pipeline β”‚ +β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€ +β”‚ β€’ recognize_speakers_job β”‚ +β”‚ β€’ 
memory_extraction_job β”‚ +β”‚ β€’ generate_title_summary_job β”‚ +β”‚ β€’ dispatch_conversation_completeβ”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ +``` + +### Session-Level Jobs + +#### Speech Detection Job + +**File**: `backends/advanced/src/advanced_omi_backend/workers/transcription_jobs.py` + +**Function**: `stream_speech_detection_job()` + +**Scope**: Entire session (can handle multiple conversations) + +**Max Duration**: 24 hours + +**Process**: +1. Polls `TranscriptionResultsAggregator.get_combined_results()` (1-second intervals) +2. Analyzes speech content: + - Word count > 10 + - Duration > 5 seconds + - Confidence > threshold +3. If speaker filter enabled: checks for enrolled speakers +4. When speech detected: + - Creates conversation in MongoDB + - Enqueues `open_conversation_job` + - **Exits** (restarts when conversation completes) +5. Handles transcription errors (marks session with error flag) + +**RQ Queue**: `speech_detection_queue` (dedicated queue) + +**Container**: `rq-worker` + +### Conversation-Level Jobs + +#### Open Conversation Job + +**File**: `backends/advanced/src/advanced_omi_backend/workers/conversation_jobs.py` + +**Function**: `open_conversation_job()` + +**Scope**: Single conversation + +**Max Duration**: 3 hours + +**Process**: +1. Creates conversation document in MongoDB `conversations` collection +2. Sets `conversation:current:{session_id}` = `conversation_id` (Redis) + - **Triggers audio persistence job to rotate WAV file** +3. Polls for transcription updates (1-second intervals) +4. Tracks speech activity (inactivity timeout = 60 seconds default) +5. Detects end conditions: + - WebSocket disconnect + - User manual stop + - Inactivity timeout +6. Waits for audio file path from persistence job +7. Saves `audio_path` to conversation document +8. Triggers conversation-level plugins +9. Enqueues post-conversation jobs +10. 
Calls `handle_end_of_conversation()` for cleanup + restart + +**RQ Queue**: `default` + +**Container**: `rq-worker` + +#### Audio Persistence Job + +**File**: `backends/advanced/src/advanced_omi_backend/workers/audio_jobs.py` + +**Function**: `audio_streaming_persistence_job()` + +**Scope**: Entire session (parallel with open_conversation_job) + +**Max Duration**: 24 hours + +**Process**: +1. Monitors `conversation:current:{session_id}` for rotation signals +2. For each conversation: + - Opens new WAV file: `{timestamp}_{client_id}_{conversation_id}.wav` + - Writes chunks immediately as they arrive from stream + - Stores file path in `audio:file:{conversation_id}` +3. On rotation signal: + - Closes current file + - Opens new file for next conversation +4. On END signal: + - Closes file + - Returns statistics (chunk count, bytes, duration) + +**Output**: WAV files in `backends/advanced/data/chunks/` + +**Container**: `rq-worker` + +### Post-Conversation Pipeline + +**Streaming conversations**: Use streaming transcript saved during conversation. No batch re-transcription. + +**File uploads**: Batch transcription job runs first, then post-conversation jobs depend on it. + +#### 1. Recognize Speakers Job + +**File**: `backends/advanced/src/advanced_omi_backend/workers/transcription_jobs.py` + +**Function**: `recognize_speakers_job()` + +**Process**: +- Sends audio + segments to speaker recognition service +- Identifies speakers using voice embeddings +- Updates segment speaker labels in MongoDB + +**Optional**: Only runs if `DISABLE_SPEAKER_RECOGNITION=false` + +**Container**: `rq-worker` + +**External Service**: `speaker-recognition` container (if enabled) + +#### 2. 
Memory Extraction Job + +**File**: `backends/advanced/src/advanced_omi_backend/workers/memory_jobs.py` + +**Function**: `memory_extraction_job()` + +**Prerequisite**: Speaker recognition job + +**Process**: +- Uses LLM (OpenAI/Ollama) to extract semantic facts +- Stores embeddings in vector database: + - **Chronicle provider**: Qdrant + - **OpenMemory MCP provider**: External OpenMemory server + +**Container**: `rq-worker` + +**External Services**: +- `ollama` or OpenAI API (LLM) +- `qdrant` or OpenMemory MCP (vector storage) + +#### 3. Generate Title Summary Job + +**File**: `backends/advanced/src/advanced_omi_backend/workers/conversation_jobs.py` + +**Function**: `generate_title_summary_job()` + +**Prerequisite**: Speaker recognition job + +**Process**: +- Uses LLM to generate title, summary, detailed summary +- Updates conversation document in MongoDB + +**Container**: `rq-worker` + +#### 4. Dispatch Conversation Complete Event + +**File**: `backends/advanced/src/advanced_omi_backend/workers/conversation_jobs.py` + +**Function**: `dispatch_conversation_complete_event_job()` + +**Process**: +- Triggers `conversation.complete` plugin event + +**Container**: `rq-worker` + +#### Batch Transcription Job + +**File**: `backends/advanced/src/advanced_omi_backend/workers/transcription_jobs.py` + +**Function**: `transcribe_full_audio_job()` + +**When used**: +- File uploads via `/api/process-audio-files` +- Manual reprocessing via `/api/conversations/{id}/reprocess-transcript` +- NOT used for streaming conversations + +**Process**: +- Reconstructs audio from MongoDB chunks +- Batch transcribes entire audio +- Stores transcript with word-level timestamps + +**Container**: `rq-worker` + +### Session Restart + +**File**: `backends/advanced/src/advanced_omi_backend/utils/conversation_utils.py` + +**Function**: `handle_end_of_conversation()` + +**Process**: +1. Deletes transcription results stream: `transcription:results:{session_id}` +2. 
Increments `session:conversation_count:{session_id}` +3. Checks if session still active (WebSocket connected) +4. If active: Re-enqueues `stream_speech_detection_job` for next conversation +5. Cleans up consumer groups and pending messages + +**Purpose**: Allows continuous recording with multiple conversations per session + +## Data Storage + +### MongoDB Collections + +**Database**: `chronicle` + +**Container**: `mongo` + +**Volume**: `mongodb_data` (persistent) + +#### `conversations` Collection + +**Schema**: +```python +{ + "_id": ObjectId, + "conversation_id": "uuid-string", + "audio_uuid": "session_id", + "user_id": ObjectId, + "client_id": "user01-phone", + + # Content + "title": "Meeting notes", + "summary": "Discussion about...", + "detailed_summary": "Longer summary...", + "transcript": "Full transcript text", + "audio_path": "1704067200000_user01-phone_convid.wav", + + # Versioned Transcripts + "active_transcript_version": "v1", + "transcript_versions": { + "v1": { + "text": "Full transcript", + "segments": [SpeakerSegment], + "words": [Word], + "provider": "deepgram", + "processing_time_seconds": 45.2, + "created_at": "2025-01-11T12:00:00Z" + } + }, + "segments": [SpeakerSegment], # From active version + + # Metadata + "created_at": "2025-01-11T12:00:00Z", + "completed_at": "2025-01-11T12:15:00Z", + "end_reason": "user_stopped|inactivity_timeout|websocket_disconnect", + "deleted": false +} +``` + +**Indexes**: +- `user_id` (for user-scoped queries) +- `client_id` (for device filtering) +- `conversation_id` (unique) + +#### `audio_chunks` Collection + +**Purpose**: Stores raw audio session data + +**Schema**: +```python +{ + "_id": ObjectId, + "audio_uuid": "session_id", + "user_id": ObjectId, + "client_id": "user01-phone", + "created_at": "2025-01-11T12:00:00Z", + "metadata": { ... 
} +} +``` + +**Use Case**: Speech-driven architecture (sessions without conversations) + +#### `users` Collection + +**Purpose**: User accounts, authentication, preferences + +**Schema**: +```python +{ + "_id": ObjectId, + "email": "user@example.com", + "hashed_password": "...", + "is_active": true, + "is_superuser": false, + "created_at": "2025-01-11T12:00:00Z" +} +``` + +### Disk Storage + +**Location**: `backends/advanced/data/chunks/` + +**Container**: `chronicle-backend` (volume-mounted) + +**Volume**: `./backends/advanced/data/chunks:/app/data/chunks` + +**File Format**: WAV files + +**Naming Convention**: `{timestamp_ms}_{client_id}_{conversation_id}.wav` + +**Example**: `1704067200000_user01-phone_550e8400-e29b-41d4-a716-446655440000.wav` + +**Created by**: `audio_streaming_persistence_job()` + +**Read by**: Post-conversation transcription jobs + +**Retention**: Manual cleanup (no automatic deletion) + +### Redis Storage + +**Container**: `redis` + +**Volume**: `redis_data` (persistent) + +| Key Pattern | Type | Purpose | TTL | Created By | +|-------------|------|---------|-----|------------| +| `audio:stream:{client_id}` | Stream | Audio chunks for transcription | None (MAXLEN=25k) | AudioStreamProducer | +| `audio:session:{session_id}` | Hash | Session metadata | 1 hour | AudioStreamProducer | +| `transcription:results:{session_id}` | Stream | Transcription results | Manual delete | Transcription consumers | +| `transcription:interim:{session_id}` | Pub/Sub | Real-time interim results | N/A (ephemeral) | Streaming consumer | +| `conversation:current:{session_id}` | String | Current conversation ID | 24 hours | open_conversation_job | +| `audio:file:{conversation_id}` | String | Audio file path | 24 hours | audio_persistence_job | +| `session:conversation_count:{session_id}` | Counter | Conversation count | 1 hour | handle_end_of_conversation | +| `speech_detection_job:{client_id}` | String | Job ID for cleanup | 1 hour | speech_detection_job | +| 
`rq:job:{job_id}` | Hash | RQ job metadata | 24 hours (default) | RQ | + +### Vector Storage (Memory) + +#### Option A: Qdrant (Chronicle Native Provider) + +**Container**: `qdrant` + +**Volume**: `qdrant_data` (persistent) + +**Ports**: 6333 (HTTP), 6334 (gRPC) + +**Collections**: User-specific collections for semantic embeddings + +**Written by**: `memory_extraction_job()` + +**Read by**: Memory search API (`/api/memories/search`) + +#### Option B: OpenMemory MCP + +**Container**: `openmemory-mcp` (external service) + +**Port**: 8765 + +**Protocol**: MCP (Model Context Protocol) + +**Collections**: Cross-client memory storage + +**Written by**: `memory_extraction_job()` (via MCP provider) + +**Read by**: Memory search API (via MCP provider) + +## Complete End-to-End Flow + +### Step-by-Step Data Journey + +``` +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ 1. AUDIO INPUT β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + WebSocket (/ws) or File Upload (/audio/upload) + ↓ + Container: chronicle-backend + ↓ + AudioStreamProducer.init_session() + - Creates: audio:session:{session_id} (Redis) + - Initializes: In-memory buffer (chronicle-backend container) + ↓ + AudioStreamProducer.add_audio_chunk() + - Buffers: In-memory (chronicle-backend) + - Chunks: Fixed 0.25s chunks (8,000 bytes) + - Publishes: audio:stream:{client_id} (Redis) + - Returns: Redis message IDs + +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ 2. 
SESSION-LEVEL JOB (RQ) β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + stream_speech_detection_job + Container: rq-worker + ↓ + Polls: TranscriptionResultsAggregator.get_combined_results() + Reads: transcription:results:{session_id} (Redis) + ↓ + Analyzes: Word count, duration, confidence + ↓ + When speech detected: + - Creates: Conversation document (MongoDB) + - Enqueues: open_conversation_job (RQ) + - Exits (restarts when conversation ends) + +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ 3a. TRANSCRIPTION CONSUMER (Background Task) β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + StreamingTranscriptionConsumer (or BaseAudioStreamConsumer) + Container: chronicle-backend (Background Task) + ↓ + Reads: audio:stream:{client_id} (Redis, via XREADGROUP) + Consumer Group: streaming-transcription (or batch provider) + ↓ + STREAMING PATH: + β€’ Opens: WebSocket to Deepgram + β€’ Sends: Chunks immediately (no buffering) + β€’ Publishes Interim: transcription:interim:{session_id} (Redis Pub/Sub) + β€’ Publishes Final: transcription:results:{session_id} (Redis Stream) + β€’ Triggers: Plugins on final results + + BATCH PATH: + β€’ Buffers: 30 chunks (~7.5s) in memory (chronicle-backend) + β€’ Combines: All buffered chunks + β€’ Transcribes: Via provider API (Deepgram/Parakeet) + β€’ Adjusts: Timestamps relative to session start + β€’ Publishes: transcription:results:{session_id} (Redis Stream) + 
+β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ 3b. AUDIO PERSISTENCE CONSUMER (RQ Job) β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + audio_streaming_persistence_job + Container: rq-worker + ↓ + Reads: audio:stream:{client_id} (Redis, via XREADGROUP) + Consumer Group: audio_persistence + ↓ + Monitors: conversation:current:{session_id} (Redis) + ↓ + For each conversation: + β€’ Opens: New WAV file (data/chunks/, chronicle-backend volume) + β€’ Writes: Chunks immediately (real-time) + β€’ Stores: audio:file:{conversation_id} = path (Redis) + ↓ + On rotation signal: + β€’ Closes: Current file + β€’ Opens: New file for next conversation + ↓ + On END signal: + β€’ Closes: File + β€’ Returns: Statistics (chunks, bytes, duration) + +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ 4. 
CONVERSATION-LEVEL JOB (RQ) β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + open_conversation_job + Container: rq-worker + ↓ + Creates: Conversation document (MongoDB conversations collection) + ↓ + Sets: conversation:current:{session_id} = conversation_id (Redis) + β†’ Triggers audio persistence job to rotate WAV file + ↓ + Polls: TranscriptionResultsAggregator for updates (1s intervals) + Reads: transcription:results:{session_id} (Redis) + ↓ + Tracks: Speech activity (inactivity timeout = 60s) + ↓ + Detects End: + - Inactivity (60s) + - User manual stop + - WebSocket disconnect + ↓ + Waits: For audio file path from persistence job + Reads: audio:file:{conversation_id} (Redis) + ↓ + Saves: audio_path to conversation document (MongoDB) + ↓ + Enqueues: POST-CONVERSATION PIPELINE (RQ) + +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ 5. 
POST-CONVERSATION PIPELINE (RQ)                                     │
└────────────────────────────────────────────────────────────────────┘
  Jobs run after the conversation ends; indented jobs wait for their parent, sibling branches run in parallel
  Container: rq-worker
       ↓
  Reads: Audio file from disk (data/chunks/*.wav)

  ┌─ transcribe_full_audio_job
  │   - Batch transcribes: Complete audio file
  │   - Validates: Meaningful speech
  │   - Marks deleted: If no speech
  │   - Stores: MongoDB (transcript, segments, words)
  │
  │   └─ recognize_speakers_job (if enabled)
  │       - Sends: Audio + segments to speaker-recognition service
  │       - Identifies: Speakers via voice embeddings
  │       - Updates: MongoDB (segment speaker labels)
  │
  │   └─ memory_extraction_job
  │       - Uses: LLM (OpenAI/Ollama) to extract facts
  │       - Stores: Qdrant (Chronicle) or OpenMemory MCP (vector DB)
  │
  └─ generate_title_summary_job
      - Uses: LLM (OpenAI/Ollama)
      - Generates: Title, summary, detailed_summary
      - Stores: MongoDB (conversation document)

  └─ dispatch_conversation_complete_event_job
      - Triggers: conversation.complete plugins
      - Only for: File uploads (not streaming)

  All results stored: MongoDB conversations collection

┌──────────────────────────────────────────────────────────────────────┐
│ 6. SESSION RESTART                                                   │
└──────────────────────────────────────────────────────────────────────┘
  handle_end_of_conversation()
  Container: chronicle-backend
       ↓
  Deletes: transcription:results:{session_id} (Redis)
       ↓
  Increments: session:conversation_count:{session_id} (Redis)
       ↓
  Checks: Session still active? 
(WebSocket connected) + ↓ + If active: + - Re-enqueues: stream_speech_detection_job (RQ) + - Session remains: "active" for next conversation +``` + +### Data Locations Summary + +| Stage | Data Type | Location | Container | +|-------|-----------|----------|-----------| +| Input | Audio bytes | In-memory buffers | chronicle-backend | +| Producer | Fixed chunks | `audio:stream:{client_id}` | redis | +| Session metadata | Hash | `audio:session:{session_id}` | redis | +| Transcription consumer | Interim results | `transcription:interim:{session_id}` (Pub/Sub) | redis | +| Transcription consumer | Final results | `transcription:results:{session_id}` (Stream) | redis | +| Audio persistence | WAV files | `data/chunks/*.wav` (disk volume) | chronicle-backend (volume) | +| Audio persistence | File paths | `audio:file:{conversation_id}` | redis | +| Conversation job | Conversation doc | MongoDB `conversations` | mongo | +| Post-processing | Transcript | MongoDB `conversations` | mongo | +| Post-processing | Memories | Qdrant or OpenMemory MCP | qdrant / openmemory-mcp | +| Post-processing | Title/summary | MongoDB `conversations` | mongo | + +## Key Design Patterns + +### 1. Speech-Driven Architecture + +**Principle**: Conversations only created when speech is detected + +**Benefits**: +- Clean user experience (no noise-only sessions in UI) +- Reduced memory processing load +- Automatic quality filtering + +**Implementation**: +- `audio_chunks` collection: Always stores sessions +- `conversations` collection: Only created with speech +- Speech detection: Analyzes word count, duration, confidence + +### 2. Versioned Processing + +**Principle**: Store multiple versions of transcripts/memories + +**Benefits**: +- Reprocess without losing originals +- A/B testing different providers +- Rollback to previous versions + +**Implementation**: +- `transcript_versions` dict with version IDs (v1, v2, ...) 
+- `active_transcript_version` pointer +- `segments` field mirrors active version (quick access) + +### 3. Session-Level vs Conversation-Level + +**Session**: WebSocket connection lifetime (multiple conversations) +- Duration: Up to 24 hours +- Job: `stream_speech_detection_job` +- Purpose: Continuous monitoring for speech + +**Conversation**: Speech burst between silence periods +- Duration: Typically minutes +- Job: `open_conversation_job` +- Purpose: Process single meaningful exchange + +**Benefits**: +- Continuous recording without manual start/stop +- Automatic conversation segmentation +- Efficient resource usage (one session, many conversations) + +### 4. Job Metadata Cascading + +**Pattern**: Parent jobs link to child jobs + +**Example**: +``` +speech_detection_job + ↓ job_id stored in +audio:session:{session_id} + ↓ creates +open_conversation_job + ↓ job_id stored in +conversation document + ↓ creates +post-conversation jobs (parallel) +``` + +**Benefits**: +- Job grouping and cleanup +- Dependency tracking +- Debugging (trace job lineage) + +### 5. Real-Time + Batch Hybrid + +**Real-Time Path** (Streaming Consumer): +- Low latency (interim results in <1 second) +- WebSocket to Deepgram +- Publishes to Pub/Sub for live UI updates + +**Batch Path** (Batch Consumer): +- High accuracy (more context) +- Buffers 7.5 seconds +- API-based transcription + +**Both paths** write to same `transcription:results:{session_id}` stream + +**Benefits**: +- Live UI updates (interim results) +- Accurate final results (batch processing) +- Provider flexibility (switch between streaming/batch) + +### 6. 
Fan-Out via Redis Consumer Groups + +**Pattern**: Multiple consumer groups read same stream + +**Example**: `audio:stream:{client_id}` consumed by: +- Transcription consumer group +- Audio persistence consumer group + +**Benefits**: +- Parallel processing paths +- Horizontal scaling (multiple workers per group) +- No message duplication (each group processes independently) + +### 7. File Rotation via Redis Signals + +**Pattern**: Conversation job signals persistence job via Redis key + +**Implementation**: +```python +# Conversation job +redis.set(f"conversation:current:{session_id}", conversation_id) + +# Persistence job (monitors key) +current_conv = redis.get(f"conversation:current:{session_id}") +if current_conv != last_conv: + close_current_file() + open_new_file(current_conv) +``` + +**Benefits**: +- Decoupled jobs (no direct communication) +- Real-time file rotation +- Multiple files per session (one per conversation) + +## Failure Handling + +### Transcription Errors + +**Detection**: `stream_speech_detection_job` polls results + +**Action**: +- Sets `transcription_error` field in `audio:session:{session_id}` +- Logs error for debugging +- Session remains active (can recover) + +### No Meaningful Speech + +**Detection**: `transcribe_full_audio_job` validates transcript + +**Criteria**: +- Word count < 10 +- Duration < 5 seconds +- All words low confidence + +**Action**: +- Marks conversation `deleted=True` +- Sets `end_reason="no_meaningful_speech"` +- Conversation hidden from UI + +### Audio File Not Ready + +**Detection**: `open_conversation_job` waits for file path + +**Timeout**: 30 seconds (configurable) + +**Action**: +- Marks conversation `deleted=True` +- Sets `end_reason="audio_file_not_ready"` +- Logs error for debugging + +### Job Zombies (Stuck Jobs) + +**Detection**: `check_job_alive()` utility + +**Method**: Checks Redis for job existence + +**Action**: +- Returns `False` if job missing +- Caller can retry or fail gracefully + +### Dead 
Consumers + +**Detection**: Consumer group lag monitoring + +**Cleanup**: +- Removes idle consumers (>30 seconds) +- Claims pending messages from dead consumers +- Redistributes to active workers + +### Stream Trimming + +**Prevention**: Streams don't grow unbounded + +**Implementation**: +- `XTRIM MAXLEN 25000` on `audio:stream:{client_id}` +- Keeps last 25k messages (~104 minutes @ 0.25s chunks) +- Deletes `transcription:results:{session_id}` after conversation ends + +### Session Timeout + +**Max Duration**: 24 hours + +**Action**: +- Jobs exit gracefully +- Session marked `"complete"` +- Resources cleaned up (streams deleted, consumer groups removed) + +--- + +## Conclusion + +Chronicle's audio pipeline is designed for: +- **Real-time processing**: Low-latency transcription and live UI updates +- **Horizontal scalability**: Redis Consumer Groups enable multiple workers +- **Fault tolerance**: Decoupled components, job retries, graceful error handling +- **Resource efficiency**: Speech-driven architecture filters noise automatically +- **Flexibility**: Pluggable providers (Deepgram/Parakeet, OpenAI/Ollama, Qdrant/OpenMemory) + +All coordinated through **Redis Streams** for data flow and **RQ** for orchestration, with **MongoDB** for final storage and **disk** for audio archives. diff --git a/app/README.md b/app/README.md index d73dd748..e85e83e5 100644 --- a/app/README.md +++ b/app/README.md @@ -120,14 +120,14 @@ The app connects to any backend that accepts OPUS audio streams: 2. 
**Advanced Backend** (`backends/advanced/`) - Full transcription and memory features - Real-time processing with speaker recognition - - WebSocket endpoint: `/ws_pcm` + - WebSocket endpoint: `/ws?codec=pcm` ### Connection Setup #### Local Development ``` -Backend URL: ws://[machine-ip]:8000/ws_pcm -Example: ws://192.168.1.100:8000/ws_pcm +Backend URL: ws://[machine-ip]:8000/ws?codec=pcm +Example: ws://192.168.1.100:8000/ws?codec=pcm ``` #### Public Access (Production) @@ -138,7 +138,7 @@ Use ngrok or similar tunneling service: ngrok http 8000 # Use provided URL in app -Backend URL: wss://[ngrok-subdomain].ngrok.io/ws_pcm +Backend URL: wss://[ngrok-subdomain].ngrok.io/ws?codec=pcm ``` ### Configuration Steps @@ -147,8 +147,8 @@ Backend URL: wss://[ngrok-subdomain].ngrok.io/ws_pcm 2. **Open the mobile app** 3. **Navigate to Settings** 4. **Enter Backend URL**: - - Local: `ws://[your-ip]:8000/ws_pcm` - - Public: `wss://[your-domain]/ws_pcm` + - Local: `ws://[your-ip]:8000/ws?codec=pcm` + - Public: `wss://[your-domain]/ws?codec=pcm` 5. 
**Save configuration** ## Phone Audio Streaming (NEW) @@ -176,7 +176,7 @@ Stream audio directly from your phone's microphone to Chronicle backend, bypassi - **iOS**: iOS 13+ with microphone permissions - **Android**: Android API 21+ with microphone permissions - **Network**: Stable connection to Chronicle backend -- **Backend**: Advanced backend running with `/ws_pcm` endpoint +- **Backend**: Advanced backend running with `/ws?codec=pcm` endpoint #### Switching Audio Sources - **Mutual Exclusion**: Cannot use Bluetooth and phone audio simultaneously @@ -187,7 +187,7 @@ Stream audio directly from your phone's microphone to Chronicle backend, bypassi #### Audio Not Streaming - **Check Permissions**: Ensure microphone access granted -- **Verify Backend URL**: Confirm `ws://[ip]:8000/ws_pcm` format +- **Verify Backend URL**: Confirm `ws://[ip]:8000/ws?codec=pcm` format - **Network Connection**: Test backend connectivity - **Authentication**: Verify JWT token is valid @@ -292,7 +292,7 @@ curl -i -N -H "Connection: Upgrade" \ -H "Upgrade: websocket" \ -H "Sec-WebSocket-Key: test" \ -H "Sec-WebSocket-Version: 13" \ - http://[backend-ip]:8000/ws_pcm + http://[backend-ip]:8000/ws?codec=pcm ``` ## Development @@ -338,7 +338,7 @@ npx expo build:android ### WebSocket Communication ```javascript // Connect to backend -const ws = new WebSocket('ws://backend-url:8000/ws_pcm'); +const ws = new WebSocket('ws://backend-url:8000/ws?codec=pcm'); // Send audio data ws.send(audioBuffer); diff --git a/app/app/components/BackendStatus.tsx b/app/app/components/BackendStatus.tsx index 75fdd7a8..4f55d37f 100644 --- a/app/app/components/BackendStatus.tsx +++ b/app/app/components/BackendStatus.tsx @@ -208,9 +208,9 @@ export const BackendStatus: React.FC = ({ - Enter the WebSocket URL of your backend server. Simple backend: http://localhost:8000/ (no auth). + Enter the WebSocket URL of your backend server. Simple backend: http://localhost:8000/ (no auth). 
Advanced backend: http://localhost:8080/ (requires login). Status is automatically checked. - The websocket URL can be different or the same as the HTTP URL, with /ws_omi suffix + The websocket URL can be different or the same as the HTTP URL, with /ws endpoint and codec parameter (e.g., /ws?codec=pcm) ); diff --git a/app/app/index.tsx b/app/app/index.tsx index fc924d92..649a2e2b 100644 --- a/app/app/index.tsx +++ b/app/app/index.tsx @@ -322,10 +322,16 @@ export default function App() { // Convert HTTP/HTTPS to WS/WSS protocol finalWebSocketUrl = finalWebSocketUrl.replace(/^http:/, 'ws:').replace(/^https:/, 'wss:'); - // Ensure /ws_pcm endpoint is included - if (!finalWebSocketUrl.includes('/ws_pcm')) { - // Remove trailing slash if present, then add /ws_pcm - finalWebSocketUrl = finalWebSocketUrl.replace(/\/$/, '') + '/ws_pcm'; + // Ensure /ws endpoint is included + if (!finalWebSocketUrl.includes('/ws')) { + // Remove trailing slash if present, then add /ws + finalWebSocketUrl = finalWebSocketUrl.replace(/\/$/, '') + '/ws'; + } + + // Add codec parameter if not present + if (!finalWebSocketUrl.includes('codec=')) { + const separator = finalWebSocketUrl.includes('?') ? 
'&' : '?'; + finalWebSocketUrl = finalWebSocketUrl + separator + 'codec=pcm'; } // Check if this is the advanced backend (requires authentication) or simple backend diff --git a/backends/advanced/.dockerignore b/backends/advanced/.dockerignore index 2dd9b44f..f0f7f05c 100644 --- a/backends/advanced/.dockerignore +++ b/backends/advanced/.dockerignore @@ -17,5 +17,5 @@ !nginx.conf.template !start.sh !start-k8s.sh -!start-workers.sh +!worker_orchestrator.py !Caddyfile \ No newline at end of file diff --git a/backends/advanced/.env.template b/backends/advanced/.env.template index a63ab6f5..818b47b6 100644 --- a/backends/advanced/.env.template +++ b/backends/advanced/.env.template @@ -1,219 +1,106 @@ # ======================================== -# GETTING STARTED +# Chronicle Backend - Secrets Only # ======================================== +# This file contains ONLY secret values (API keys, passwords, tokens). +# All other configuration is in config/config.yml. +# +# Setup: # 1. Copy this file to .env: cp .env.template .env -# 2. Fill in your API keys below (at minimum: DEEPGRAM_API_KEY, OPENAI_API_KEY) -# 3. Run: docker compose up --build -d -# 4. For testing: ./run-test.sh (requires API keys to be set) - -# This key is used to sign your JWT token, just make it random and long -AUTH_SECRET_KEY= - -# This is the password for the admin user -ADMIN_PASSWORD= - -# Admin email (defaults to admin@example.com if not set) -ADMIN_EMAIL=admin@example.com +# 2. Fill in your API keys and secrets below +# 3. Configure non-secret settings in config/config.yml +# 4. 
Run: docker compose up --build -d # ======================================== -# LLM CONFIGURATION (Standard) +# Authentication Secrets # ======================================== -# LLM Provider: "openai" or "ollama" (default: openai) -LLM_PROVIDER=openai +# JWT signing key (generate a long random string) +AUTH_SECRET_KEY= -# OpenAI or OpenAI-compatible API configuration -OPENAI_API_KEY=your-openai-key-here -OPENAI_BASE_URL=https://api.openai.com/v1 -OPENAI_MODEL=gpt-4o-mini +# Admin account password +ADMIN_PASSWORD= -# For Ollama (OpenAI-compatible mode): -# LLM_PROVIDER=ollama -# OLLAMA_BASE_URL=dummy -# OLLAMA_BASE_URL=http://ollama:11434/v1 -# OLLAMA_MODEL=llama3.1:latest -# OLLAMA_EMBEDDER_MODEL=nomic-embed-text:latest +# Admin email address +ADMIN_EMAIL=admin@example.com # ======================================== -# CHAT INTERFACE CONFIGURATION (Optional) +# LLM API Keys # ======================================== -# Chat-specific LLM model (defaults to OPENAI_MODEL if not set) -# CHAT_LLM_MODEL=gpt-4o-mini - -# Chat temperature for more conversational responses (defaults to 0.7) -# CHAT_TEMPERATURE=0.7 +# OpenAI API key (or OpenAI-compatible provider) +OPENAI_API_KEY= # ======================================== -# SPEECH-TO-TEXT CONFIGURATION (API Keys Only) +# Transcription API Keys # ======================================== -# Provider selection is in config.yml (defaults.stt) -# Deepgram (cloud-based, recommended) +# Deepgram API key (for cloud-based transcription) DEEPGRAM_API_KEY= -# Note: Parakeet ASR URL configured in config.yml - # ======================================== -# SPEECH DETECTION CONFIGURATION +# Speaker Recognition # ======================================== -# Speech detection settings for conversation creation (speech-driven architecture) -# Only meaningful speech creates conversations - silence/noise is filtered out - -# Minimum words required to create a conversation (default: 5) -SPEECH_DETECTION_MIN_WORDS=5 - -# Minimum word confidence 
threshold (0.0-1.0, default: 0.5) -# Used for both conversation creation and speech gap analysis -SPEECH_DETECTION_MIN_CONFIDENCE=0.5 - -# Batch transcription monitoring (for batch providers like Parakeet) -TRANSCRIPTION_BUFFER_SECONDS=120 # Trigger transcription every N seconds - -# Auto-stop thresholds -SPEECH_INACTIVITY_THRESHOLD_SECONDS=60 # Close conversation after N seconds of no speech - -# Speaker enrollment filter (default: false) -# When enabled, only creates conversations when enrolled speakers are detected -# Requires speaker recognition service to be running and speakers to be enrolled -# Set to "true" to enable, "false" or omit to disable -RECORD_ONLY_ENROLLED_SPEAKERS=false +# Hugging Face token (for PyAnnote speaker recognition models) +HF_TOKEN= # ======================================== -# DATABASE CONFIGURATION +# Optional Services # ======================================== -# MongoDB for conversations and user data (defaults to mongodb://mongo:27017) -MONGODB_URI=mongodb://mongo:27017 - -# MongoDB database name (new installations use 'chronicle', legacy installations use 'friend-lite') -MONGODB_DATABASE=chronicle +# Neo4j password (if using Neo4j for graph memory) +NEO4J_PASSWORD= -# Qdrant for vector memory storage (defaults to qdrant) -QDRANT_BASE_URL=qdrant +# Langfuse API keys (for LLM observability) +LANGFUSE_PUBLIC_KEY= +LANGFUSE_SECRET_KEY= +# Tailscale auth key (for remote service access) +TS_AUTHKEY= # ======================================== -# MEMORY PROVIDER CONFIGURATION +# Plugin Configuration # ======================================== +# Plugin-specific configuration is in: backends/advanced/src/advanced_omi_backend/plugins/{plugin_id}/config.yml +# Plugin orchestration (enabled, events) is in: config/plugins.yml +# This section contains ONLY plugin secrets -# Memory Provider: "chronicle" (default), "openmemory_mcp", or "mycelia" -# -# Chronicle (default): In-house memory system with full control -# - Custom LLM-powered extraction 
with individual fact storage -# - Smart deduplication and memory updates (ADD/UPDATE/DELETE) -# - Direct Qdrant vector storage -# - No external dependencies -# -# OpenMemory MCP: Delegates to external OpenMemory MCP server -# - Professional memory processing with cross-client compatibility -# - Works with Claude Desktop, Cursor, Windsurf, etc. -# - Web UI at http://localhost:8765 -# - Requires external server setup -# -# Mycelia: Full-featured personal memory timeline -# - Voice, screenshots, and text capture -# - Timeline UI with waveform playback -# - Conversation extraction and semantic search -# - OAuth federation for cross-instance sharing -# - Requires Mycelia server setup (extras/mycelia) -# -# See MEMORY_PROVIDERS.md for detailed comparison -MEMORY_PROVIDER=chronicle - -# ---------------------------------------- -# OpenMemory MCP Configuration -# (Only needed if MEMORY_PROVIDER=openmemory_mcp) -# ---------------------------------------- -# First start the external server: -# cd extras/openmemory-mcp && docker compose up -d -# -# OPENMEMORY_MCP_URL=http://host.docker.internal:8765 -# OPENMEMORY_CLIENT_NAME=chronicle -# OPENMEMORY_USER_ID=openmemory -# OPENMEMORY_TIMEOUT=30 - -# ---------------------------------------- -# Mycelia Configuration -# (Only needed if MEMORY_PROVIDER=mycelia) -# ---------------------------------------- -# First start Mycelia: -# cd extras/mycelia && docker compose up -d redis mongo mongo-search -# cd extras/mycelia/backend && deno task dev -# -# IMPORTANT: JWT_SECRET in Mycelia backend/.env must match AUTH_SECRET_KEY above -# MYCELIA_URL=http://host.docker.internal:5173 -# MYCELIA_DB=mycelia # Database name (use mycelia_test for test environment) -# MYCELIA_TIMEOUT=30 +# --------------------------------------- +# Home Assistant Plugin +# --------------------------------------- +# Enable in config/plugins.yml +# Configure in backends/advanced/src/advanced_omi_backend/plugins/homeassistant/config.yml -# 
======================================== -# OPTIONAL FEATURES -# ======================================== +# Home Assistant server URL +HA_URL=http://homeassistant.local:8123 -NEO4J_HOST=neo4j-mem0 -NEO4J_USER=neo4j -NEO4J_PASSWORD= - -# Debug directory for troubleshooting -DEBUG_DIR=./data/debug_dir - -# Ngrok for external access (if using ngrok from docker-compose) -# NGROK_AUTHTOKEN= - -# Speaker recognition service -# HF_TOKEN= -# SPEAKER_SERVICE_URL=http://speaker-recognition:8001 +# Home Assistant long-lived access token +# Get from: Profile β†’ Security β†’ Long-Lived Access Tokens +HA_TOKEN= -# Audio processing settings -# NEW_CONVERSATION_TIMEOUT_MINUTES=1.5 -# AUDIO_CROPPING_ENABLED=true -# MIN_SPEECH_SEGMENT_DURATION=1.0 -# CROPPING_CONTEXT_PADDING=0.1 +# Wake word for voice commands (optional, default: vivi) +HA_WAKE_WORD=vivi -# ======================================== -# SPEECH-DRIVEN CONVERSATIONS CONFIGURATION -# ======================================== +# Request timeout in seconds (optional, default: 30) +HA_TIMEOUT=30 -# Note: File rotation for long sessions is not yet implemented -# Audio sessions currently create single files that grow until the session ends +# --------------------------------------- +# Email Summarizer Plugin +# --------------------------------------- +# Enable in config/plugins.yml +# Configure in backends/advanced/src/advanced_omi_backend/plugins/email_summarizer/config.yml +# SMTP server configuration +# For Gmail: Use App Password (requires 2FA enabled) +# 1. Go to Google Account β†’ Security β†’ 2-Step Verification +# 2. Scroll to "App passwords" β†’ Generate password for "Mail" +# 3. 
Use the 16-character password below (no spaces) +SMTP_HOST=smtp.gmail.com +SMTP_PORT=587 +SMTP_USERNAME=your-email@gmail.com +SMTP_PASSWORD=your-app-password-here +SMTP_USE_TLS=true -# ======================================== -# PUBLIC ACCESS CONFIGURATION -# ======================================== -# These settings control how the browser accesses the backend for audio playback - -# The IP address or hostname where your backend is publicly accessible from the browser -# Examples: -# - For local development: localhost or 127.0.0.1 -# - For LAN access: your machine's IP (e.g., 192.168.1.100) -# - For VPN/Tailscale access: your VPN IP (e.g., 100.64.x.x for Tailscale) -# - For internet access: your domain or public IP (e.g., friend.example.com) -# Note: This must be accessible from your browser, not from the Docker container -HOST_IP=localhost - -# Backend API port (where audio files are served) -BACKEND_PUBLIC_PORT=8000 - -# WebUI port (defaults to 5173 for Vite dev server) -WEBUI_PORT=5173 - -# CORS origins (comma-separated list of allowed origins for browser requests) -# Note: Tailscale IPs (100.x.x.x) are automatically supported via regex -# For HTTPS access, add HTTPS origins after running ./init.sh -# Examples: -# - Local HTTP: http://localhost:5173,http://127.0.0.1:5173 -# - Local HTTPS: https://localhost,https://127.0.0.1 -# - Tailscale HTTPS: https://100.x.x.x -# - Custom: http://192.168.1.100:5173,https://192.168.1.100 -CORS_ORIGINS=http://localhost:5173,http://localhost:3000,http://127.0.0.1:5173,http://127.0.0.1:3000 - -# Memory settings -# MEM0_TELEMETRY=False - -# Langfuse settings -LANGFUSE_PUBLIC_KEY="" -LANGFUSE_SECRET_KEY="" -LANGFUSE_HOST="http://x.x.x.x:3002" -LANGFUSE_ENABLE_TELEMETRY=False \ No newline at end of file +# Email sender information +FROM_EMAIL=noreply@chronicle.ai +FROM_NAME=Chronicle AI diff --git a/backends/advanced/Dockerfile b/backends/advanced/Dockerfile index 352bcfe9..886c1f32 100644 --- a/backends/advanced/Dockerfile +++ 
b/backends/advanced/Dockerfile @@ -1,6 +1,9 @@ -FROM python:3.12-slim-bookworm AS builder +# ============================================ +# Base stage - common setup +# ============================================ +FROM python:3.12-slim-bookworm AS base -# Install system dependencies for building +# Install system dependencies RUN apt-get update && \ apt-get install -y --no-install-recommends \ build-essential \ @@ -9,40 +12,59 @@ RUN apt-get update && \ curl \ ffmpeg \ && rm -rf /var/lib/apt/lists/* - # portaudio19-dev \ # Install uv COPY --from=ghcr.io/astral-sh/uv:0.6.10 /uv /uvx /bin/ -# Set up the working directory +# Set up working directory WORKDIR /app -# Copy package structure and dependency files first +# Copy package structure and dependency files COPY pyproject.toml README.md ./ COPY uv.lock . RUN mkdir -p src/advanced_omi_backend COPY src/advanced_omi_backend/__init__.py src/advanced_omi_backend/ -# Install dependencies using uv with deepgram extra -# Use cache mount for BuildKit, fallback for legacy builds -# RUN --mount=type=cache,target=/root/.cache/uv \ -# uv sync --extra deepgram -# Fallback for legacy Docker builds (CI compatibility) + +# ============================================ +# Production stage - production dependencies only +# ============================================ +FROM base AS prod + +# Install production dependencies only RUN uv sync --extra deepgram # Copy all application code COPY . . 
-# Copy configuration files if they exist, otherwise they will be created from templates at runtime -# The files are expected to exist, but we handle the case where they don't gracefully - +# Copy configuration files if they exist COPY diarization_config.json* ./ +# Copy and make startup script executable +COPY start.sh ./ +RUN chmod +x start.sh + +# Run the application +CMD ["./start.sh"] + + +# ============================================ +# Dev/Test stage - includes test dependencies +# ============================================ +FROM base AS dev + +# Install production + test dependencies +RUN uv sync --extra deepgram --group test + +# Copy all application code +COPY . . + +# Copy configuration files if they exist +COPY diarization_config.json* ./ -# Copy and make startup scripts executable +# Copy and make startup script executable COPY start.sh ./ -COPY start-workers.sh ./ -RUN chmod +x start.sh start-workers.sh +RUN chmod +x start.sh -# Run the application with workers +# Run the application CMD ["./start.sh"] diff --git a/backends/advanced/Dockerfile.k8s b/backends/advanced/Dockerfile.k8s index b746752a..6500ccf5 100644 --- a/backends/advanced/Dockerfile.k8s +++ b/backends/advanced/Dockerfile.k8s @@ -36,9 +36,9 @@ COPY . . 
# Copy memory config (created by init.sh from template) -# Copy and make K8s startup scripts executable -COPY start-k8s.sh start-workers.sh ./ -RUN chmod +x start-k8s.sh start-workers.sh +# Copy and make K8s startup script executable +COPY start-k8s.sh ./ +RUN chmod +x start-k8s.sh # Activate virtual environment in PATH ENV PATH="/app/.venv/bin:$PATH" diff --git a/backends/advanced/Docs/architecture.md b/backends/advanced/Docs/architecture.md index 7c6427bb..739f0ed7 100644 --- a/backends/advanced/Docs/architecture.md +++ b/backends/advanced/Docs/architecture.md @@ -22,7 +22,7 @@ graph TB %% Main WebSocket Server subgraph "WebSocket Server" - WS["/ws_pcm endpoint"] + WS["/ws?codec=pcm endpoint"] AUTH[JWT Auth] end @@ -237,13 +237,13 @@ Wyoming is a peer-to-peer protocol for voice assistants that combines JSONL (JSO #### Backend Implementation -**Advanced Backend (`/ws_pcm`)**: +**Advanced Backend (`/ws?codec=pcm`)**: - **Full Wyoming Protocol Support**: Parses all Wyoming events for comprehensive session management - **Session State Tracking**: Only processes audio chunks when session is active (after receiving audio-start) - **Conversation Boundaries**: Uses Wyoming audio-start/stop events to define precise conversation segments - **PCM Audio Processing**: Direct processing of PCM audio data from all apps -**Advanced Backend (`/ws_omi`)**: +**Advanced Backend (`/ws?codec=opus`)**: - **Wyoming Protocol + Opus Decoding**: Combines Wyoming session management with OMI Opus decoding - **Continuous Streaming**: OMI devices stream continuously, audio-start/stop events are optional - **Timestamp Preservation**: Uses timestamps from Wyoming headers when provided @@ -1006,8 +1006,8 @@ src/advanced_omi_backend/ - `POST /api/conversations/{conversation_id}/activate-transcript` - Switch transcript version - `POST /api/conversations/{conversation_id}/activate-memory` - Switch memory version - `POST /api/audio/upload` - Batch audio file upload and processing -- WebSocket 
`/ws_omi` - Real-time Opus audio streaming with Wyoming protocol (OMI devices) -- WebSocket `/ws_pcm` - Real-time PCM audio streaming with Wyoming protocol (all apps) +- WebSocket `/ws?codec=opus` - Real-time Opus audio streaming with Wyoming protocol (OMI devices) +- WebSocket `/ws?codec=pcm` - Real-time PCM audio streaming with Wyoming protocol (all apps) ### Authentication & Authorization - **JWT Tokens**: All API endpoints require valid JWT authentication diff --git a/backends/advanced/Docs/auth.md b/backends/advanced/Docs/auth.md index acbf8df4..7998750e 100644 --- a/backends/advanced/Docs/auth.md +++ b/backends/advanced/Docs/auth.md @@ -100,13 +100,13 @@ curl -X POST "http://localhost:8000/auth/jwt/login" \ #### Token-based (Recommended) ```javascript -const ws = new WebSocket('ws://localhost:8000/ws_pcm?token=JWT_TOKEN&device_name=phone'); +const ws = new WebSocket('ws://localhost:8000/ws?codec=pcm&token=JWT_TOKEN&device_name=phone'); ``` #### Cookie-based ```javascript // Requires existing cookie from web login -const ws = new WebSocket('ws://localhost:8000/ws_pcm?device_name=phone'); +const ws = new WebSocket('ws://localhost:8000/ws?codec=pcm&device_name=phone'); ``` ## Client ID Management @@ -183,8 +183,8 @@ COOKIE_SECURE=false - `PATCH /api/users/me` - Update user profile ### WebSocket Endpoints -- `ws://host/ws` - Opus audio stream with auth -- `ws://host/ws_pcm` - PCM audio stream with auth +- `ws://host/ws?codec=opus` - Opus audio stream with auth +- `ws://host/ws?codec=pcm` - PCM audio stream with auth (default) ## Error Handling diff --git a/backends/advanced/Docs/memories.md b/backends/advanced/Docs/memories.md index cae98383..08ae393e 100644 --- a/backends/advanced/Docs/memories.md +++ b/backends/advanced/Docs/memories.md @@ -98,7 +98,7 @@ MEM0_CONFIG = { "vector_store": { "provider": "qdrant", "config": { - "collection_name": "omi_memories", + "collection_name": "chronicle_memories", "embedding_model_dims": 768, "host": QDRANT_BASE_URL, "port": 
6333, @@ -499,7 +499,7 @@ This will: 3. **Search Not Working** - Ensure embedding model is available in Ollama - Check vector dimensions match between embedder and Qdrant - - Verify collection has vectors: `curl http://localhost:6333/collections/omi_memories` + - Verify collection has vectors: `curl http://localhost:6333/collections/chronicle_memories` ### Required Ollama Models diff --git a/backends/advanced/Docs/memory-configuration-guide.md b/backends/advanced/Docs/memory-configuration-guide.md index 12796e13..66244003 100644 --- a/backends/advanced/Docs/memory-configuration-guide.md +++ b/backends/advanced/Docs/memory-configuration-guide.md @@ -65,7 +65,7 @@ memory: - **Embeddings**: `text-embedding-3-small`, `text-embedding-3-large` #### Ollama Models (Local) -- **LLM**: `llama3`, `mistral`, `qwen2.5` +- **LLM**: `llama3`, `qwen2.5` - **Embeddings**: `nomic-embed-text`, `all-minilm` ## Hot Reload diff --git a/backends/advanced/Docs/plugin-configuration.md b/backends/advanced/Docs/plugin-configuration.md new file mode 100644 index 00000000..a4c7b222 --- /dev/null +++ b/backends/advanced/Docs/plugin-configuration.md @@ -0,0 +1,399 @@ +# Plugin Configuration Architecture + +Chronicle uses a clean separation of concerns for plugin configuration, dividing settings across three locations based on their purpose. + +## Configuration Files + +### 1. `config/plugins.yml` - Orchestration Only + +**Purpose**: Controls which plugins are enabled and what events they listen to + +**Contains**: +- Plugin enable/disable flags +- Event subscriptions +- Trigger conditions (wake words, etc.) + +**Example**: +```yaml +plugins: + email_summarizer: + enabled: true + events: + - conversation.complete + condition: + type: always + + homeassistant: + enabled: false + events: + - transcript.streaming + condition: + type: wake_word + wake_words: + - hey vivi +``` + +### 2. 
`backends/advanced/src/advanced_omi_backend/plugins/{plugin_id}/config.yml` - Plugin Settings + +**Purpose**: Plugin-specific non-secret configuration + +**Contains**: +- Feature flags +- Timeouts and limits +- Display preferences +- References to environment variables using `${VAR_NAME}` syntax + +**Example** (`plugins/email_summarizer/config.yml`): +```yaml +# Email content settings +subject_prefix: "Conversation Summary" +summary_max_sentences: 3 +include_conversation_id: true + +# SMTP config (reads from .env) +smtp_host: ${SMTP_HOST} +smtp_port: ${SMTP_PORT:-587} +smtp_username: ${SMTP_USERNAME} +smtp_password: ${SMTP_PASSWORD} +``` + +### 3. `backends/advanced/.env` - Secrets Only + +**Purpose**: All secret values (API keys, passwords, tokens) + +**Contains**: +- API keys +- Authentication tokens +- SMTP credentials +- Database passwords + +**Example**: +```bash +# Email Summarizer Plugin +SMTP_HOST=smtp.gmail.com +SMTP_PORT=587 +SMTP_USERNAME=your-email@gmail.com +SMTP_PASSWORD=your-app-password-here + +# Home Assistant Plugin +HA_URL=http://homeassistant.local:8123 +HA_TOKEN=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9... +``` + +## Configuration Loading Process + +When a plugin is initialized, Chronicle merges configuration from all three sources: + +``` +1. Load plugins/{plugin_id}/config.yml + ↓ +2. Expand ${ENV_VAR} references from .env + ↓ +3. Merge orchestration settings from config/plugins.yml + ↓ +4. Pass complete config to plugin constructor +``` + +### Example Configuration Flow + +**Email Summarizer Plugin**: + +1. **Load** `plugins/email_summarizer/config.yml`: + ```yaml + subject_prefix: "Conversation Summary" + smtp_host: ${SMTP_HOST} + smtp_password: ${SMTP_PASSWORD} + ``` + +2. **Expand env vars** from `.env`: + ```yaml + subject_prefix: "Conversation Summary" + smtp_host: "smtp.gmail.com" # ← Expanded + smtp_password: "app-password-123" # ← Expanded + ``` + +3. 
**Merge orchestration** from `config/plugins.yml`: + ```yaml + enabled: true # ← Added + events: ["conversation.complete"] # ← Added + condition: {type: "always"} # ← Added + subject_prefix: "Conversation Summary" + smtp_host: "smtp.gmail.com" + smtp_password: "app-password-123" + ``` + +4. **Pass to plugin** constructor with complete config + +## Environment Variable Expansion + +Plugin config files use `${VAR_NAME}` syntax for environment variable references: + +- **Simple reference**: `${SMTP_HOST}` β†’ expands to env value +- **With default**: `${SMTP_PORT:-587}` β†’ uses 587 if SMTP_PORT not set +- **Missing vars**: Logs warning and keeps placeholder + +**Example**: +```yaml +# In plugin config.yml +smtp_host: ${SMTP_HOST} +smtp_port: ${SMTP_PORT:-587} +timeout: ${HA_TIMEOUT:-30} + +# With .env: +# SMTP_HOST=smtp.gmail.com +# (SMTP_PORT not set) +# HA_TIMEOUT=60 + +# Results in: +# smtp_host: "smtp.gmail.com" +# smtp_port: "587" # ← Used default +# timeout: "60" # ← From .env +``` + +## Creating a New Plugin + +To add a new plugin with proper configuration: + +### 1. Create plugin directory structure + +```bash +backends/advanced/src/advanced_omi_backend/plugins/my_plugin/ +β”œβ”€β”€ __init__.py # Export plugin class +β”œβ”€β”€ plugin.py # Plugin implementation +└── config.yml # Plugin-specific config +``` + +### 2. Add plugin config file + +**`plugins/my_plugin/config.yml`**: +```yaml +# My Plugin Configuration +# Non-secret settings only + +# Feature settings +feature_enabled: true +timeout: ${MY_PLUGIN_TIMEOUT:-30} + +# API configuration (secrets from .env) +api_url: ${MY_PLUGIN_API_URL} +api_key: ${MY_PLUGIN_API_KEY} +``` + +### 3. Add secrets to `.env.template` + +**`backends/advanced/.env.template`**: +```bash +# My Plugin +MY_PLUGIN_API_URL=https://api.example.com +MY_PLUGIN_API_KEY= +MY_PLUGIN_TIMEOUT=30 +``` + +### 4. 
Add orchestration settings + +**`config/plugins.yml`**: +```yaml +plugins: + my_plugin: + enabled: false + events: + - conversation.complete + condition: + type: always +``` + +### 5. Implement plugin class + +**`plugins/my_plugin/plugin.py`**: +```python +from ..base import BasePlugin, PluginContext, PluginResult + +class MyPlugin(BasePlugin): + def __init__(self, config: Dict[str, Any]): + super().__init__(config) + # Config automatically merged from all sources + self.api_url = config.get('api_url') + self.api_key = config.get('api_key') + self.timeout = config.get('timeout', 30) + + async def initialize(self): + # Plugin initialization + pass + + async def on_conversation_complete(self, context: PluginContext): + # Event handler + pass +``` + +## Benefits of This Architecture + +βœ… **Clean separation**: Secrets (.env) vs Config (yml) vs Orchestration (plugins.yml) + +βœ… **Plugin portability**: Each plugin has self-contained config.yml + +βœ… **No secret duplication**: Secrets only in .env, referenced via ${VAR} + +βœ… **Easy discovery**: Want to configure a plugin? 
β†’ `plugins/{plugin_id}/config.yml` + +βœ… **Main config.yml stays clean**: No plugin pollution in main backend config + +βœ… **Unified interface**: All plugins loaded with same pattern via `load_plugin_config()` + +## Troubleshooting + +### Plugin not loading + +**Check logs** for: +- "Plugin 'X' not found" β†’ Directory/file structure issue +- "Environment variable 'X' not found" β†’ Missing .env entry +- "Failed to load config.yml" β†’ YAML syntax error + +**Verify**: +```bash +# Check plugin directory exists +ls backends/advanced/src/advanced_omi_backend/plugins/my_plugin/ + +# Validate config.yml syntax +python -c "import yaml; yaml.safe_load(open('plugins/my_plugin/config.yml'))" + +# Check .env has required vars +grep MY_PLUGIN .env +``` + +### Environment variables not expanding + +**Problem**: `${SMTP_HOST}` stays as literal text + +**Solution**: +- Ensure `.env` file exists in `backends/advanced/.env` +- Check variable name matches exactly (case-sensitive) +- Restart backend after .env changes +- Check logs for "Environment variable 'X' not found" warnings + +### Plugin enabled but not running + +**Check**: +1. `config/plugins.yml` has `enabled: true` +2. Plugin subscribed to correct events +3. Conditions are met (wake words, etc.) +4. Plugin initialized without errors (check logs) + +## Using Shared Setup Utilities in Plugin Setup Scripts + +Chronicle provides shared utilities (`setup_utils.py`) for creating interactive plugin setup wizards with password masking and existing value detection. 
+ +### Quick Reference + +```python +#!/usr/bin/env python3 +import sys +from pathlib import Path + +# Import shared utilities +project_root = Path(__file__).resolve().parents[6] +sys.path.insert(0, str(project_root)) + +from setup_utils import ( + prompt_with_existing_masked, # Main function for masked prompts + prompt_value, # Simple value prompts + prompt_password, # Password with validation + mask_value, # Mask a value manually + read_env_value # Read from .env +) +from dotenv import set_key + +# Path to backend .env +env_path = str(project_root / "backends" / "advanced" / ".env") + +# Prompt for password/token with masking +api_key = prompt_with_existing_masked( + prompt_text="API Key", + env_file_path=env_path, + env_key="MY_PLUGIN_API_KEY", + placeholders=['your-key-here'], + is_password=True # ← Shows masked existing value +) + +# Save to .env +set_key(env_path, "MY_PLUGIN_API_KEY", api_key) +``` + +### Function Details + +**`prompt_with_existing_masked()`** - Primary function for secrets + +Shows masked existing values and allows users to reuse them: +```python +smtp_password = prompt_with_existing_masked( + prompt_text="SMTP Password", + env_file_path="../../.env", # Path to .env file + env_key="SMTP_PASSWORD", # Environment variable name + placeholders=['your-password-here'], # Values to treat as "not set" + is_password=True, # Use masking and hidden input + default="" # Fallback if no existing value +) +# Output: SMTP Password (smtp_***********word) [press Enter to reuse, or enter new]: +``` + +**Benefits:** +- βœ… Shows previously configured values as masked (e.g., `sk-pr***********xyz`) +- βœ… Lets users press Enter to keep existing value (no re-entry needed) +- βœ… Automatically reads from .env if path/key provided +- βœ… Works with placeholders - treats them as "not configured" + +**`prompt_password()`** - Password with validation + +```python +admin_pass = prompt_password( + prompt_text="Admin Password", + min_length=8, # Minimum length requirement 
+ allow_generated=True # Auto-generate in non-interactive mode +) +``` + +**`prompt_value()`** - Simple value prompts + +```python +port = prompt_value("SMTP Port", default="587") +``` + +### Complete Plugin Setup Example + +See `backends/advanced/src/advanced_omi_backend/plugins/email_summarizer/setup.py` for a complete working example showing: +- Masked password/token prompts with existing value reuse +- Saving credentials to backend .env +- Clean user-facing instructions +- Error handling + +### Best Practices + +1. **Always show masked values for secrets** - Use `is_password=True` +2. **Auto-read from .env** - Provide `env_file_path` and `env_key` parameters +3. **Use placeholders** - Define common placeholder values to detect "not configured" +4. **Save to backend .env** - All plugin secrets go in `backends/advanced/.env` +5. **Clear instructions** - Tell users what to do next (enable in plugins.yml, restart) + +### Convenience Functions + +For common patterns, use the convenience wrappers: + +```python +from setup_utils import prompt_api_key, prompt_token + +# API keys +openai_key = prompt_api_key("OpenAI", env_file_path="../../.env") +# Prompts: "OpenAI API Key" +# Env var: OPENAI_API_KEY + +# Auth tokens +ha_token = prompt_token("Home Assistant", env_file_path="../../.env") +# Prompts: "Home Assistant Token" +# Env var: HOME_ASSISTANT_TOKEN +``` + +## See Also + +- [CLAUDE.md](../../../CLAUDE.md) - Main documentation +- [Plugin Development Guide](plugin-development.md) - Creating custom plugins +- [Environment Variables](environment-variables.md) - Complete .env reference +- [setup_utils.py](../../../setup_utils.py) - Shared setup utility reference diff --git a/backends/advanced/README.md b/backends/advanced/README.md index 0f5a4490..7f3d5a24 100644 --- a/backends/advanced/README.md +++ b/backends/advanced/README.md @@ -31,7 +31,7 @@ Modern React-based web dashboard located in `./webui/` with: **The setup wizard guides you through:** - **Authentication**: 
Admin email/password setup with secure keys -- **Transcription Provider**: Choose between Deepgram, Mistral, or Offline (Parakeet) +- **Transcription Provider**: Choose between Deepgram or Offline (Parakeet) - **LLM Provider**: Choose between OpenAI (recommended) or Ollama for memory extraction - **Memory Provider**: Choose between Friend-Lite Native or OpenMemory MCP - **HTTPS Configuration**: Optional SSL setup for microphone access (uses Caddy) diff --git a/backends/advanced/cleanup.sh b/backends/advanced/cleanup.sh new file mode 100755 index 00000000..041e6364 --- /dev/null +++ b/backends/advanced/cleanup.sh @@ -0,0 +1,15 @@ +#!/bin/bash +# Wrapper script for cleanup_state.py +# Usage: ./cleanup.sh --backup --export-audio +# +# This script runs the cleanup_state.py script inside the chronicle-backend container +# to handle data ownership and permissions correctly. +# +# Examples: +# ./cleanup.sh --dry-run # Preview what would be deleted +# ./cleanup.sh --backup # Cleanup with metadata backup +# ./cleanup.sh --backup --export-audio # Full backup including audio +# ./cleanup.sh --backup --force # Skip confirmation prompts + +cd "$(dirname "$0")" +docker compose exec chronicle-backend uv run python src/scripts/cleanup_state.py "$@" diff --git a/backends/advanced/diarization_config.json.template b/backends/advanced/diarization_config.json.template deleted file mode 100644 index d760df85..00000000 --- a/backends/advanced/diarization_config.json.template +++ /dev/null @@ -1,9 +0,0 @@ -{ - "diarization_source": "pyannote", - "similarity_threshold": 0.15, - "min_duration": 0.5, - "collar": 2.0, - "min_duration_off": 1.5, - "min_speakers": 2, - "max_speakers": 6 -} \ No newline at end of file diff --git a/backends/advanced/docker-compose-test.yml b/backends/advanced/docker-compose-test.yml index e4203f91..43aa1a83 100644 --- a/backends/advanced/docker-compose-test.yml +++ b/backends/advanced/docker-compose-test.yml @@ -2,26 +2,34 @@ # Isolated test environment for 
integration tests # Uses different ports to avoid conflicts with development environment +name: backend-test + services: chronicle-backend-test: build: context: . dockerfile: Dockerfile + target: dev # Use dev stage with test dependencies + command: ["./start.sh", "--test"] ports: - "8001:8000" # Avoid conflict with dev on 8000 volumes: - ./src:/app/src # Mount source code for easier development - ./data/test_audio_chunks:/app/audio_chunks - - ./data/test_debug_dir:/app/debug_dir + - ./data/test_debug_dir:/app/debug # Fixed: mount to /app/debug for plugin database - ./data/test_data:/app/data - - ${CONFIG_FILE:-../../config/config.yml}:/app/config.yml # Mount config.yml for model registry and memory settings (writable for admin config updates) + - ../../config:/app/config # Mount config directory with defaults.yml + - ../../tests/configs:/app/test-configs:ro # Mount test-specific configs + - ${PLUGINS_CONFIG:-../../tests/config/plugins.test.yml}:/app/config/plugins.yml # Mount test plugins config to correct location environment: # Override with test-specific settings - MONGODB_URI=mongodb://mongo-test:27017/test_db - QDRANT_BASE_URL=qdrant-test - QDRANT_PORT=6333 - REDIS_URL=redis://redis-test:6379/0 - - DEBUG_DIR=/app/debug_dir + - DEBUG_DIR=/app/debug # Fixed: match plugin database mount path + # Test configuration file + - CONFIG_FILE=${TEST_CONFIG_FILE:-/app/test-configs/deepgram-openai.yml} # Import API keys from environment - DEEPGRAM_API_KEY=${DEEPGRAM_API_KEY} - OPENAI_API_KEY=${OPENAI_API_KEY} @@ -42,10 +50,17 @@ services: # Speaker recognition controlled by config.yml (disabled in test config for CI performance) - SPEAKER_SERVICE_URL=http://speaker-service-test:8085 - CORS_ORIGINS=http://localhost:3001,http://localhost:8001,https://localhost:3001,https://localhost:8001 - # Set low inactivity timeout for tests (2 seconds instead of 60) - - SPEECH_INACTIVITY_THRESHOLD_SECONDS=2 + # Set inactivity timeout for tests (20 seconds of audio time) + # This is 
audio duration, not wall-clock time + - SPEECH_INACTIVITY_THRESHOLD_SECONDS=20 + # Set low speech detection thresholds for tests + - SPEECH_DETECTION_MIN_DURATION=2.0 # 2 seconds instead of 10 + - SPEECH_DETECTION_MIN_WORDS=5 # 5 words instead of 10 # Wait for audio queue to drain before timing out (test mode) - WAIT_FOR_AUDIO_QUEUE_DRAIN=true + # Mock speaker recognition for tests (avoids resource-intensive ML service) + # To test with REAL speaker recognition: set to 'false' and start extras/speaker-recognition service + - USE_MOCK_SPEAKER_CLIENT=true depends_on: qdrant-test: condition: service_started @@ -53,8 +68,6 @@ services: condition: service_healthy redis-test: condition: service_started - speaker-service-test: - condition: service_healthy healthcheck: test: ["CMD", "curl", "-f", "http://localhost:8000/readiness"] interval: 10s @@ -125,7 +138,7 @@ services: context: ../../extras/speaker-recognition dockerfile: Dockerfile args: - PYTORCH_CUDA_VERSION: cpu + PYTORCH_CUDA_VERSION: cu12.6 image: speaker-recognition-test:latest ports: - "8086:8085" # Avoid conflict with dev speaker service on 8085 @@ -149,25 +162,59 @@ services: retries: 5 start_period: 60s restart: unless-stopped + profiles: + - speaker # Optional service - only start when explicitly enabled + + mock-streaming-stt: + build: + context: ../.. + dockerfile: tests/Dockerfile.mock-streaming-stt + ports: + - "9999:9999" + healthcheck: + test: ["CMD", "python", "-c", "import socket; s=socket.socket(); s.connect(('localhost',9999)); s.close()"] + interval: 10s + timeout: 5s + retries: 3 + restart: unless-stopped + + mock-llm: + build: + context: ../.. + dockerfile: tests/Dockerfile.mock-llm + ports: + - "11435:11435" + healthcheck: + test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:11435/health').read()"] + interval: 10s + timeout: 5s + retries: 3 + restart: unless-stopped workers-test: build: context: . 
dockerfile: Dockerfile - command: ./start-workers.sh + target: dev # Use dev stage with test dependencies + command: ["uv", "run", "--group", "test", "python", "worker_orchestrator.py"] volumes: - ./src:/app/src + - ./worker_orchestrator.py:/app/worker_orchestrator.py - ./data/test_audio_chunks:/app/audio_chunks - - ./data/test_debug_dir:/app/debug_dir + - ./data/test_debug_dir:/app/debug # Fixed: mount to /app/debug for plugin database - ./data/test_data:/app/data - - ${CONFIG_FILE:-../../config/config.yml}:/app/config.yml # Mount config.yml for model registry and memory settings (writable for admin config updates) + - ../../config:/app/config # Mount config directory with defaults.yml + - ../../tests/configs:/app/test-configs:ro # Mount test-specific configs + - ${PLUGINS_CONFIG:-../../tests/config/plugins.test.yml}:/app/config/plugins.yml # Mount test plugins config to correct location environment: # Same environment as backend - MONGODB_URI=mongodb://mongo-test:27017/test_db - QDRANT_BASE_URL=qdrant-test - QDRANT_PORT=6333 - REDIS_URL=redis://redis-test:6379/0 - - DEBUG_DIR=/app/debug_dir + - DEBUG_DIR=/app/debug # Fixed: match plugin database mount path + # Test configuration file + - CONFIG_FILE=${TEST_CONFIG_FILE:-/app/test-configs/deepgram-openai.yml} - DEEPGRAM_API_KEY=${DEEPGRAM_API_KEY} - OPENAI_API_KEY=${OPENAI_API_KEY} - GROQ_API_KEY=${GROQ_API_KEY} @@ -183,10 +230,17 @@ services: - MYCELIA_DB=mycelia_test # Speaker recognition controlled by config.yml (disabled in test config for CI performance) - SPEAKER_SERVICE_URL=http://speaker-service-test:8085 - # Set low inactivity timeout for tests (2 seconds instead of 60) - - SPEECH_INACTIVITY_THRESHOLD_SECONDS=2 + # Set inactivity timeout for tests (20 seconds of audio time) + # This is audio duration, not wall-clock time + - SPEECH_INACTIVITY_THRESHOLD_SECONDS=20 + # Set low speech detection thresholds for tests + - SPEECH_DETECTION_MIN_DURATION=2.0 # 2 seconds instead of 10 + - 
SPEECH_DETECTION_MIN_WORDS=5 # 5 words instead of 10 # Wait for audio queue to drain before timing out (test mode) - WAIT_FOR_AUDIO_QUEUE_DRAIN=true + # Mock speaker recognition for tests (avoids resource-intensive ML service) + # To test with REAL speaker recognition: set to 'false' and start extras/speaker-recognition service + - USE_MOCK_SPEAKER_CLIENT=true depends_on: chronicle-backend-test: condition: service_healthy @@ -196,8 +250,6 @@ services: condition: service_started qdrant-test: condition: service_started - speaker-service-test: - condition: service_healthy restart: unless-stopped # Mycelia - AI memory and timeline service (test environment) diff --git a/backends/advanced/docker-compose.yml b/backends/advanced/docker-compose.yml index f46a23fa..230f40c9 100644 --- a/backends/advanced/docker-compose.yml +++ b/backends/advanced/docker-compose.yml @@ -1,8 +1,35 @@ services: + tailscale: + image: tailscale/tailscale:latest + container_name: advanced-tailscale + hostname: chronicle-tailscale + environment: + - TS_AUTHKEY=${TS_AUTHKEY} + - TS_STATE_DIR=/var/lib/tailscale + - TS_USERSPACE=false + - TS_ACCEPT_DNS=true + volumes: + - tailscale-state:/var/lib/tailscale + devices: + - /dev/net/tun:/dev/net/tun + cap_add: + - NET_ADMIN + restart: unless-stopped + profiles: + - tailscale # Optional profile + ports: + - "18123:18123" # HA proxy port + command: > + sh -c "tailscaled & + tailscale up --authkey=$${TS_AUTHKEY} --accept-dns=true && + apk add --no-cache socat 2>/dev/null || true && + socat TCP-LISTEN:18123,fork,reuseaddr TCP:100.99.62.5:8123" + chronicle-backend: build: context: . 
dockerfile: Dockerfile + target: prod # Use prod stage without test dependencies ports: - "8000:8000" env_file: @@ -12,7 +39,7 @@ services: - ./data/audio_chunks:/app/audio_chunks - ./data/debug_dir:/app/debug_dir - ./data:/app/data - - ../../config/config.yml:/app/config.yml # Removed :ro to allow UI config saving + - ../../config:/app/config # Mount entire config directory (includes config.yml, defaults.yml, plugins.yml) environment: - DEEPGRAM_API_KEY=${DEEPGRAM_API_KEY} - PARAKEET_ASR_URL=${PARAKEET_ASR_URL} @@ -26,8 +53,10 @@ services: - NEO4J_HOST=${NEO4J_HOST} - NEO4J_USER=${NEO4J_USER} - NEO4J_PASSWORD=${NEO4J_PASSWORD} + - HA_TOKEN=${HA_TOKEN} - CORS_ORIGINS=http://localhost:3010,http://localhost:8000,http://192.168.1.153:3010,http://192.168.1.153:8000,https://localhost:3010,https://localhost:8000,https://100.105.225.45,https://localhost - REDIS_URL=redis://redis:6379/0 + - MONGODB_URI=mongodb://mongo:27017 depends_on: qdrant: condition: service_started @@ -35,6 +64,8 @@ services: condition: service_healthy redis: condition: service_healthy + extra_hosts: + - "host.docker.internal:host-gateway" # Access host's Tailscale network healthcheck: test: ["CMD", "curl", "-f", "http://localhost:8000/readiness"] interval: 30s @@ -46,27 +77,37 @@ services: # Unified Worker Container # No CUDA needed for chronicle-backend and workers, workers only orchestrate jobs and call external services # Runs all workers in a single container for efficiency: - # - 3 RQ workers (transcription, memory, default queues) - # - 1 Audio stream worker (Redis Streams consumer - must be single to maintain sequential chunks) + # - 6 RQ workers (transcription, memory, default queues) + # - 1 Audio persistence worker (audio queue) + # - 1+ Stream workers (conditional based on config.yml - Deepgram/Parakeet) + # Uses Python orchestrator for process management, health monitoring, and self-healing workers: build: context: . 
dockerfile: Dockerfile - command: ["./start-workers.sh"] + target: prod # Use prod stage without test dependencies + command: ["uv", "run", "python", "worker_orchestrator.py"] env_file: - .env volumes: - ./src:/app/src - - ./start-workers.sh:/app/start-workers.sh + - ./worker_orchestrator.py:/app/worker_orchestrator.py - ./data/audio_chunks:/app/audio_chunks - ./data:/app/data - - ../../config/config.yml:/app/config.yml # Removed :ro for consistency + - ../../config:/app/config # Mount entire config directory (includes config.yml, defaults.yml, plugins.yml) environment: - DEEPGRAM_API_KEY=${DEEPGRAM_API_KEY} - PARAKEET_ASR_URL=${PARAKEET_ASR_URL} - OPENAI_API_KEY=${OPENAI_API_KEY} - GROQ_API_KEY=${GROQ_API_KEY} + - HA_TOKEN=${HA_TOKEN} - REDIS_URL=redis://redis:6379/0 + - MONGODB_URI=mongodb://mongo:27017 + # Worker orchestrator configuration (optional - defaults shown) + - WORKER_CHECK_INTERVAL=${WORKER_CHECK_INTERVAL:-10} + - MIN_RQ_WORKERS=${MIN_RQ_WORKERS:-6} + - WORKER_STARTUP_GRACE_PERIOD=${WORKER_STARTUP_GRACE_PERIOD:-30} + - WORKER_SHUTDOWN_TIMEOUT=${WORKER_SHUTDOWN_TIMEOUT:-30} depends_on: redis: condition: service_healthy @@ -76,6 +117,33 @@ services: condition: service_started restart: unless-stopped + # Annotation Cron Scheduler + # Runs periodic jobs for AI-powered annotation suggestions: + # - Daily: Surface potential errors in transcripts/memories + # - Weekly: Fine-tune error detection models using user feedback + # Set DEV_MODE=true in .env for 1-minute intervals (testing) + annotation-cron: + build: + context: . 
+ dockerfile: Dockerfile + target: prod + command: ["uv", "run", "python", "-m", "advanced_omi_backend.cron"] + container_name: chronicle-annotation-cron + env_file: + - .env + environment: + - MONGODB_URI=mongodb://mongo:27017 + - DEV_MODE=${DEV_MODE:-false} + - OPENAI_API_KEY=${OPENAI_API_KEY} + - LLM_PROVIDER=${LLM_PROVIDER:-openai} + - OLLAMA_BASE_URL=${OLLAMA_BASE_URL} + depends_on: + mongo: + condition: service_healthy + restart: unless-stopped + profiles: + - annotation # Optional profile - enable with: docker compose --profile annotation up + webui: build: context: ./webui @@ -226,3 +294,5 @@ volumes: driver: local neo4j_logs: driver: local + tailscale-state: + driver: local diff --git a/backends/advanced/docs/plugin-development-guide.md b/backends/advanced/docs/plugin-development-guide.md new file mode 100644 index 00000000..17c53b4a --- /dev/null +++ b/backends/advanced/docs/plugin-development-guide.md @@ -0,0 +1,776 @@ +# Chronicle Plugin Development Guide + +A comprehensive guide to creating custom plugins for Chronicle. + +## Table of Contents + +1. [Introduction](#introduction) +2. [Quick Start](#quick-start) +3. [Plugin Architecture](#plugin-architecture) +4. [Event Types](#event-types) +5. [Creating Your First Plugin](#creating-your-first-plugin) +6. [Configuration](#configuration) +7. [Testing Plugins](#testing-plugins) +8. [Best Practices](#best-practices) +9. [Examples](#examples) +10. [Troubleshooting](#troubleshooting) + +## Introduction + +Chronicle's plugin system allows you to extend functionality by subscribing to events and executing custom logic. 
Plugins are: + +- **Event-driven**: React to transcripts, conversations, or memory processing +- **Auto-discovered**: Drop plugins into the `plugins/` directory +- **Configurable**: YAML-based configuration with environment variable support +- **Isolated**: Each plugin runs independently with proper error handling + +### Plugin Types + +- **Core Plugins**: Built-in plugins (`homeassistant`, `test_event`) +- **Community Plugins**: Auto-discovered plugins in `plugins/` directory + +## Quick Start + +### 1. Generate Plugin Boilerplate + +```bash +cd backends/advanced +uv run python scripts/create_plugin.py my_awesome_plugin +``` + +This creates: +``` +plugins/my_awesome_plugin/ +β”œβ”€β”€ __init__.py # Plugin exports +β”œβ”€β”€ plugin.py # Main plugin logic +└── README.md # Plugin documentation +``` + +### 2. Implement Plugin Logic + +Edit `plugins/my_awesome_plugin/plugin.py`: + +```python +async def on_conversation_complete(self, context: PluginContext) -> Optional[PluginResult]: + """Handle conversation completion.""" + transcript = context.data.get('transcript', '') + + # Your custom logic here + print(f"Processing: {transcript}") + + return PluginResult(success=True, message="Processing complete") +``` + +### 3. Configure Plugin + +Add to `config/plugins.yml`: + +```yaml +plugins: + my_awesome_plugin: + enabled: true + events: + - conversation.complete + condition: + type: always +``` + +### 4. Restart Backend + +```bash +cd backends/advanced +docker compose restart +``` + +Your plugin will be auto-discovered and loaded! 
+ +## Plugin Architecture + +### Base Plugin Class + +All plugins inherit from `BasePlugin`: + +```python +from advanced_omi_backend.plugins.base import BasePlugin, PluginContext, PluginResult + +class MyPlugin(BasePlugin): + SUPPORTED_ACCESS_LEVELS = ['conversation'] # Which events you support + + async def initialize(self): + """Initialize resources (called on app startup)""" + pass + + async def cleanup(self): + """Clean up resources (called on app shutdown)""" + pass + + async def on_conversation_complete(self, context: PluginContext): + """Handle conversation.complete events""" + pass +``` + +### Plugin Context + +Context passed to plugin methods: + +```python +@dataclass +class PluginContext: + user_id: str # User identifier + event: str # Event name (e.g., "conversation.complete") + data: Dict[str, Any] # Event-specific data + metadata: Dict[str, Any] # Additional metadata +``` + +### Plugin Result + +Return value from plugin methods: + +```python +@dataclass +class PluginResult: + success: bool # Whether operation succeeded + data: Optional[Dict[str, Any]] # Optional result data + message: Optional[str] # Optional status message + should_continue: bool # Whether to continue normal processing (default: True) +``` + +## Event Types + +### 1. Transcript Events (`transcript.streaming`) + +**When**: Real-time transcript segments arrive from WebSocket +**Context Data**: +- `transcript` (str): The transcript text +- `segment_id` (str): Unique segment identifier +- `conversation_id` (str): Current conversation ID + +**Use Cases**: +- Wake word detection +- Real-time command processing +- Live transcript analysis + +**Example**: +```python +async def on_transcript(self, context: PluginContext): + transcript = context.data.get('transcript', '') + if 'urgent' in transcript.lower(): + await self.send_notification(transcript) +``` + +### 2. 
Conversation Events (`conversation.complete`) + +**When**: Conversation processing finishes +**Context Data**: +- `conversation` (dict): Full conversation data +- `transcript` (str): Complete transcript +- `duration` (float): Conversation duration in seconds +- `conversation_id` (str): Conversation identifier + +**Use Cases**: +- Email summaries +- Analytics tracking +- External integrations +- Conversation archiving + +**Example**: +```python +async def on_conversation_complete(self, context: PluginContext): + conversation = context.data.get('conversation', {}) + duration = context.data.get('duration', 0) + + if duration > 300: # 5 minutes + await self.archive_long_conversation(conversation) +``` + +### 3. Memory Events (`memory.processed`) + +**When**: Memory extraction finishes +**Context Data**: +- `memories` (list): Extracted memories +- `conversation` (dict): Source conversation +- `memory_count` (int): Number of memories created +- `conversation_id` (str): Conversation identifier + +**Use Cases**: +- Memory indexing +- Knowledge graph updates +- Memory notifications +- Analytics + +**Example**: +```python +async def on_memory_processed(self, context: PluginContext): + memories = context.data.get('memories', []) + + for memory in memories: + await self.index_memory(memory) +``` + +## Creating Your First Plugin + +### Step 1: Generate Boilerplate + +```bash +uv run python scripts/create_plugin.py todo_extractor +``` + +### Step 2: Define Plugin Logic + +```python +""" +Todo Extractor Plugin - Extracts action items from conversations. 
+""" +import logging +import re +from typing import Any, Dict, List, Optional + +from ..base import BasePlugin, PluginContext, PluginResult + +logger = logging.getLogger(__name__) + + +class TodoExtractorPlugin(BasePlugin): + """Extract and save action items from conversations.""" + + SUPPORTED_ACCESS_LEVELS = ['conversation'] + + def __init__(self, config: Dict[str, Any]): + super().__init__(config) + self.todo_patterns = [ + r'I need to (.+)', + r'I should (.+)', + r'TODO: (.+)', + r'reminder to (.+)', + ] + + async def initialize(self): + if not self.enabled: + return + + logger.info("TodoExtractor plugin initialized") + + async def on_conversation_complete(self, context: PluginContext): + try: + transcript = context.data.get('transcript', '') + todos = self._extract_todos(transcript) + + if todos: + await self._save_todos(context.user_id, todos) + + return PluginResult( + success=True, + message=f"Extracted {len(todos)} action items", + data={'todos': todos} + ) + + return PluginResult(success=True, message="No action items found") + + except Exception as e: + logger.error(f"Error extracting todos: {e}") + return PluginResult(success=False, message=str(e)) + + def _extract_todos(self, transcript: str) -> List[str]: + """Extract todo items from transcript.""" + todos = [] + + for pattern in self.todo_patterns: + matches = re.findall(pattern, transcript, re.IGNORECASE) + todos.extend(matches) + + return list(set(todos)) # Remove duplicates + + async def _save_todos(self, user_id: str, todos: List[str]): + """Save todos to database or external service.""" + from advanced_omi_backend.database import get_database + + db = get_database() + for todo in todos: + await db['todos'].insert_one({ + 'user_id': user_id, + 'task': todo, + 'completed': False, + 'created_at': datetime.utcnow() + }) +``` + +### Step 3: Configure Plugin + +`config/plugins.yml`: + +```yaml +plugins: + todo_extractor: + enabled: true + events: + - conversation.complete + condition: + type: always 
+``` + +### Step 4: Test Plugin + +1. Restart backend: `docker compose restart` +2. Create a conversation with phrases like "I need to buy milk" +3. Check logs: `docker compose logs -f chronicle-backend | grep TodoExtractor` +4. Verify todos in database + +## Configuration + +### YAML Configuration + +`config/plugins.yml`: + +```yaml +plugins: + my_plugin: + # Basic Configuration + enabled: true # Enable/disable plugin + + # Event Subscriptions + events: + - conversation.complete + - memory.processed + + # Execution Conditions + condition: + type: always # always, wake_word, regex + # wake_words: ["hey assistant"] # For wake_word type + # pattern: "urgent" # For regex type + + # Custom Configuration + api_url: ${MY_API_URL} # Environment variable + timeout: 30 + max_retries: 3 +``` + +### Environment Variables + +Use `${VAR_NAME}` syntax: + +```yaml +api_key: ${MY_API_KEY} +base_url: ${BASE_URL:-http://localhost:8000} # With default +``` + +Add to `.env`: + +```bash +MY_API_KEY=your-key-here +BASE_URL=https://api.example.com +``` + +### Condition Types + +**Always Execute**: +```yaml +condition: + type: always +``` + +**Wake Word** (transcript events only): +```yaml +condition: + type: wake_word + wake_words: + - hey assistant + - computer +``` + +**Regex Pattern**: +```yaml +condition: + type: regex + pattern: "urgent|important" +``` + +## Testing Plugins + +### Unit Tests + +`tests/test_my_plugin.py`: + +```python +import pytest +from plugins.my_plugin import MyPlugin +from plugins.base import PluginContext + +class TestMyPlugin: + def test_plugin_initialization(self): + config = {'enabled': True, 'events': ['conversation.complete']} + plugin = MyPlugin(config) + assert plugin.enabled is True + + @pytest.mark.asyncio + async def test_conversation_processing(self): + plugin = MyPlugin({'enabled': True}) + await plugin.initialize() + + context = PluginContext( + user_id='test-user', + event='conversation.complete', + data={'transcript': 'Test transcript'} + ) + + 
result = await plugin.on_conversation_complete(context) + assert result.success is True +``` + +### Integration Testing + +1. **Enable Test Plugin**: +```yaml +test_event: + enabled: true + events: + - conversation.complete +``` + +2. **Check Logs**: +```bash +docker compose logs -f | grep "test_event" +``` + +3. **Upload Test Audio**: +```bash +curl -X POST http://localhost:8000/api/process-audio-files \ + -H "Authorization: Bearer $TOKEN" \ + -F "files=@test.wav" +``` + +### Manual Testing Checklist + +- [ ] Plugin loads without errors +- [ ] Configuration validates correctly +- [ ] Events trigger plugin execution +- [ ] Plugin logic executes successfully +- [ ] Errors are handled gracefully +- [ ] Logs provide useful information + +## Best Practices + +### 1. Error Handling + +Always wrap logic in try-except: + +```python +async def on_conversation_complete(self, context): + try: + # Your logic + result = await self.process(context) + return PluginResult(success=True, data=result) + except Exception as e: + logger.error(f"Error: {e}", exc_info=True) + return PluginResult(success=False, message=str(e)) +``` + +### 2. Logging + +Use appropriate log levels: + +```python +logger.debug("Detailed debug information") +logger.info("Important milestones") +logger.warning("Non-critical issues") +logger.error("Errors that need attention") +``` + +### 3. Resource Management + +Clean up in `cleanup()`: + +```python +async def initialize(self): + self.client = ExternalClient() + await self.client.connect() + +async def cleanup(self): + if self.client: + await self.client.disconnect() +``` + +### 4. Configuration Validation + +Validate in `initialize()`: + +```python +async def initialize(self): + if not self.config.get('api_key'): + raise ValueError("API key is required") + + if self.config.get('timeout', 0) <= 0: + raise ValueError("Timeout must be positive") +``` + +### 5. 
Async Best Practices + +Use `asyncio.to_thread()` for blocking operations: + +```python +import asyncio + +async def my_method(self): + # Run blocking operation in thread pool + result = await asyncio.to_thread(blocking_function, arg1, arg2) + return result +``` + +### 6. Database Access + +Use the global database handle: + +```python +from advanced_omi_backend.database import get_database + +async def save_data(self, data): + db = get_database() + await db['my_collection'].insert_one(data) +``` + +### 7. LLM Access + +Use the global LLM client: + +```python +from advanced_omi_backend.llm_client import async_generate + +async def generate_summary(self, text): + prompt = f"Summarize: {text}" + summary = await async_generate(prompt) + return summary +``` + +## Examples + +### Example 1: Slack Notifier + +```python +class SlackNotifierPlugin(BasePlugin): + SUPPORTED_ACCESS_LEVELS = ['conversation'] + + async def initialize(self): + self.webhook_url = self.config.get('slack_webhook_url') + if not self.webhook_url: + raise ValueError("Slack webhook URL required") + + async def on_conversation_complete(self, context): + transcript = context.data.get('transcript', '') + duration = context.data.get('duration', 0) + + message = { + "text": f"New conversation ({duration:.1f}s)", + "blocks": [{ + "type": "section", + "text": {"type": "mrkdwn", "text": f"```{transcript[:500]}```"} + }] + } + + async with aiohttp.ClientSession() as session: + await session.post(self.webhook_url, json=message) + + return PluginResult(success=True, message="Notification sent") +``` + +### Example 2: Keyword Alerter + +```python +class KeywordAlerterPlugin(BasePlugin): + SUPPORTED_ACCESS_LEVELS = ['transcript'] + + async def on_transcript(self, context): + transcript = context.data.get('transcript', '') + keywords = self.config.get('keywords', []) + + for keyword in keywords: + if keyword.lower() in transcript.lower(): + await self.send_alert(keyword, transcript) + return PluginResult( + 
success=True, + message=f"Alert sent for keyword: {keyword}" + ) + + return PluginResult(success=True) +``` + +### Example 3: Analytics Tracker + +```python +class AnalyticsTrackerPlugin(BasePlugin): + SUPPORTED_ACCESS_LEVELS = ['conversation', 'memory'] + + async def on_conversation_complete(self, context): + duration = context.data.get('duration', 0) + word_count = len(context.data.get('transcript', '').split()) + + await self.track_event('conversation_complete', { + 'user_id': context.user_id, + 'duration': duration, + 'word_count': word_count, + }) + + return PluginResult(success=True) + + async def on_memory_processed(self, context): + memory_count = context.data.get('memory_count', 0) + + await self.track_event('memory_processed', { + 'user_id': context.user_id, + 'memory_count': memory_count, + }) + + return PluginResult(success=True) +``` + +## Troubleshooting + +### Plugin Not Loading + +**Check logs**: +```bash +docker compose logs chronicle-backend | grep "plugin" +``` + +**Common issues**: +- Plugin directory name doesn't match class name convention +- Missing `__init__.py` or incorrect exports +- Syntax errors in plugin.py +- Not inheriting from `BasePlugin` + +**Solution**: +1. Verify directory structure matches: `plugins/my_plugin/` +2. Class name should be: `MyPluginPlugin` +3. Export in `__init__.py`: `from .plugin import MyPluginPlugin` + +### Plugin Enabled But Not Executing + +**Check**: +- Plugin enabled in `plugins.yml` +- Correct events subscribed +- Condition matches (wake_word, regex, etc.) + +**Debug**: +```python +async def on_conversation_complete(self, context): + logger.info(f"Plugin executed! 
Context: {context}") + # Your logic +``` + +### Configuration Errors + +**Error**: `Environment variable not found` + +**Solution**: +- Add variable to `.env` file +- Use default values: `${VAR:-default}` +- Check variable name spelling + +### Import Errors + +**Error**: `ModuleNotFoundError` + +**Solution**: +- Restart backend after adding dependencies +- Verify imports are from correct modules +- Check relative imports use `..base` for base classes + +### Database Connection Issues + +**Error**: `Database connection failed` + +**Solution**: +```python +from advanced_omi_backend.database import get_database + +async def my_method(self): + db = get_database() # Global database handle + # Use db... +``` + +## Advanced Topics + +### Custom Conditions + +Implement custom condition checking: + +```python +async def on_conversation_complete(self, context): + # Custom condition check + if not self._should_execute(context): + return PluginResult(success=True, message="Skipped") + + # Your logic + ... + +def _should_execute(self, context): + # Custom logic + duration = context.data.get('duration', 0) + return duration > 60 # Only process long conversations +``` + +### Plugin Dependencies + +Share data between plugins using context metadata: + +```python +# Plugin A +async def on_conversation_complete(self, context): + context.metadata['extracted_keywords'] = ['important', 'urgent'] + return PluginResult(success=True) + +# Plugin B (executes after Plugin A) +async def on_conversation_complete(self, context): + keywords = context.metadata.get('extracted_keywords', []) + # Use keywords... 
+``` + +### External Service Integration + +```python +import aiohttp + +class ExternalServicePlugin(BasePlugin): + async def initialize(self): + self.session = aiohttp.ClientSession() + self.api_url = self.config.get('api_url') + self.api_key = self.config.get('api_key') + + async def cleanup(self): + await self.session.close() + + async def on_conversation_complete(self, context): + async with self.session.post( + self.api_url, + headers={'Authorization': f'Bearer {self.api_key}'}, + json={'transcript': context.data.get('transcript')} + ) as response: + result = await response.json() + return PluginResult(success=True, data=result) +``` + +## Resources + +- **Base Plugin Class**: `backends/advanced/src/advanced_omi_backend/plugins/base.py` +- **Example Plugins**: + - Email Summarizer: `plugins/email_summarizer/` + - Home Assistant: `plugins/homeassistant/` + - Test Event: `plugins/test_event/` +- **Plugin Generator**: `scripts/create_plugin.py` +- **Configuration**: `config/plugins.yml.template` + +## Contributing Plugins + +Want to share your plugin with the community? + +1. Create a well-documented plugin +2. Add comprehensive README +3. Include configuration examples +4. Test thoroughly +5. Submit PR to Chronicle repository + +## Support + +- **GitHub Issues**: [chronicle-ai/chronicle/issues](https://github.com/chronicle-ai/chronicle/issues) +- **Discussions**: [chronicle-ai/chronicle/discussions](https://github.com/chronicle-ai/chronicle/discussions) +- **Documentation**: [Chronicle Docs](https://github.com/chronicle-ai/chronicle) + +Happy plugin development! 
πŸš€ diff --git a/backends/advanced/init.py b/backends/advanced/init.py index dddbfdcb..7aa4f6aa 100644 --- a/backends/advanced/init.py +++ b/backends/advanced/init.py @@ -5,7 +5,6 @@ """ import argparse -import getpass import os import platform import secrets @@ -22,9 +21,15 @@ from rich.prompt import Confirm, Prompt from rich.text import Text -# Add repo root to path for config_manager import +# Add repo root to path for imports sys.path.insert(0, str(Path(__file__).resolve().parent.parent.parent)) from config_manager import ConfigManager +from setup_utils import ( + prompt_password as util_prompt_password, + prompt_with_existing_masked, + mask_value, + read_env_value +) class ChronicleSetup: @@ -49,6 +54,9 @@ def __init__(self, args=None): self.console.print("[red][ERROR][/red] Run wizard.py from project root to create config.yml") sys.exit(1) + # Ensure plugins.yml exists (copy from template if missing) + self._ensure_plugins_yml_exists() + def print_header(self, title: str): """Print a colorful header""" self.console.print() @@ -76,19 +84,8 @@ def prompt_value(self, prompt: str, default: str = "") -> str: return default def prompt_password(self, prompt: str) -> str: - """Prompt for password (hidden input)""" - while True: - try: - password = getpass.getpass(f"{prompt}: ") - if len(password) >= 8: - return password - self.console.print("[yellow][WARNING][/yellow] Password must be at least 8 characters") - except (EOFError, KeyboardInterrupt): - # For non-interactive environments, generate a secure password - self.console.print("[yellow][WARNING][/yellow] Non-interactive environment detected") - password = f"admin-{secrets.token_hex(8)}" - self.console.print(f"Generated secure password: {password}") - return password + """Prompt for password (delegates to shared utility)""" + return util_prompt_password(prompt, min_length=8, allow_generated=True) def prompt_choice(self, prompt: str, choices: Dict[str, str], default: str = "1") -> str: """Prompt for a choice from 
options""" @@ -107,6 +104,26 @@ def prompt_choice(self, prompt: str, choices: Dict[str, str], default: str = "1" self.console.print(f"Using default choice: {default}") return default + def _ensure_plugins_yml_exists(self): + """Ensure plugins.yml exists by copying from template if missing.""" + plugins_yml = Path("../../config/plugins.yml") + plugins_template = Path("../../config/plugins.yml.template") + + if not plugins_yml.exists(): + if plugins_template.exists(): + self.console.print("[blue][INFO][/blue] plugins.yml not found, creating from template...") + shutil.copy2(plugins_template, plugins_yml) + self.console.print(f"[green]βœ…[/green] Created {plugins_yml} from template") + self.console.print("[yellow][NOTE][/yellow] Edit config/plugins.yml to configure plugins") + self.console.print("[yellow][NOTE][/yellow] Set HA_TOKEN in .env for Home Assistant integration") + else: + raise RuntimeError( + f"Template file not found: {plugins_template}\n" + f"The repository structure is incomplete. Please ensure config/plugins.yml.template exists." 
+ ) + else: + self.console.print(f"[blue][INFO][/blue] Found existing {plugins_yml}") + def backup_existing_env(self): """Backup existing .env file""" env_path = Path(".env") @@ -117,24 +134,38 @@ def backup_existing_env(self): self.console.print(f"[blue][INFO][/blue] Backed up existing .env file to {backup_path}") def read_existing_env_value(self, key: str) -> str: - """Read a value from existing .env file""" - env_path = Path(".env") - if not env_path.exists(): - return None - - value = get_key(str(env_path), key) - # get_key returns None if key doesn't exist or value is empty - return value if value else None + """Read a value from existing .env file (delegates to shared utility)""" + return read_env_value(".env", key) def mask_api_key(self, key: str, show_chars: int = 5) -> str: - """Mask API key showing only first and last few characters""" - if not key or len(key) <= show_chars * 2: - return key - - # Remove quotes if present - key_clean = key.strip("'\"") - - return f"{key_clean[:show_chars]}{'*' * min(15, len(key_clean) - show_chars * 2)}{key_clean[-show_chars:]}" + """Mask API key (delegates to shared utility)""" + return mask_value(key, show_chars) + + def prompt_with_existing_masked(self, prompt_text: str, env_key: str, placeholders: list, + is_password: bool = False, default: str = "") -> str: + """ + Prompt for a value, showing masked existing value from .env if present. + Delegates to shared utility from setup_utils. 
+ + Args: + prompt_text: The prompt to display + env_key: The .env key to check for existing value + placeholders: List of placeholder values to treat as "not set" + is_password: Whether to mask the value (for passwords/tokens) + default: Default value if no existing value + + Returns: + User input value, existing value if reused, or default + """ + # Use shared utility with auto-read from .env + return prompt_with_existing_masked( + prompt_text=prompt_text, + env_file_path=".env", + env_key=env_key, + placeholders=placeholders, + is_password=is_password, + default=default + ) def setup_authentication(self): @@ -192,15 +223,14 @@ def setup_transcription(self): self.console.print("[blue][INFO][/blue] Deepgram selected") self.console.print("Get your API key from: https://console.deepgram.com/") - # Check for existing API key - existing_key = self.read_existing_env_value("DEEPGRAM_API_KEY") - if existing_key and existing_key not in ['your_deepgram_api_key_here', 'your-deepgram-key-here']: - masked_key = self.mask_api_key(existing_key) - prompt_text = f"Deepgram API key ({masked_key}) [press Enter to reuse, or enter new]" - api_key_input = self.prompt_value(prompt_text, "") - api_key = api_key_input if api_key_input else existing_key - else: - api_key = self.prompt_value("Deepgram API key (leave empty to skip)", "") + # Use the new masked prompt function + api_key = self.prompt_with_existing_masked( + prompt_text="Deepgram API key (leave empty to skip)", + env_key="DEEPGRAM_API_KEY", + placeholders=['your_deepgram_api_key_here', 'your-deepgram-key-here'], + is_password=True, + default="" + ) if api_key: # Write API key to .env @@ -250,15 +280,14 @@ def setup_llm(self): self.console.print("[blue][INFO][/blue] OpenAI selected") self.console.print("Get your API key from: https://platform.openai.com/api-keys") - # Check for existing API key - existing_key = self.read_existing_env_value("OPENAI_API_KEY") - if existing_key and existing_key not in ['your_openai_api_key_here', 
'your-openai-key-here']: - masked_key = self.mask_api_key(existing_key) - prompt_text = f"OpenAI API key ({masked_key}) [press Enter to reuse, or enter new]" - api_key_input = self.prompt_value(prompt_text, "") - api_key = api_key_input if api_key_input else existing_key - else: - api_key = self.prompt_value("OpenAI API key (leave empty to skip)", "") + # Use the new masked prompt function + api_key = self.prompt_with_existing_masked( + prompt_text="OpenAI API key (leave empty to skip)", + env_key="OPENAI_API_KEY", + placeholders=['your_openai_api_key_here', 'your-openai-key-here'], + is_password=True, + default="" + ) if api_key: self.config["OPENAI_API_KEY"] = api_key @@ -370,6 +399,11 @@ def setup_optional_services(self): self.config["PARAKEET_ASR_URL"] = self.args.parakeet_asr_url self.console.print(f"[green][SUCCESS][/green] Parakeet ASR configured via args: {self.args.parakeet_asr_url}") + # Check if Tailscale auth key provided via args + if hasattr(self.args, 'ts_authkey') and self.args.ts_authkey: + self.config["TS_AUTHKEY"] = self.args.ts_authkey + self.console.print(f"[green][SUCCESS][/green] Tailscale auth key configured (Docker integration enabled)") + def setup_obsidian(self): """Configure Obsidian/Neo4j integration""" # Check if enabled via command line @@ -413,6 +447,16 @@ def setup_obsidian(self): self.console.print("[green][SUCCESS][/green] Obsidian/Neo4j configured") self.console.print("[blue][INFO][/blue] Neo4j will start automatically with --profile obsidian") + else: + # Explicitly disable Obsidian in config.yml when not enabled + self.config_manager.update_memory_config({ + "obsidian": { + "enabled": False, + "neo4j_host": "neo4j-mem0", + "timeout": 30 + } + }) + self.console.print("[blue][INFO][/blue] Obsidian/Neo4j integration disabled") def setup_network(self): """Configure network settings""" @@ -443,14 +487,14 @@ def setup_https(self): self.console.print("[blue][INFO][/blue] For distributed deployments, use your Tailscale IP (e.g., 
100.64.1.2)") self.console.print("[blue][INFO][/blue] For local-only access, use 'localhost'") - # Check for existing SERVER_IP - existing_ip = self.read_existing_env_value("SERVER_IP") - if existing_ip and existing_ip not in ['localhost', 'your-server-ip-here']: - prompt_text = f"Server IP/Domain for SSL certificate ({existing_ip}) [press Enter to reuse, or enter new]" - server_ip_input = self.prompt_value(prompt_text, "") - server_ip = server_ip_input if server_ip_input else existing_ip - else: - server_ip = self.prompt_value("Server IP/Domain for SSL certificate (Tailscale IP or localhost)", "localhost") + # Use the new masked prompt function (not masked for IP, but shows existing) + server_ip = self.prompt_with_existing_masked( + prompt_text="Server IP/Domain for SSL certificate (Tailscale IP or localhost)", + env_key="SERVER_IP", + placeholders=['localhost', 'your-server-ip-here'], + is_password=False, + default="localhost" + ) if enable_https: @@ -707,6 +751,8 @@ def main(): help="Enable Obsidian/Neo4j integration (default: prompt user)") parser.add_argument("--neo4j-password", help="Neo4j password (default: prompt user)") + parser.add_argument("--ts-authkey", + help="Tailscale auth key for Docker integration (default: prompt user)") args = parser.parse_args() diff --git a/backends/advanced/pyproject.toml b/backends/advanced/pyproject.toml index e7bcb50a..c5d17b00 100644 --- a/backends/advanced/pyproject.toml +++ b/backends/advanced/pyproject.toml @@ -21,6 +21,7 @@ dependencies = [ "httpx>=0.28.0,<1.0.0", "fastapi-users[beanie]>=14.0.1", "PyYAML>=6.0.1", + "omegaconf>=2.3.0", "langfuse>=3.3.0", "spacy>=3.8.2", "en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl", @@ -114,4 +115,5 @@ test = [ "requests-mock>=1.12.1", "pytest-json-report>=1.5.0", "pytest-html>=4.0.0", + "aiosqlite>=0.20.0", # For test plugin event storage ] diff --git a/backends/advanced/run-test.sh 
b/backends/advanced/run-test.sh index 01204be6..61fd7d55 100755 --- a/backends/advanced/run-test.sh +++ b/backends/advanced/run-test.sh @@ -91,6 +91,29 @@ if [ -n "$_CONFIG_FILE_OVERRIDE" ]; then print_info "Using command-line override: CONFIG_FILE=$CONFIG_FILE" fi +# Load HF_TOKEN from speaker-recognition/.env (proper location for this credential) +SPEAKER_ENV="../../extras/speaker-recognition/.env" +if [ -f "$SPEAKER_ENV" ] && [ -z "$HF_TOKEN" ]; then + print_info "Loading HF_TOKEN from speaker-recognition service..." + set -a + source "$SPEAKER_ENV" + set +a +fi + +# Display HF_TOKEN status with masking +if [ -n "$HF_TOKEN" ]; then + if [ ${#HF_TOKEN} -gt 15 ]; then + MASKED_TOKEN="${HF_TOKEN:0:5}***************${HF_TOKEN: -5}" + else + MASKED_TOKEN="***************" + fi + print_info "HF_TOKEN configured: $MASKED_TOKEN" + export HF_TOKEN +else + print_warning "HF_TOKEN not found - speaker recognition tests may fail" + print_info "Configure via wizard: uv run --with-requirements ../../setup-requirements.txt python ../../wizard.py" +fi + # Set default CONFIG_FILE if not provided # This allows testing with different provider combinations # Usage: CONFIG_FILE=../../tests/configs/parakeet-ollama.yml ./run-test.sh @@ -166,6 +189,18 @@ if [ ! -f "diarization_config.json" ] && [ -f "diarization_config.json.template" print_success "diarization_config.json created" fi +# Ensure plugins.yml exists (required for Docker volume mount) +if [ ! -f "../../config/plugins.yml" ]; then + if [ -f "../../config/plugins.yml.template" ]; then + print_info "Creating config/plugins.yml from template..." 
+ cp ../../config/plugins.yml.template ../../config/plugins.yml + print_success "config/plugins.yml created" + else + print_error "config/plugins.yml.template not found - repository structure incomplete" + exit 1 + fi +fi + # Note: Robot Framework dependencies are managed via tests/test-requirements.txt # The integration tests use Docker containers for service dependencies @@ -176,10 +211,16 @@ print_info "Using environment variables from .env file for test configuration" # Clean test environment print_info "Cleaning test environment..." -sudo rm -rf ./test_audio_chunks/ ./test_data/ ./test_debug_dir/ ./mongo_data_test/ ./qdrant_data_test/ ./test_neo4j/ || true +rm -rf ./test_audio_chunks/ ./test_data/ ./test_debug_dir/ ./mongo_data_test/ ./qdrant_data_test/ ./test_neo4j/ 2>/dev/null || true + +# If cleanup fails due to permissions, try with docker +if [ -d "./data/test_audio_chunks/" ] || [ -d "./data/test_data/" ] || [ -d "./data/test_debug_dir/" ]; then + print_warning "Permission denied, using docker to clean test directories..." + docker run --rm -v "$(pwd)/data:/data" alpine sh -c 'rm -rf /data/test_*' 2>/dev/null || true +fi -# Use unique project name to avoid conflicts with development environment -export COMPOSE_PROJECT_NAME="advanced-backend-test" +# Note: Project name 'backend-test' is set in docker-compose-test.yml +# No need to export COMPOSE_PROJECT_NAME - it's handled by the compose file # Stop any existing test containers print_info "Stopping existing test containers..." @@ -211,8 +252,9 @@ export TEST_MODE=dev # Run the Robot Framework integration tests with extended timeout (mem0 needs time for comprehensive extraction) # IMPORTANT: Robot tests must be run from the repository root where backends/ and tests/ are siblings +# Run full test suite from tests/integration/ directory (includes all test files) print_info "Starting Robot Framework integration tests (timeout: 15 minutes)..." -if (cd ../.. 
&& timeout 900 robot --outputdir test-results --loglevel INFO tests/integration/integration_test.robot); then +if (cd ../.. && timeout 900 uv run --with-requirements tests/test-requirements.txt robot --outputdir test-results --loglevel INFO tests/integration/); then print_success "Integration tests completed successfully!" else TEST_EXIT_CODE=$? diff --git a/backends/advanced/scripts/create_plugin.py b/backends/advanced/scripts/create_plugin.py new file mode 100755 index 00000000..a38a3570 --- /dev/null +++ b/backends/advanced/scripts/create_plugin.py @@ -0,0 +1,437 @@ +#!/usr/bin/env python3 +""" +Plugin Generator Script for Chronicle. + +Creates boilerplate plugin structure with templates and examples. + +Usage: + uv run python scripts/create_plugin.py my_awesome_plugin +""" +import argparse +import os +import shutil +import sys +from pathlib import Path + + +def snake_to_pascal(snake_str: str) -> str: + """Convert snake_case to PascalCase.""" + return ''.join(word.capitalize() for word in snake_str.split('_')) + + +def create_plugin(plugin_name: str, force: bool = False): + """ + Create a new plugin with boilerplate structure. 
+ + Args: + plugin_name: Plugin name in snake_case (e.g., my_awesome_plugin) + force: Overwrite existing plugin if True + """ + # Validate plugin name + if not plugin_name.replace('_', '').isalnum(): + print(f"❌ Error: Plugin name must be alphanumeric with underscores") + print(f" Got: {plugin_name}") + print(f" Example: my_awesome_plugin") + sys.exit(1) + + # Convert to class name + class_name = snake_to_pascal(plugin_name) + 'Plugin' + + # Get plugins directory + script_dir = Path(__file__).parent + backend_dir = script_dir.parent + plugins_dir = backend_dir / 'src' / 'advanced_omi_backend' / 'plugins' + plugin_dir = plugins_dir / plugin_name + + # Check if plugin already exists + if plugin_dir.exists(): + if not force: + print(f"❌ Error: Plugin '{plugin_name}' already exists at {plugin_dir}") + print(f" Use --force to overwrite") + sys.exit(1) + else: + # Remove existing directory when using --force + print(f"πŸ—‘οΈ Removing existing plugin directory: {plugin_dir}") + shutil.rmtree(plugin_dir) + + # Create plugin directory + print(f"πŸ“ Creating plugin directory: {plugin_dir}") + plugin_dir.mkdir(parents=True, exist_ok=True) + + # Create __init__.py + init_content = f'''""" +{class_name} for Chronicle. + +[Brief description of what your plugin does] +""" + +from .plugin import {class_name} + +__all__ = ['{class_name}'] +''' + + init_file = plugin_dir / '__init__.py' + print(f"πŸ“ Creating {init_file}") + init_file.write_text(init_content) + + # Create plugin.py with template + plugin_content = f'''""" +{class_name} implementation. + +This plugin [describe what it does]. 
+""" +import logging +from typing import Any, Dict, List, Optional + +from ..base import BasePlugin, PluginContext, PluginResult + +logger = logging.getLogger(__name__) + + +class {class_name}(BasePlugin): + """ + [Plugin description] + + Subscribes to: [list events you want to subscribe to] + - transcript.streaming: Real-time transcript segments + - conversation.complete: When conversation finishes + - memory.processed: After memory extraction + + Configuration (config/plugins.yml): + {plugin_name}: + enabled: true + events: + - conversation.complete # Change to your event + condition: + type: always # or wake_word, regex, etc. + # Your custom config here: + my_setting: ${{MY_ENV_VAR}} + """ + + # Declare which access levels this plugin supports + # Options: 'transcript', 'conversation', 'memory' + SUPPORTED_ACCESS_LEVELS: List[str] = ['conversation'] + + def __init__(self, config: Dict[str, Any]): + """ + Initialize plugin with configuration. + + Args: + config: Plugin configuration from config/plugins.yml + """ + super().__init__(config) + + # Load your custom configuration + self.my_setting = config.get('my_setting', 'default_value') + + logger.info(f"{class_name} configuration loaded") + + async def initialize(self): + """ + Initialize plugin resources. + + Called during application startup. + Use this to: + - Connect to external services + - Initialize clients + - Validate configuration + - Set up resources + + Raises: + Exception: If initialization fails + """ + if not self.enabled: + logger.info(f"{class_name} is disabled, skipping initialization") + return + + logger.info(f"Initializing {class_name}...") + + # TODO: Add your initialization code here + # Example: + # self.client = SomeClient(self.my_setting) + # await self.client.connect() + + logger.info(f"βœ… {class_name} initialized successfully") + + async def cleanup(self): + """ + Clean up plugin resources. + + Called during application shutdown. 
+ Use this to: + - Close connections + - Save state + - Release resources + """ + logger.info(f"{class_name} cleanup complete") + + # Implement the methods for events you subscribed to: + + async def on_transcript(self, context: PluginContext) -> Optional[PluginResult]: + """ + Handle transcript.streaming events. + + Context data contains: + - transcript: str - The transcript text + - segment_id: str - Unique segment identifier + - conversation_id: str - Current conversation ID + + For wake_word conditions, router adds: + - command: str - Command with wake word stripped + - original_transcript: str - Full transcript + + Args: + context: Plugin context with transcript data + + Returns: + PluginResult with success status and optional message + """ + # TODO: Implement if you subscribed to transcript.streaming + pass + + async def on_conversation_complete(self, context: PluginContext) -> Optional[PluginResult]: + """ + Handle conversation.complete events. + + Context data contains: + - conversation: dict - Full conversation data + - transcript: str - Complete transcript + - duration: float - Conversation duration + - conversation_id: str - Conversation identifier + + Args: + context: Plugin context with conversation data + + Returns: + PluginResult with success status and optional message + """ + try: + logger.info(f"Processing conversation complete event for user: {{context.user_id}}") + + # Extract data from context + conversation = context.data.get('conversation', {{}}) + transcript = context.data.get('transcript', '') + duration = context.data.get('duration', 0) + conversation_id = context.data.get('conversation_id', 'unknown') + + # TODO: Add your plugin logic here + # Example: + # - Process the transcript + # - Call external APIs + # - Store data + # - Trigger actions + + logger.info(f"Processed conversation {{conversation_id}}") + + return PluginResult( + success=True, + message="Processing complete", + data={{'conversation_id': conversation_id}} + ) + + except 
Exception as e: + logger.error(f"Error in {class_name}: {{e}}", exc_info=True) + return PluginResult( + success=False, + message=f"Error: {{str(e)}}" + ) + + async def on_memory_processed(self, context: PluginContext) -> Optional[PluginResult]: + """ + Handle memory.processed events. + + Context data contains: + - memories: list - Extracted memories + - conversation: dict - Source conversation + - memory_count: int - Number of memories created + - conversation_id: str - Conversation identifier + + Args: + context: Plugin context with memory data + + Returns: + PluginResult with success status and optional message + """ + # TODO: Implement if you subscribed to memory.processed + pass + + # Add your custom helper methods here: + + async def _my_helper_method(self, data: Any) -> Any: + """ + Example helper method. + + Args: + data: Input data + + Returns: + Processed data + """ + # TODO: Implement your helper logic + pass +''' + + plugin_file = plugin_dir / 'plugin.py' + print(f"πŸ“ Creating {plugin_file}") + plugin_file.write_text(plugin_content) + + # Create README.md + readme_content = f'''# {class_name} + +[Brief description of what your plugin does] + +## Features + +- Feature 1 +- Feature 2 +- Feature 3 + +## Configuration + +### Step 1: Environment Variables + +Add to `backends/advanced/.env`: + +```bash +# {class_name} Configuration +MY_ENV_VAR=your-value-here +``` + +### Step 2: Plugin Configuration + +Add to `config/plugins.yml`: + +```yaml +plugins: + {plugin_name}: + enabled: true + events: + - conversation.complete # Change to your event + condition: + type: always + + # Your custom configuration + my_setting: ${{MY_ENV_VAR}} +``` + +### Step 3: Restart Backend + +```bash +cd backends/advanced +docker compose restart +``` + +## How It Works + +1. [Step 1 description] +2. [Step 2 description] +3. 
[Step 3 description] + +## Configuration Options + +| Option | Type | Default | Description | +|--------|------|---------|-------------| +| `my_setting` | string | `default` | Description of setting | + +## Testing + +```bash +# Add testing instructions here +``` + +## Troubleshooting + +### Issue 1 + +Solution 1 + +### Issue 2 + +Solution 2 + +## Development + +### File Structure + +``` +plugins/{plugin_name}/ +β”œβ”€β”€ __init__.py # Plugin exports +β”œβ”€β”€ plugin.py # Main plugin logic +└── README.md # This file +``` + +## License + +MIT License - see project LICENSE file for details. +''' + + readme_file = plugin_dir / 'README.md' + print(f"πŸ“ Creating {readme_file}") + readme_file.write_text(readme_content) + + # Print success message and next steps + print(f"\nβœ… Plugin '{plugin_name}' created successfully!\n") + print(f"πŸ“ Location: {plugin_dir}\n") + print(f"πŸ“‹ Next steps:") + print(f" 1. Edit {plugin_file}") + print(f" - Implement your plugin logic") + print(f" - Choose which events to subscribe to") + print(f" - Add your configuration options") + print(f"") + print(f" 2. Update config/plugins.yml:") + print(f" ```yaml") + print(f" plugins:") + print(f" {plugin_name}:") + print(f" enabled: true") + print(f" events:") + print(f" - conversation.complete") + print(f" condition:") + print(f" type: always") + print(f" ```") + print(f"") + print(f" 3. Add environment variables to .env (if needed)") + print(f"") + print(f" 4. 
Restart backend:") + print(f" cd backends/advanced && docker compose restart") + print(f"") + print(f"πŸ“– Resources:") + print(f" - Plugin development guide: docs/plugin-development-guide.md") + print(f" - Example plugin: plugins/email_summarizer/") + print(f" - Base plugin class: plugins/base.py") + + +def main(): + parser = argparse.ArgumentParser( + description='Create a new Chronicle plugin with boilerplate structure', + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=''' +Examples: + uv run python scripts/create_plugin.py my_awesome_plugin + uv run python scripts/create_plugin.py slack_notifier + uv run python scripts/create_plugin.py todo_extractor --force + ''' + ) + parser.add_argument( + 'plugin_name', + help='Plugin name in snake_case (e.g., my_awesome_plugin)' + ) + parser.add_argument( + '--force', '-f', + action='store_true', + help='Overwrite existing plugin if it exists' + ) + + args = parser.parse_args() + + try: + create_plugin(args.plugin_name, force=args.force) + except KeyboardInterrupt: + print("\n\n❌ Plugin creation cancelled") + sys.exit(1) + except Exception as e: + print(f"\n❌ Error creating plugin: {e}") + sys.exit(1) + + +if __name__ == '__main__': + main() diff --git a/backends/advanced/scripts/laptop_client.py b/backends/advanced/scripts/laptop_client.py index 385a4a1b..a0047f3b 100644 --- a/backends/advanced/scripts/laptop_client.py +++ b/backends/advanced/scripts/laptop_client.py @@ -15,7 +15,7 @@ # Default WebSocket settings DEFAULT_HOST = "localhost" DEFAULT_PORT = 8000 -DEFAULT_ENDPOINT = "/ws_pcm" +DEFAULT_ENDPOINT = "/ws?codec=pcm" # Audio format will be determined from the InputMicStream instance diff --git a/backends/advanced/src/advanced_omi_backend/app_config.py b/backends/advanced/src/advanced_omi_backend/app_config.py index 1e24fb54..c87398f3 100644 --- a/backends/advanced/src/advanced_omi_backend/app_config.py +++ b/backends/advanced/src/advanced_omi_backend/app_config.py @@ -29,8 +29,7 @@ class AppConfig: 
def __init__(self): # MongoDB Configuration self.mongodb_uri = os.getenv("MONGODB_URI", "mongodb://mongo:27017") - # default to legacy value to avoid breaking peoples .env - self.mongodb_database = os.getenv("MONGODB_DATABASE", "friend-lite") + self.mongodb_database = os.getenv("MONGODB_DATABASE", "chronicle") self.mongo_client = AsyncIOMotorClient(self.mongodb_uri) self.db = self.mongo_client.get_default_database(self.mongodb_database) self.users_col = self.db["users"] @@ -47,11 +46,6 @@ def __init__(self): os.getenv("NEW_CONVERSATION_TIMEOUT_MINUTES", "1.5") ) - # Audio cropping configuration - self.audio_cropping_enabled = os.getenv("AUDIO_CROPPING_ENABLED", "true").lower() == "true" - self.min_speech_segment_duration = float(os.getenv("MIN_SPEECH_SEGMENT_DURATION", "1.0")) - self.cropping_context_padding = float(os.getenv("CROPPING_CONTEXT_PADDING", "0.1")) - # Transcription Configuration (registry-based) self.transcription_provider = get_transcription_provider(None) if self.transcription_provider: diff --git a/backends/advanced/src/advanced_omi_backend/app_factory.py b/backends/advanced/src/advanced_omi_backend/app_factory.py index 7ccda184..4458ed9e 100644 --- a/backends/advanced/src/advanced_omi_backend/app_factory.py +++ b/backends/advanced/src/advanced_omi_backend/app_factory.py @@ -42,6 +42,52 @@ application_logger = logging.getLogger("audio_processing") +async def initialize_openmemory_user() -> None: + """Initialize and register OpenMemory user if using OpenMemory MCP provider. 
+ + This function: + - Checks if OpenMemory MCP is configured as the memory provider + - Registers the configured user with OpenMemory server + - Creates a test memory and deletes it to trigger user creation + - Logs success or warning if OpenMemory is not reachable + """ + from advanced_omi_backend.services.memory.config import build_memory_config_from_env, MemoryProvider + + memory_provider_config = build_memory_config_from_env() + + if memory_provider_config.memory_provider != MemoryProvider.OPENMEMORY_MCP: + return + + try: + from advanced_omi_backend.services.memory.providers.mcp_client import MCPClient + + # Get configured user_id and server_url + openmemory_config = memory_provider_config.openmemory_config + user_id = openmemory_config.get("user_id", "openmemory") if openmemory_config else "openmemory" + server_url = openmemory_config.get("server_url", "http://host.docker.internal:8765") if openmemory_config else "http://host.docker.internal:8765" + client_name = openmemory_config.get("client_name", "chronicle") if openmemory_config else "chronicle" + + application_logger.info(f"Registering OpenMemory user: {user_id} at {server_url}") + + # Make a lightweight registration call (create and delete dummy memory) + async with MCPClient(server_url=server_url, client_name=client_name, user_id=user_id) as client: + # Test connection first + is_connected = await client.test_connection() + if is_connected: + # Create and immediately delete a dummy memory to trigger user creation + memory_ids = await client.add_memories("Chronicle initialization - user registration test") + if memory_ids: + # Delete the test memory + await client.delete_memory(memory_ids[0]) + application_logger.info(f"βœ… Registered OpenMemory user: {user_id}") + else: + application_logger.warning(f"⚠️ OpenMemory MCP not reachable at {server_url}") + application_logger.info("User will be auto-created on first memory operation") + except Exception as e: + application_logger.warning(f"⚠️ Could not 
register OpenMemory user: {e}") + application_logger.info("User will be auto-created on first memory operation") + + @asynccontextmanager async def lifespan(app: FastAPI): """Manage application lifespan events.""" @@ -54,12 +100,14 @@ async def lifespan(app: FastAPI): try: from beanie import init_beanie from advanced_omi_backend.models.conversation import Conversation - from advanced_omi_backend.models.audio_file import AudioFile + from advanced_omi_backend.models.audio_chunk import AudioChunkDocument from advanced_omi_backend.models.user import User + from advanced_omi_backend.models.waveform import WaveformData + from advanced_omi_backend.models.annotation import Annotation await init_beanie( database=config.db, - document_models=[User, Conversation, AudioFile], + document_models=[User, Conversation, AudioChunkDocument, WaveformData, Annotation], ) application_logger.info("Beanie initialized for all document models") except Exception as e: @@ -111,6 +159,11 @@ async def lifespan(app: FastAPI): from advanced_omi_backend.services.audio_stream import AudioStreamProducer app.state.audio_stream_producer = AudioStreamProducer(app.state.redis_audio_stream) application_logger.info("βœ… Redis client for audio streaming producer initialized") + + # Initialize ClientManager Redis for cross-container clientβ†’user mapping + from advanced_omi_backend.client_manager import initialize_redis_for_client_manager + initialize_redis_for_client_manager(config.redis_url) + except Exception as e: application_logger.error(f"Failed to initialize Redis client for audio streaming: {e}", exc_info=True) application_logger.warning("Audio streaming producer will not be available") @@ -119,9 +172,42 @@ async def lifespan(app: FastAPI): # Memory service will be lazily initialized when first used application_logger.info("Memory service will be initialized on first use (lazy loading)") + # Register OpenMemory user if using openmemory_mcp provider + await initialize_openmemory_user() + # 
SystemTracker is used for monitoring and debugging application_logger.info("Using SystemTracker for monitoring and debugging") + # Initialize plugins using plugin service + try: + from advanced_omi_backend.services.plugin_service import init_plugin_router, set_plugin_router + + plugin_router = init_plugin_router() + + if plugin_router: + # Initialize async resources for each enabled plugin + for plugin_id, plugin in plugin_router.plugins.items(): + if plugin.enabled: + try: + await plugin.initialize() + application_logger.info(f"βœ… Plugin '{plugin_id}' initialized") + except Exception as e: + application_logger.error(f"Failed to initialize plugin '{plugin_id}': {e}", exc_info=True) + + application_logger.info(f"Plugins initialized: {len(plugin_router.plugins)} active") + + # Store in app state for API access + app.state.plugin_router = plugin_router + # Register with plugin service for worker access + set_plugin_router(plugin_router) + else: + application_logger.info("No plugins configured") + app.state.plugin_router = None + + except Exception as e: + application_logger.error(f"Failed to initialize plugin system: {e}", exc_info=True) + app.state.plugin_router = None + application_logger.info("Application ready - using application-level processing architecture.") logger.info("App ready") @@ -162,6 +248,14 @@ async def lifespan(app: FastAPI): # Stop metrics collection and save final report application_logger.info("Metrics collection stopped") + # Shutdown plugins + try: + from advanced_omi_backend.services.plugin_service import cleanup_plugin_router + await cleanup_plugin_router() + application_logger.info("Plugins shut down") + except Exception as e: + application_logger.error(f"Error shutting down plugins: {e}") + # Shutdown memory service and speaker service shutdown_memory_service() application_logger.info("Memory and speaker services shut down.") diff --git a/backends/advanced/src/advanced_omi_backend/auth.py 
b/backends/advanced/src/advanced_omi_backend/auth.py index f1b7909a..2e14b8b0 100644 --- a/backends/advanced/src/advanced_omi_backend/auth.py +++ b/backends/advanced/src/advanced_omi_backend/auth.py @@ -224,6 +224,9 @@ async def create_admin_user_if_needed(): existing_admin = await user_db.get_by_email(ADMIN_EMAIL) if existing_admin: + logger.debug(f"existing_admin.id = {existing_admin.id}, type = {type(existing_admin.id)}") + logger.debug(f"str(existing_admin.id) = {str(existing_admin.id)}") + logger.debug(f"existing_admin.user_id = {existing_admin.user_id}") logger.info( f"βœ… Admin user already exists: {existing_admin.user_id} ({existing_admin.email})" ) diff --git a/backends/advanced/src/advanced_omi_backend/client_manager.py b/backends/advanced/src/advanced_omi_backend/client_manager.py index 5a3131b5..e55b3502 100644 --- a/backends/advanced/src/advanced_omi_backend/client_manager.py +++ b/backends/advanced/src/advanced_omi_backend/client_manager.py @@ -9,6 +9,7 @@ import logging import uuid from typing import TYPE_CHECKING, Dict, Optional +import redis.asyncio as redis if TYPE_CHECKING: from advanced_omi_backend.client import ClientState @@ -21,6 +22,9 @@ _client_to_user_mapping: Dict[str, str] = {} # Active clients only _all_client_user_mappings: Dict[str, str] = {} # All clients including disconnected +# Redis client for cross-container clientβ†’user mapping +_redis_client: Optional[redis.Redis] = None + class ClientManager: """ @@ -372,9 +376,33 @@ def unregister_client_user_mapping(client_id: str): logger.warning(f"⚠️ Attempted to unregister non-existent client {client_id}") +async def track_client_user_relationship_async(client_id: str, user_id: str, ttl: int = 86400): + """ + Track that a client belongs to a user (async, writes to Redis for cross-container support). 
+ + Args: + client_id: The client ID + user_id: The user ID that owns this client + ttl: Time-to-live in seconds (default 24 hours) + """ + _all_client_user_mappings[client_id] = user_id # In-memory fallback + + if _redis_client: + try: + await _redis_client.setex(f"client:owner:{client_id}", ttl, user_id) + logger.debug(f"βœ… Tracked client {client_id} β†’ user {user_id} in Redis (TTL: {ttl}s)") + except Exception as e: + logger.warning(f"Failed to track client in Redis: {e}") + else: + logger.debug(f"Tracked client {client_id} relationship to user {user_id} (in-memory only)") + + def track_client_user_relationship(client_id: str, user_id: str): """ - Track that a client belongs to a user (persists after disconnection for database queries). + Track that a client belongs to a user (sync version for backward compatibility). + + WARNING: This is synchronous and cannot use Redis. Use track_client_user_relationship_async() + instead in async contexts for cross-container support. Args: client_id: The client ID @@ -444,9 +472,45 @@ def get_user_clients_active(user_id: str) -> list[str]: return user_clients +def initialize_redis_for_client_manager(redis_url: str): + """ + Initialize Redis client for cross-container clientβ†’user mapping. + + Args: + redis_url: Redis connection URL + """ + global _redis_client + _redis_client = redis.from_url(redis_url, decode_responses=True) + logger.info(f"βœ… ClientManager Redis initialized: {redis_url}") + + +async def get_client_owner_async(client_id: str) -> Optional[str]: + """ + Get the user ID that owns a specific client (async Redis lookup). 
+ + Args: + client_id: The client ID to look up + + Returns: + User ID if found, None otherwise + """ + if _redis_client: + try: + user_id = await _redis_client.get(f"client:owner:{client_id}") + return user_id + except Exception as e: + logger.warning(f"Redis lookup failed for client {client_id}: {e}") + + # Fallback to in-memory mapping + return _all_client_user_mappings.get(client_id) + + def get_client_owner(client_id: str) -> Optional[str]: """ - Get the user ID that owns a specific client. + Get the user ID that owns a specific client (sync version for backward compatibility). + + WARNING: This is synchronous and cannot use Redis. Use get_client_owner_async() instead + in async contexts for cross-container support. Args: client_id: The client ID to look up diff --git a/backends/advanced/src/advanced_omi_backend/clients/audio_stream_client.py b/backends/advanced/src/advanced_omi_backend/clients/audio_stream_client.py index af89fd51..1f3c695a 100644 --- a/backends/advanced/src/advanced_omi_backend/clients/audio_stream_client.py +++ b/backends/advanced/src/advanced_omi_backend/clients/audio_stream_client.py @@ -65,7 +65,7 @@ def __init__( base_url: str, token: str, device_name: str = "python-client", - endpoint: str = "ws_pcm", + endpoint: str = "ws?codec=pcm", ): """Initialize the audio stream client. 
@@ -73,7 +73,7 @@ def __init__( base_url: Base URL of the backend (e.g., "http://localhost:8000") token: JWT authentication token device_name: Device name for client identification - endpoint: WebSocket endpoint ("ws_pcm" or "ws_omi") + endpoint: WebSocket endpoint ("ws?codec=pcm" or "ws?codec=opus") """ self.base_url = base_url self.token = token @@ -87,7 +87,9 @@ def __init__( def ws_url(self) -> str: """Build WebSocket URL from base URL.""" url = self.base_url.replace("http://", "ws://").replace("https://", "wss://") - return f"{url}/{self.endpoint}?token={self.token}&device_name={self.device_name}" + # Check if endpoint already has query params + separator = "&" if "?" in self.endpoint else "?" + return f"{url}/{self.endpoint}{separator}token={self.token}&device_name={self.device_name}" async def connect(self, wait_for_ready: bool = True) -> WebSocketClientProtocol: """Connect to the WebSocket endpoint. @@ -105,8 +107,8 @@ async def connect(self, wait_for_ready: bool = True) -> WebSocketClientProtocol: self.ws = await websockets.connect(self.ws_url) logger.info("WebSocket connected") - if wait_for_ready and self.endpoint == "ws_pcm": - # PCM endpoint sends "ready" message after auth (line 261-268 in websocket_controller.py) + if wait_for_ready and "codec=pcm" in self.endpoint: + # PCM codec sends "ready" message after auth (line 261-268 in websocket_controller.py) ready_msg = await self.ws.recv() ready = json.loads(ready_msg.strip() if isinstance(ready_msg, str) else ready_msg.decode().strip()) if ready.get("type") != "ready": @@ -121,6 +123,7 @@ async def send_audio_start( sample_rate: int = OMI_SAMPLE_RATE, sample_width: int = OMI_SAMPLE_WIDTH, channels: int = OMI_CHANNELS, + always_persist: bool = False, ) -> None: """Send Wyoming audio-start event. 
@@ -129,6 +132,7 @@ async def send_audio_start( sample_rate: Audio sample rate in Hz (default: 16000) sample_width: Bytes per sample (default: 2 for 16-bit) channels: Number of audio channels (default: 1) + always_persist: Save audio even if transcription fails (default: False) Note: The mode is inside the "data" dict, matching _handle_audio_session_start @@ -144,11 +148,15 @@ async def send_audio_start( "width": sample_width, "channels": channels, "mode": recording_mode, + "always_persist": always_persist, }, "payload_length": None, } + print(f"πŸ”΅ CLIENT: Sending audio-start message: {header}") + logger.info(f"πŸ”΅ CLIENT: Sending audio-start message: {header}") await self.ws.send(json.dumps(header) + "\n") - logger.info(f"Sent audio-start with mode={recording_mode}") + print(f"βœ… CLIENT: Sent audio-start with mode={recording_mode}, always_persist={always_persist}") + logger.info(f"βœ… CLIENT: Sent audio-start with mode={recording_mode}, always_persist={always_persist}") async def send_audio_chunk_wyoming( self, @@ -230,6 +238,7 @@ async def stream_wav_file( use_wyoming: bool = True, recording_mode: str = "streaming", realtime_factor: float = 0.1, + always_persist: bool = False, ) -> int: """Stream a WAV file in chunks, simulating real-time audio. 
@@ -239,6 +248,7 @@ async def stream_wav_file( use_wyoming: If True, use Wyoming protocol; if False, send raw binary recording_mode: "streaming" or "batch" realtime_factor: Fraction of real-time to simulate (0.1 = 10x speed) + always_persist: Save audio even if transcription fails (default: False) Returns: Number of chunks sent @@ -266,6 +276,7 @@ async def stream_wav_file( sample_rate=sample_rate, sample_width=sample_width, channels=channels, + always_persist=always_persist, ) # Reset counters @@ -301,9 +312,19 @@ async def stream_wav_file( async def close(self) -> None: """Close the WebSocket connection.""" if self.ws: - await self.ws.close() - self.ws = None - logger.info("WebSocket connection closed") + try: + # Add timeout to WebSocket close to prevent hanging + await asyncio.wait_for(self.ws.close(), timeout=2.0) + logger.info("WebSocket connection closed cleanly") + except asyncio.TimeoutError: + logger.warning("WebSocket close timed out after 2s, forcing close") + # Force close without waiting for handshake + if hasattr(self.ws, 'transport') and self.ws.transport: + self.ws.transport.close() + except Exception as e: + logger.error(f"Error during WebSocket close: {e}") + finally: + self.ws = None async def __aenter__(self) -> "AudioStreamClient": """Async context manager entry.""" @@ -323,6 +344,7 @@ def stream_audio_file( device_name: str = "robot-test", recording_mode: str = "streaming", use_wyoming: bool = True, + always_persist: bool = False, ) -> int: """Synchronous wrapper for streaming audio file. 
@@ -336,6 +358,7 @@ def stream_audio_file( device_name: Device name for client identification recording_mode: "streaming" or "batch" use_wyoming: If True, use Wyoming protocol + always_persist: Save audio even if transcription fails (default: False) Returns: Number of chunks sent @@ -347,6 +370,7 @@ async def _run() -> int: wav_path, use_wyoming=use_wyoming, recording_mode=recording_mode, + always_persist=always_persist, ) return asyncio.run(_run()) @@ -395,6 +419,7 @@ def start_stream( token: str, device_name: str = "robot-test", recording_mode: str = "streaming", + always_persist: bool = False, ) -> str: """Start a new audio stream (non-blocking). @@ -403,6 +428,7 @@ def start_stream( token: JWT token device_name: Device name for client ID recording_mode: "streaming" or "batch" + always_persist: Save audio even if transcription fails (default: False) Returns: stream_id: Unique ID for this stream session @@ -428,14 +454,16 @@ def run_loop(): # Connect and send audio-start async def _connect_and_start(): try: + logger.info(f"πŸ”΅ CLIENT: Stream {stream_id} connecting for {device_name}...") await client.connect() session.connected = True - await client.send_audio_start(recording_mode=recording_mode) + logger.info(f"βœ… CLIENT: Stream {stream_id} connected, sending audio-start...") + await client.send_audio_start(recording_mode=recording_mode, always_persist=always_persist) session.audio_started = True - logger.info(f"Stream {stream_id} started for {device_name}") + logger.info(f"βœ… CLIENT: Stream {stream_id} started for {device_name}") except Exception as e: session.error = str(e) - logger.error(f"Stream {stream_id} failed to start: {e}") + logger.error(f"❌ CLIENT: Stream {stream_id} failed to start: {e}") future = asyncio.run_coroutine_threadsafe(_connect_and_start(), loop) future.result(timeout=10) # Wait for connection @@ -543,6 +571,39 @@ async def _stop(): logger.info(f"Stream {stream_id} stopped, sent {total_chunks} chunks") return total_chunks + def 
close_stream_without_stop(self, stream_id: str) -> int: + """Close WebSocket connection without sending audio-stop event. + + This simulates abrupt disconnection (network failure, client crash) + and should trigger websocket_disconnect end_reason. + + Args: + stream_id: Stream session ID + + Returns: + Total chunks sent during this session + """ + session = self._sessions.get(stream_id) + if not session: + raise ValueError(f"Unknown stream_id: {stream_id}") + + async def _close_abruptly(): + # Just close the connection without audio-stop + await session.client.close() + + future = asyncio.run_coroutine_threadsafe(_close_abruptly(), session.loop) + future.result(timeout=10) + + # Stop the event loop + session.loop.call_soon_threadsafe(session.loop.stop) + session.thread.join(timeout=5) + + total_chunks = session.chunk_count + del self._sessions[stream_id] + + logger.info(f"Stream {stream_id} closed abruptly (no audio-stop), sent {total_chunks} chunks") + return total_chunks + def get_session(self, stream_id: str) -> Optional[StreamSession]: """Get session info for a stream.""" return self._sessions.get(stream_id) diff --git a/backends/advanced/src/advanced_omi_backend/config.py b/backends/advanced/src/advanced_omi_backend/config.py index 2b07a8d4..77a842ce 100644 --- a/backends/advanced/src/advanced_omi_backend/config.py +++ b/backends/advanced/src/advanced_omi_backend/config.py @@ -1,15 +1,27 @@ """ Configuration management for Chronicle backend. -Currently contains diarization settings because they were used in multiple places -causing circular imports. Other configurations can be moved here as needed. +Uses OmegaConf for unified YAML configuration with environment variable interpolation. +Secrets are stored in .env files, all other config in config/config.yml. 
""" -import json import logging import os -import shutil +from dataclasses import dataclass from pathlib import Path +from typing import Optional + +from omegaconf import OmegaConf + +from advanced_omi_backend.config_loader import ( + get_backend_config, + get_config_dir, + load_config, +) +from advanced_omi_backend.config_loader import reload_config as reload_omegaconf_config +from advanced_omi_backend.config_loader import ( + save_config_section, +) logger = logging.getLogger(__name__) @@ -17,152 +29,203 @@ DATA_DIR = Path(os.getenv("DATA_DIR", "/app/data")) CHUNK_DIR = Path("./audio_chunks") # Mounted to ./data/audio_chunks by Docker -# Default diarization settings -DEFAULT_DIARIZATION_SETTINGS = { - "diarization_source": "pyannote", - "similarity_threshold": 0.15, - "min_duration": 0.5, - "collar": 2.0, - "min_duration_off": 1.5, - "min_speakers": 2, - "max_speakers": 6 -} - -# Default speech detection settings -DEFAULT_SPEECH_DETECTION_SETTINGS = { - "min_words": 10, # Minimum words to create conversation (increased from 5) - "min_confidence": 0.7, # Word confidence threshold (increased from 0.5) - "min_duration": 10.0, # Minimum speech duration in seconds (increased from 2.0) -} - -# Default conversation stop settings -DEFAULT_CONVERSATION_STOP_SETTINGS = { - "transcription_buffer_seconds": 120, # Periodic transcription interval (2 minutes) - "speech_inactivity_threshold": 60, # Speech gap threshold for closure (1 minute) -} - -# Default audio storage settings -DEFAULT_AUDIO_STORAGE_SETTINGS = { - "audio_base_path": "/app/data", # Main audio directory (where volume is mounted) - "audio_chunks_path": "/app/audio_chunks", # Full path to audio chunks subfolder -} - -# Global cache for diarization settings -_diarization_settings = None - - -def get_diarization_config_path(): - """Get the path to the diarization config file.""" - # Try different locations in order of preference - # 1. 
Data directory (for persistence across container restarts) - data_path = Path("/app/data/diarization_config.json") - if data_path.parent.exists(): - return data_path - - # 2. App root directory - app_path = Path("/app/diarization_config.json") - if app_path.parent.exists(): - return app_path - - # 3. Local development path - local_path = Path("diarization_config.json") - return local_path - - -def load_diarization_settings_from_file(): - """Load diarization settings from file or create from template.""" - global _diarization_settings - - config_path = get_diarization_config_path() - template_path = Path("/app/diarization_config.json.template") - - # If no template, try local development path - if not template_path.exists(): - template_path = Path("diarization_config.json.template") - - # If config doesn't exist, try to copy from template - if not config_path.exists(): - if template_path.exists(): - try: - # Ensure parent directory exists - config_path.parent.mkdir(parents=True, exist_ok=True) - shutil.copy(template_path, config_path) - logger.info(f"Created diarization config from template at {config_path}") - except Exception as e: - logger.warning(f"Could not copy template to {config_path}: {e}") - - # Load from file if it exists - if config_path.exists(): - try: - with open(config_path, 'r') as f: - _diarization_settings = json.load(f) - logger.info(f"Loaded diarization settings from {config_path}") - return _diarization_settings - except Exception as e: - logger.error(f"Error loading diarization settings from {config_path}: {e}") - - # Fall back to defaults - _diarization_settings = DEFAULT_DIARIZATION_SETTINGS.copy() - logger.info("Using default diarization settings") - return _diarization_settings - - -def save_diarization_settings_to_file(settings): - """Save diarization settings to file.""" - global _diarization_settings - - config_path = get_diarization_config_path() - - try: - # Ensure parent directory exists - config_path.parent.mkdir(parents=True, 
exist_ok=True) - - # Write settings to file - with open(config_path, 'w') as f: - json.dump(settings, f, indent=2) - - # Update cache - _diarization_settings = settings - - logger.info(f"Saved diarization settings to {config_path}") - return True - except Exception as e: - logger.error(f"Error saving diarization settings to {config_path}: {e}") - return False - - -def get_speech_detection_settings(): - """Get speech detection settings from environment or defaults.""" - return { - "min_words": int(os.getenv("SPEECH_DETECTION_MIN_WORDS", DEFAULT_SPEECH_DETECTION_SETTINGS["min_words"])), - "min_confidence": float(os.getenv("SPEECH_DETECTION_MIN_CONFIDENCE", DEFAULT_SPEECH_DETECTION_SETTINGS["min_confidence"])), - "min_duration": float(os.getenv("SPEECH_DETECTION_MIN_DURATION", DEFAULT_SPEECH_DETECTION_SETTINGS["min_duration"])), - } +# ============================================================================ +# Configuration Functions (OmegaConf-based) +# ============================================================================ +def get_config_yml_path() -> Path: + """ + Get path to config.yml file. -def get_conversation_stop_settings(): - """Get conversation stop settings from environment or defaults.""" + Returns: + Path to config.yml + """ + return get_config_dir() / "config.yml" + +def get_config(force_reload: bool = False) -> dict: + """ + Get merged configuration using OmegaConf. + + Wrapper around load_config() from config_loader for backward compatibility. 
+ + Args: + force_reload: If True, reload from disk even if cached + + Returns: + Merged configuration dictionary with all settings + """ + cfg = load_config(force_reload=force_reload) + return OmegaConf.to_container(cfg, resolve=True) + + +def reload_config(): + """Reload configuration from disk (invalidate cache).""" + return reload_omegaconf_config() + + +# ============================================================================ +# Diarization Settings (OmegaConf-based) +# ============================================================================ + +def get_diarization_settings() -> dict: + """ + Get diarization settings using OmegaConf. + + Returns: + Dict with diarization configuration (resolved from YAML + env vars) + """ + cfg = get_backend_config('diarization') + return OmegaConf.to_container(cfg, resolve=True) + + +def save_diarization_settings(settings: dict) -> bool: + """ + Save diarization settings to config.yml using OmegaConf. + + Args: + settings: Dict with diarization settings to save + + Returns: + True if saved successfully, False otherwise + """ + return save_config_section('backend.diarization', settings) + + +# ============================================================================ +# Cleanup Settings (OmegaConf-based) +# ============================================================================ + +@dataclass +class CleanupSettings: + """Cleanup configuration for soft-deleted conversations.""" + auto_cleanup_enabled: bool = False + retention_days: int = 30 + + +def get_cleanup_settings() -> dict: + """ + Get cleanup settings using OmegaConf. 
+ + Returns: + Dict with auto_cleanup_enabled and retention_days + """ + cfg = get_backend_config('cleanup') + return OmegaConf.to_container(cfg, resolve=True) - return { - "transcription_buffer_seconds": float(os.getenv("TRANSCRIPTION_BUFFER_SECONDS", DEFAULT_CONVERSATION_STOP_SETTINGS["transcription_buffer_seconds"])), - "speech_inactivity_threshold": float(os.getenv("SPEECH_INACTIVITY_THRESHOLD_SECONDS", DEFAULT_CONVERSATION_STOP_SETTINGS["speech_inactivity_threshold"])), - "min_word_confidence": float(os.getenv("SPEECH_DETECTION_MIN_CONFIDENCE", DEFAULT_SPEECH_DETECTION_SETTINGS["min_confidence"])), - } +def save_cleanup_settings(settings: CleanupSettings) -> bool: + """ + Save cleanup settings to config.yml using OmegaConf. + + Args: + settings: CleanupSettings dataclass instance + + Returns: + True if saved successfully, False otherwise + """ + from dataclasses import asdict + return save_config_section('backend.cleanup', asdict(settings)) + + +# ============================================================================ +# Speech Detection Settings (OmegaConf-based) +# ============================================================================ + +def get_speech_detection_settings() -> dict: + """ + Get speech detection settings using OmegaConf. + + Returns: + Dict with min_words, min_confidence, min_duration + """ + cfg = get_backend_config('speech_detection') + return OmegaConf.to_container(cfg, resolve=True) + + +# ============================================================================ +# Conversation Stop Settings (OmegaConf-based) +# ============================================================================ + +def get_conversation_stop_settings() -> dict: + """ + Get conversation stop settings using OmegaConf. 
+ + Returns: + Dict with transcription_buffer_seconds, speech_inactivity_threshold + """ + cfg = get_backend_config('conversation_stop') + settings = OmegaConf.to_container(cfg, resolve=True) + + # Add min_word_confidence from speech_detection for backward compatibility + speech_cfg = get_backend_config('speech_detection') + settings['min_word_confidence'] = OmegaConf.to_container(speech_cfg, resolve=True).get('min_confidence', 0.7) + + return settings + + +# ============================================================================ +# Audio Storage Settings (OmegaConf-based) +# ============================================================================ + +def get_audio_storage_settings() -> dict: + """ + Get audio storage settings using OmegaConf. + + Returns: + Dict with audio_base_path, audio_chunks_path + """ + cfg = get_backend_config('audio_storage') + return OmegaConf.to_container(cfg, resolve=True) + + +# ============================================================================ +# Miscellaneous Settings (OmegaConf-based) +# ============================================================================ + +def get_misc_settings() -> dict: + """ + Get miscellaneous configuration settings using OmegaConf. 
+ + Returns: + Dict with always_persist_enabled and use_provider_segments + """ + # Get audio settings for always_persist_enabled + audio_cfg = get_backend_config('audio') + audio_settings = OmegaConf.to_container(audio_cfg, resolve=True) if audio_cfg else {} + + # Get transcription settings for use_provider_segments + transcription_cfg = get_backend_config('transcription') + transcription_settings = OmegaConf.to_container(transcription_cfg, resolve=True) if transcription_cfg else {} -def get_audio_storage_settings(): - """Get audio storage settings from environment or defaults.""" - - # Get base path and derive chunks path - audio_base_path = os.getenv("AUDIO_BASE_PATH", DEFAULT_AUDIO_STORAGE_SETTINGS["audio_base_path"]) - audio_chunks_path = os.getenv("AUDIO_CHUNKS_PATH", f"{audio_base_path}/audio_chunks") - return { - "audio_base_path": audio_base_path, - "audio_chunks_path": audio_chunks_path, + 'always_persist_enabled': audio_settings.get('always_persist_enabled', False), + 'use_provider_segments': transcription_settings.get('use_provider_segments', False) } -# Initialize settings on module load -_diarization_settings = load_diarization_settings_from_file() \ No newline at end of file +def save_misc_settings(settings: dict) -> bool: + """ + Save miscellaneous settings to config.yml using OmegaConf. 
+ + Args: + settings: Dict with always_persist_enabled and/or use_provider_segments + + Returns: + True if saved successfully, False otherwise + """ + success = True + + # Save audio settings if always_persist_enabled is provided + if 'always_persist_enabled' in settings: + audio_settings = {'always_persist_enabled': settings['always_persist_enabled']} + if not save_config_section('backend.audio', audio_settings): + success = False + + # Save transcription settings if use_provider_segments is provided + if 'use_provider_segments' in settings: + transcription_settings = {'use_provider_segments': settings['use_provider_segments']} + if not save_config_section('backend.transcription', transcription_settings): + success = False + + return success \ No newline at end of file diff --git a/backends/advanced/src/advanced_omi_backend/config_loader.py b/backends/advanced/src/advanced_omi_backend/config_loader.py new file mode 100644 index 00000000..5d25debd --- /dev/null +++ b/backends/advanced/src/advanced_omi_backend/config_loader.py @@ -0,0 +1,169 @@ +""" +OmegaConf-based configuration management for Chronicle. + +Provides unified config loading with environment variable interpolation. +""" + +import logging +import os +from pathlib import Path +from typing import Optional + +from omegaconf import DictConfig, OmegaConf + +logger = logging.getLogger(__name__) + +# Global config cache +_config_cache: Optional[DictConfig] = None + + +def get_config_dir() -> Path: + """Get config directory path (single source of truth).""" + config_dir = os.getenv("CONFIG_DIR", "/app/config") + return Path(config_dir) + + +def get_plugins_yml_path() -> Path: + """ + Get path to plugins.yml file (single source of truth). + + Returns: + Path to plugins.yml + """ + return get_config_dir() / "plugins.yml" + + +def load_config(force_reload: bool = False) -> DictConfig: + """ + Load and merge configuration using OmegaConf. + + Merge priority (later overrides earlier): + 1. 
config/defaults.yml (shipped defaults) + 2. config/config.yml (user overrides) + 3. Environment variables (via ${oc.env:VAR,default} syntax) + + Args: + force_reload: If True, reload from disk even if cached + + Returns: + Merged DictConfig with all settings + """ + global _config_cache + + if _config_cache is not None and not force_reload: + return _config_cache + + config_dir = get_config_dir() + defaults_path = config_dir / "defaults.yml" + + # Support CONFIG_FILE env var for test configurations + config_file = os.getenv("CONFIG_FILE", "config.yml") + # Handle both absolute paths and relative filenames + if os.path.isabs(config_file): + config_path = Path(config_file) + else: + config_path = config_dir / config_file + + # Load defaults + defaults = {} + if defaults_path.exists(): + try: + defaults = OmegaConf.load(defaults_path) + logger.info(f"Loaded defaults from {defaults_path}") + except Exception as e: + logger.warning(f"Could not load defaults from {defaults_path}: {e}") + + # Load user config + user_config = {} + if config_path.exists(): + try: + user_config = OmegaConf.load(config_path) + logger.info(f"Loaded config from {config_path}") + except Exception as e: + logger.error(f"Error loading config from {config_path}: {e}") + + # Merge configurations (user config overrides defaults) + merged = OmegaConf.merge(defaults, user_config) + + # Cache result + _config_cache = merged + + logger.info("Configuration loaded successfully with OmegaConf") + return merged + + +def reload_config() -> DictConfig: + """Reload configuration from disk (invalidate cache).""" + global _config_cache + _config_cache = None + return load_config(force_reload=True) + + +def get_backend_config(section: Optional[str] = None) -> DictConfig: + """ + Get backend configuration section. 
+ + Args: + section: Optional subsection (e.g., 'diarization', 'cleanup') + + Returns: + DictConfig for backend section or subsection + """ + cfg = load_config() + if 'backend' not in cfg: + return OmegaConf.create({}) + + backend_cfg = cfg.backend + if section: + return backend_cfg.get(section, OmegaConf.create({})) + return backend_cfg + + +def get_service_config(service_name: str) -> DictConfig: + """ + Get service configuration section. + + Args: + service_name: Service name (e.g., 'speaker_recognition', 'asr_services') + + Returns: + DictConfig for service section + """ + cfg = load_config() + return cfg.get(service_name, OmegaConf.create({})) + + +def save_config_section(section_path: str, values: dict) -> bool: + """ + Update a config section and save to config.yml. + + Args: + section_path: Dot-separated path (e.g., 'backend.diarization') + values: Dict with new values + + Returns: + True if saved successfully + """ + try: + config_path = get_config_dir() / "config.yml" + + # Load existing config + existing_config = {} + if config_path.exists(): + existing_config = OmegaConf.load(config_path) + + # Update section using dot notation + OmegaConf.update(existing_config, section_path, values, merge=True) + + # Save back to file + OmegaConf.save(existing_config, config_path) + + # Invalidate cache + reload_config() + + logger.info(f"Saved config section '{section_path}' to {config_path}") + return True + + except Exception as e: + logger.error(f"Error saving config section '{section_path}': {e}") + return False diff --git a/backends/advanced/src/advanced_omi_backend/controllers/audio_controller.py b/backends/advanced/src/advanced_omi_backend/controllers/audio_controller.py index 4810810d..734df6ed 100644 --- a/backends/advanced/src/advanced_omi_backend/controllers/audio_controller.py +++ b/backends/advanced/src/advanced_omi_backend/controllers/audio_controller.py @@ -10,19 +10,26 @@ import logging import time import uuid -from pathlib import Path from fastapi 
import UploadFile from fastapi.responses import JSONResponse +from advanced_omi_backend.controllers.queue_controller import ( + JOB_RESULT_TTL, + start_post_conversation_jobs, + transcription_queue, +) +from advanced_omi_backend.models.conversation import create_conversation +from advanced_omi_backend.models.user import User +from advanced_omi_backend.services.transcription import is_transcription_available +from advanced_omi_backend.utils.audio_chunk_utils import convert_audio_to_chunks from advanced_omi_backend.utils.audio_utils import ( AudioValidationError, - write_audio_file, + validate_and_prepare_audio, +) +from advanced_omi_backend.workers.transcription_jobs import ( + transcribe_full_audio_job, ) -from advanced_omi_backend.models.job import JobPriority -from advanced_omi_backend.models.user import User -from advanced_omi_backend.models.conversation import create_conversation -from advanced_omi_backend.models.conversation import Conversation logger = logging.getLogger(__name__) audio_logger = logging.getLogger("audio_processing") @@ -30,6 +37,7 @@ def generate_client_id(user: User, device_name: str) -> str: """Generate client ID for uploaded files.""" + logger.debug(f"Generating client ID - user.id={user.id}, type={type(user.id)}") user_id_suffix = str(user.id)[-6:] return f"{user_id_suffix}-{device_name}" @@ -38,8 +46,6 @@ async def upload_and_process_audio_files( user: User, files: list[UploadFile], device_name: str = "upload", - auto_generate_client: bool = True, - folder: str = None, source: str = "upload" ) -> dict: """ @@ -54,8 +60,7 @@ async def upload_and_process_audio_files( user: Authenticated user files: List of uploaded audio files device_name: Device identifier - auto_generate_client: Whether to auto-generate client ID - folder: Optional subfolder for audio storage (e.g., 'fixtures') + source: Source of the upload (e.g., 'upload', 'gdrive') """ try: if not files: @@ -83,36 +88,23 @@ async def upload_and_process_audio_files( content = await 
file.read() - # Generate audio UUID and timestamp + # Track external source for deduplication (Google Drive, etc.) + external_source_id = None + external_source_type = None if source == "gdrive": - audio_uuid = getattr(file, "audio_uuid", None) - if not audio_uuid: - audio_logger.error(f"Missing audio_uuid for gdrive file: {file.filename}") - audio_uuid = str(uuid.uuid4()) - else: - audio_uuid = str(uuid.uuid4()) + external_source_id = getattr(file, "file_id", None) or getattr(file, "audio_uuid", None) + external_source_type = "gdrive" + if not external_source_id: + audio_logger.warning(f"Missing file_id for gdrive file: {file.filename}") timestamp = int(time.time() * 1000) - # Determine output directory (with optional subfolder) - from advanced_omi_backend.config import CHUNK_DIR - if folder: - chunk_dir = CHUNK_DIR / folder - chunk_dir.mkdir(parents=True, exist_ok=True) - else: - chunk_dir = CHUNK_DIR - - # Validate, write audio file and create AudioSession (all in one) + # Validate and prepare audio (read format from WAV file) try: - relative_audio_path, file_path, duration = await write_audio_file( - raw_audio_data=content, - audio_uuid=audio_uuid, - source=source, - client_id=client_id, - user_id=user.user_id, - user_email=user.email, - timestamp=timestamp, - chunk_dir=chunk_dir, - validate=True, # Validate WAV format, convert stereoβ†’mono + audio_data, sample_rate, sample_width, channels, duration = await validate_and_prepare_audio( + audio_data=content, + expected_sample_rate=16000, # Expecting 16kHz + convert_to_mono=True, # Convert stereo to mono + auto_resample=True # Auto-resample if sample rate doesn't match ) except AudioValidationError as e: processed_files.append({ @@ -123,7 +115,7 @@ async def upload_and_process_audio_files( continue audio_logger.info( - f"πŸ“Š {file.filename}: {duration:.1f}s β†’ {relative_audio_path}" + f"πŸ“Š {file.filename}: {duration:.1f}s ({sample_rate}Hz, {channels}ch, {sample_width} bytes/sample)" ) # Create conversation 
immediately for uploaded files (conversation_id auto-generated) @@ -133,45 +125,112 @@ async def upload_and_process_audio_files( title = file.filename.rsplit('.', 1)[0][:50] if file.filename else "Uploaded Audio" conversation = create_conversation( - audio_uuid=audio_uuid, user_id=user.user_id, client_id=client_id, title=title, - summary="Processing uploaded audio file..." + summary="Processing uploaded audio file...", + external_source_id=external_source_id, + external_source_type=external_source_type, ) - # Use the relative path returned by write_audio_file (already includes folder prefix if applicable) - conversation.audio_path = relative_audio_path await conversation.insert() conversation_id = conversation.conversation_id # Get the auto-generated ID audio_logger.info(f"πŸ“ Created conversation {conversation_id} for uploaded file") - # Enqueue post-conversation processing job chain - from advanced_omi_backend.controllers.queue_controller import start_post_conversation_jobs + # Convert audio directly to MongoDB chunks + try: + num_chunks = await convert_audio_to_chunks( + conversation_id=conversation_id, + audio_data=audio_data, + sample_rate=sample_rate, + channels=channels, + sample_width=sample_width, + ) + audio_logger.info( + f"πŸ“¦ Converted uploaded file to {num_chunks} MongoDB chunks " + f"(conversation {conversation_id[:12]})" + ) + except ValueError as val_error: + # Handle validation errors (e.g., file too long) + audio_logger.error(f"Audio validation failed: {val_error}") + processed_files.append({ + "filename": file.filename, + "status": "error", + "error": str(val_error), + }) + # Delete the conversation since it won't have audio chunks + await conversation.delete() + continue + except Exception as chunk_error: + audio_logger.error( + f"Failed to convert uploaded file to chunks: {chunk_error}", + exc_info=True + ) + processed_files.append({ + "filename": file.filename, + "status": "error", + "error": f"Audio conversion failed: {str(chunk_error)}", 
+ }) + # Delete the conversation since it won't have audio chunks + await conversation.delete() + continue + + # Enqueue batch transcription job first (file uploads need transcription) + version_id = str(uuid.uuid4()) + transcribe_job_id = f"transcribe_{conversation_id[:12]}" + + # Check if transcription provider is available before enqueueing + transcription_job = None + if is_transcription_available(mode="batch"): + transcription_job = transcription_queue.enqueue( + transcribe_full_audio_job, + conversation_id, + version_id, + "batch", # trigger + job_timeout=1800, # 30 minutes + result_ttl=JOB_RESULT_TTL, + job_id=transcribe_job_id, + description=f"Transcribe uploaded file {conversation_id[:8]}", + meta={'conversation_id': conversation_id, 'client_id': client_id} + ) + audio_logger.info(f"πŸ“₯ Enqueued transcription job {transcription_job.id} for uploaded file") + else: + audio_logger.warning( + f"⚠️ Skipping transcription for conversation {conversation_id}: " + "No transcription provider configured" + ) + # Enqueue post-conversation processing job chain (depends on transcription) job_ids = start_post_conversation_jobs( conversation_id=conversation_id, - audio_uuid=audio_uuid, - audio_file_path=file_path, user_id=user.user_id, - post_transcription=True, # Run batch transcription for uploads + transcript_version_id=version_id, # Pass the version_id from transcription job + depends_on_job=transcription_job, # Wait for transcription to complete (or None) client_id=client_id # Pass client_id for UI tracking ) processed_files.append({ "filename": file.filename, - "status": "processing", - "audio_uuid": audio_uuid, + "status": "started", # RQ standard: job has been enqueued "conversation_id": conversation_id, - "transcript_job_id": job_ids['transcription'], + "transcript_job_id": transcription_job.id if transcription_job else None, "speaker_job_id": job_ids['speaker_recognition'], "memory_job_id": job_ids['memory'], "duration_seconds": round(duration, 2), }) + # Build 
job chain description + job_chain = [] + if transcription_job: + job_chain.append(transcription_job.id) + if job_ids['speaker_recognition']: + job_chain.append(job_ids['speaker_recognition']) + if job_ids['memory']: + job_chain.append(job_ids['memory']) + audio_logger.info( f"βœ… Processed {file.filename} β†’ conversation {conversation_id}, " - f"jobs: {job_ids['transcription']} β†’ {job_ids['speaker_recognition']} β†’ {job_ids['memory']}" + f"jobs: {' β†’ '.join(job_chain) if job_chain else 'none'}" ) except (OSError, IOError) as e: @@ -191,20 +250,33 @@ async def upload_and_process_audio_files( "error": str(e), }) - successful_files = [f for f in processed_files if f.get("status") == "processing"] + successful_files = [f for f in processed_files if f.get("status") == "started"] failed_files = [f for f in processed_files if f.get("status") == "error"] - return { + response_body = { "message": f"Uploaded and processing {len(successful_files)} file(s)", "client_id": client_id, "files": processed_files, "summary": { "total": len(files), - "processing": len(successful_files), + "started": len(successful_files), # RQ standard "failed": len(failed_files), }, } + # Return appropriate HTTP status code based on results + if len(failed_files) == len(files): + # ALL files failed - return 400 Bad Request + audio_logger.error(f"All {len(files)} file(s) failed to upload") + return JSONResponse(status_code=400, content=response_body) + elif len(failed_files) > 0: + # SOME files failed (partial success) - return 207 Multi-Status + audio_logger.warning(f"Partial upload: {len(successful_files)} succeeded, {len(failed_files)} failed") + return JSONResponse(status_code=207, content=response_body) + else: + # All files succeeded - return 200 OK + return response_body + except (OSError, IOError) as e: # File system errors during upload handling audio_logger.exception("File I/O error in upload_and_process_audio_files") @@ -217,83 +289,3 @@ async def upload_and_process_audio_files( 
return JSONResponse( status_code=500, content={"error": f"File upload failed: {str(e)}"} ) - - -async def get_conversation_audio_path(conversation_id: str, user: User, cropped: bool = False) -> Path: - """ - Get the file path for a conversation's audio file. - - Args: - conversation_id: The conversation ID - user: The authenticated user - cropped: If True, return cropped audio path; if False, return original audio path - - Returns: - Path object for the audio file - - Raises: - ValueError: If conversation not found, access denied, or audio file not available - """ - # Get conversation by conversation_id (UUID field, not _id) - conversation = await Conversation.find_one(Conversation.conversation_id == conversation_id) - - if not conversation: - raise ValueError("Conversation not found") - - # Check ownership (admins can access all files) - if not user.is_superuser and conversation.user_id != str(user.user_id): - raise ValueError("Access denied") - - # Get the appropriate audio path - audio_path = conversation.cropped_audio_path if cropped else conversation.audio_path - - if not audio_path: - audio_type = "cropped" if cropped else "original" - raise ValueError(f"No {audio_type} audio file available for this conversation") - - # Build full file path - from advanced_omi_backend.app_config import get_audio_chunk_dir - audio_dir = get_audio_chunk_dir() - file_path = audio_dir / audio_path - - # Check if file exists - if not file_path.exists() or not file_path.is_file(): - raise ValueError("Audio file not found on disk") - - return file_path - - -async def get_cropped_audio_info(audio_uuid: str, user: User): - """ - Get audio cropping metadata from the conversation. - - This is an audio service operation that retrieves cropping-related metadata - such as speech segments, cropped audio path, and cropping timestamps. - - Used for: Checking cropping status and retrieving audio processing details. - Works with: Conversation model. 
- """ - try: - # Find the conversation - conversation = await Conversation.find_one(Conversation.audio_uuid == audio_uuid) - if not conversation: - return JSONResponse(status_code=404, content={"error": "Conversation not found"}) - - # Check ownership for non-admin users - if not user.is_superuser: - if conversation.user_id != str(user.user_id): - return JSONResponse(status_code=404, content={"error": "Conversation not found"}) - - return { - "audio_uuid": audio_uuid, - "cropped_audio_path": conversation.cropped_audio_path, - "speech_segments": conversation.speech_segments if hasattr(conversation, 'speech_segments') else [], - "cropped_duration": conversation.cropped_duration if hasattr(conversation, 'cropped_duration') else None, - "cropped_at": conversation.cropped_at if hasattr(conversation, 'cropped_at') else None, - "original_audio_path": conversation.audio_path, - } - - except Exception as e: - # Database or unexpected errors when fetching audio metadata - audio_logger.exception("Error fetching cropped audio info") - return JSONResponse(status_code=500, content={"error": "Error fetching cropped audio info"}) diff --git a/backends/advanced/src/advanced_omi_backend/controllers/conversation_controller.py b/backends/advanced/src/advanced_omi_backend/controllers/conversation_controller.py index b9533391..c142aeee 100644 --- a/backends/advanced/src/advanced_omi_backend/controllers/conversation_controller.py +++ b/backends/advanced/src/advanced_omi_backend/controllers/conversation_controller.py @@ -4,26 +4,36 @@ import logging import time +import uuid +from datetime import datetime from pathlib import Path -from typing import Optional + +from fastapi.responses import JSONResponse from advanced_omi_backend.client_manager import ( ClientManager, client_belongs_to_user, ) -from advanced_omi_backend.models.audio_file import AudioFile +from advanced_omi_backend.config_loader import get_service_config +from advanced_omi_backend.controllers.queue_controller import ( + 
JOB_RESULT_TTL, + default_queue, + memory_queue, + transcription_queue, +) +from advanced_omi_backend.models.audio_chunk import AudioChunkDocument from advanced_omi_backend.models.conversation import Conversation +from advanced_omi_backend.models.job import JobPriority from advanced_omi_backend.users import User -from fastapi.responses import JSONResponse +from advanced_omi_backend.workers.memory_jobs import ( + enqueue_memory_processing, + process_memory_job, +) +from advanced_omi_backend.workers.speaker_jobs import recognise_speakers_job logger = logging.getLogger(__name__) audio_logger = logging.getLogger("audio_processing") -# Legacy audio_chunks collection is still used by some endpoints (speaker assignment, segment updates) -# But conversation queries now use the Conversation model directly -# Audio cropping operations are handled in audio_controller.py - - async def close_current_conversation(client_id: str, user: User, client_manager: ClientManager): """Close the current conversation for a specific client. 
Users can only close their own conversations.""" # Validate client ownership @@ -99,15 +109,17 @@ async def get_conversation(conversation_id: str, user: User): # Build response with explicit curated fields response = { "conversation_id": conversation.conversation_id, - "audio_uuid": conversation.audio_uuid, "user_id": conversation.user_id, "client_id": conversation.client_id, - "audio_path": conversation.audio_path, - "cropped_audio_path": conversation.cropped_audio_path, + "audio_chunks_count": conversation.audio_chunks_count, + "audio_total_duration": conversation.audio_total_duration, + "audio_compression_ratio": conversation.audio_compression_ratio, "created_at": conversation.created_at.isoformat() if conversation.created_at else None, "deleted": conversation.deleted, "deletion_reason": conversation.deletion_reason, "deleted_at": conversation.deleted_at.isoformat() if conversation.deleted_at else None, + "processing_status": conversation.processing_status, + "always_persist": conversation.always_persist, "end_reason": conversation.end_reason.value if conversation.end_reason else None, "completed_at": conversation.completed_at.isoformat() if conversation.completed_at else None, "title": conversation.title, @@ -123,6 +135,8 @@ async def get_conversation(conversation_id: str, user: User): "active_memory_version": conversation.active_memory_version, "transcript_version_count": conversation.transcript_version_count, "memory_version_count": conversation.memory_version_count, + "active_transcript_version_number": conversation.active_transcript_version_number, + "active_memory_version_number": conversation.active_memory_version_number, } return {"conversation": response} @@ -132,33 +146,48 @@ async def get_conversation(conversation_id: str, user: User): return JSONResponse(status_code=500, content={"error": "Error fetching conversation"}) -async def get_conversations(user: User): +async def get_conversations(user: User, include_deleted: bool = False): """Get 
conversations with speech only (speech-driven architecture).""" try: # Build query based on user permissions using Beanie if not user.is_superuser: # Regular users can only see their own conversations - user_conversations = await Conversation.find( - Conversation.user_id == str(user.user_id) - ).sort(-Conversation.created_at).to_list() + # Filter by deleted status + if not include_deleted: + user_conversations = await Conversation.find( + Conversation.user_id == str(user.user_id), + Conversation.deleted == False + ).sort(-Conversation.created_at).to_list() + else: + user_conversations = await Conversation.find( + Conversation.user_id == str(user.user_id) + ).sort(-Conversation.created_at).to_list() else: # Admins see all conversations - user_conversations = await Conversation.find_all().sort(-Conversation.created_at).to_list() + # Filter by deleted status + if not include_deleted: + user_conversations = await Conversation.find( + Conversation.deleted == False + ).sort(-Conversation.created_at).to_list() + else: + user_conversations = await Conversation.find_all().sort(-Conversation.created_at).to_list() # Build response with explicit curated fields - minimal for list view conversations = [] for conv in user_conversations: conversations.append({ "conversation_id": conv.conversation_id, - "audio_uuid": conv.audio_uuid, "user_id": conv.user_id, "client_id": conv.client_id, - "audio_path": conv.audio_path, - "cropped_audio_path": conv.cropped_audio_path, + "audio_chunks_count": conv.audio_chunks_count, + "audio_total_duration": conv.audio_total_duration, + "audio_compression_ratio": conv.audio_compression_ratio, "created_at": conv.created_at.isoformat() if conv.created_at else None, "deleted": conv.deleted, "deletion_reason": conv.deletion_reason, "deleted_at": conv.deleted_at.isoformat() if conv.deleted_at else None, + "processing_status": conv.processing_status, + "always_persist": conv.always_persist, "title": conv.title, "summary": conv.summary, "detailed_summary": 
conv.detailed_summary, @@ -170,6 +199,8 @@ async def get_conversations(user: User): "memory_count": conv.memory_count, "transcript_version_count": conv.transcript_version_count, "memory_version_count": conv.memory_version_count, + "active_transcript_version_number": conv.active_transcript_version_number, + "active_memory_version_number": conv.active_memory_version_number, }) return {"conversations": conversations} @@ -179,12 +210,85 @@ async def get_conversations(user: User): return JSONResponse(status_code=500, content={"error": "Error fetching conversations"}) -async def delete_conversation(conversation_id: str, user: User): - """Delete a conversation and its associated audio files. Users can only delete their own conversations.""" +async def _soft_delete_conversation(conversation: Conversation, user: User) -> JSONResponse: + """Mark conversation and chunks as deleted (soft delete).""" + conversation_id = conversation.conversation_id + + # Mark conversation as deleted + conversation.deleted = True + conversation.deletion_reason = "user_deleted" + conversation.deleted_at = datetime.utcnow() + await conversation.save() + + logger.info(f"Soft deleted conversation {conversation_id} for user {user.user_id}") + + # Soft delete all associated audio chunks + result = await AudioChunkDocument.find( + AudioChunkDocument.conversation_id == conversation_id, + AudioChunkDocument.deleted == False # Only update non-deleted chunks + ).update_many({ + "$set": { + "deleted": True, + "deleted_at": datetime.utcnow() + } + }) + + deleted_chunks = result.modified_count + logger.info(f"Soft deleted {deleted_chunks} audio chunks for conversation {conversation_id}") + + return JSONResponse( + status_code=200, + content={ + "message": f"Successfully soft deleted conversation '{conversation_id}'", + "deleted_chunks": deleted_chunks, + "conversation_id": conversation_id, + "client_id": conversation.client_id, + "deleted_at": conversation.deleted_at.isoformat() if conversation.deleted_at 
else None + } + ) + + +async def _hard_delete_conversation(conversation: Conversation) -> JSONResponse: + """Permanently delete conversation and chunks (admin only).""" + conversation_id = conversation.conversation_id + client_id = conversation.client_id + + # Delete conversation document + await conversation.delete() + logger.info(f"Hard deleted conversation {conversation_id}") + + # Delete all audio chunks + result = await AudioChunkDocument.find( + AudioChunkDocument.conversation_id == conversation_id + ).delete() + + deleted_chunks = result.deleted_count + logger.info(f"Hard deleted {deleted_chunks} audio chunks for conversation {conversation_id}") + + return JSONResponse( + status_code=200, + content={ + "message": f"Successfully permanently deleted conversation '{conversation_id}'", + "deleted_chunks": deleted_chunks, + "conversation_id": conversation_id, + "client_id": client_id + } + ) + + +async def delete_conversation(conversation_id: str, user: User, permanent: bool = False): + """ + Soft delete a conversation (mark as deleted but keep data). 
+ + Args: + conversation_id: Conversation to delete + user: Requesting user + permanent: If True, permanently delete (admin only) + """ try: # Create masked identifier for logging masked_id = f"{conversation_id[:8]}...{conversation_id[-4:]}" if len(conversation_id) > 12 else "***" - logger.info(f"Attempting to delete conversation: {masked_id}") + logger.info(f"Attempting to {'permanently ' if permanent else ''}delete conversation: {masked_id}") # Find the conversation using Beanie conversation = await Conversation.find_one(Conversation.conversation_id == conversation_id) @@ -208,69 +312,91 @@ async def delete_conversation(conversation_id: str, user: User): } ) - # Get file paths before deletion - audio_path = conversation.audio_path - cropped_audio_path = conversation.cropped_audio_path - audio_uuid = conversation.audio_uuid - client_id = conversation.client_id - - # Delete the conversation from database - await conversation.delete() - logger.info(f"Deleted conversation {conversation_id}") - - # Also delete from legacy AudioFile collection if it exists (backward compatibility) - audio_file = await AudioFile.find_one(AudioFile.audio_uuid == audio_uuid) - if audio_file: - await audio_file.delete() - logger.info(f"Deleted legacy audio file record for {audio_uuid}") - - # Delete associated audio files from disk - deleted_files = [] - if audio_path: - try: - # Construct full path to audio file - full_audio_path = Path("/app/audio_chunks") / audio_path - if full_audio_path.exists(): - full_audio_path.unlink() - deleted_files.append(str(full_audio_path)) - logger.info(f"Deleted audio file: {full_audio_path}") - except Exception as e: - logger.warning(f"Failed to delete audio file {audio_path}: {e}") - - if cropped_audio_path: - try: - # Construct full path to cropped audio file - full_cropped_path = Path("/app/audio_chunks") / cropped_audio_path - if full_cropped_path.exists(): - full_cropped_path.unlink() - deleted_files.append(str(full_cropped_path)) - 
logger.info(f"Deleted cropped audio file: {full_cropped_path}") - except Exception as e: - logger.warning(f"Failed to delete cropped audio file {cropped_audio_path}: {e}") - - logger.info(f"Successfully deleted conversation {conversation_id} for user {user.user_id}") - - # Prepare response message - delete_summary = ["conversation"] - if deleted_files: - delete_summary.append(f"{len(deleted_files)} audio file(s)") + # Hard delete (admin only, permanent flag) + if permanent and user.is_superuser: + return await _hard_delete_conversation(conversation) + + # Soft delete (default) + return await _soft_delete_conversation(conversation, user) + + except Exception as e: + logger.error(f"Error deleting conversation {conversation_id}: {e}") + return JSONResponse( + status_code=500, + content={"error": f"Failed to delete conversation: {str(e)}"} + ) + + +async def restore_conversation(conversation_id: str, user: User) -> JSONResponse: + """ + Restore a soft-deleted conversation. + + Args: + conversation_id: Conversation to restore + user: Requesting user + """ + try: + conversation = await Conversation.find_one( + Conversation.conversation_id == conversation_id + ) + + if not conversation: + return JSONResponse( + status_code=404, + content={"error": "Conversation not found"} + ) + + # Permission check + if not user.is_superuser and conversation.user_id != str(user.user_id): + return JSONResponse( + status_code=403, + content={"error": "Access denied"} + ) + + if not conversation.deleted: + return JSONResponse( + status_code=400, + content={"error": "Conversation is not deleted"} + ) + + # Restore conversation + conversation.deleted = False + conversation.deletion_reason = None + conversation.deleted_at = None + await conversation.save() + + # Restore audio chunks + result = await AudioChunkDocument.find( + AudioChunkDocument.conversation_id == conversation_id, + AudioChunkDocument.deleted == True + ).update_many({ + "$set": { + "deleted": False, + "deleted_at": None + } + 
}) + + restored_chunks = result.modified_count + + logger.info( + f"Restored conversation {conversation_id} " + f"({restored_chunks} chunks) for user {user.user_id}" + ) return JSONResponse( status_code=200, content={ - "message": f"Successfully deleted {', '.join(delete_summary)} '{conversation_id}'", - "deleted_files": deleted_files, - "client_id": client_id, + "message": f"Successfully restored conversation '{conversation_id}'", + "restored_chunks": restored_chunks, "conversation_id": conversation_id, - "audio_uuid": audio_uuid } ) except Exception as e: - logger.error(f"Error deleting conversation {conversation_id}: {e}") + logger.error(f"Error restoring conversation {conversation_id}: {e}") return JSONResponse( status_code=500, - content={"error": f"Failed to delete conversation: {str(e)}"} + content={"error": f"Failed to restore conversation: {str(e)}"} ) @@ -286,108 +412,85 @@ async def reprocess_transcript(conversation_id: str, user: User): if not user.is_superuser and conversation_model.user_id != str(user.user_id): return JSONResponse(status_code=403, content={"error": "Access forbidden. 
You can only reprocess your own conversations."}) - # Get audio_uuid and file path from conversation - audio_uuid = conversation_model.audio_uuid - audio_path = conversation_model.audio_path - - if not audio_path: - return JSONResponse( - status_code=400, content={"error": "No audio file found for this conversation"} - ) - - # Check if file exists - try multiple possible locations - possible_paths = [ - Path("/app/audio_chunks") / audio_path, - Path(audio_path), # fallback to relative path - ] - - full_audio_path = None - for path in possible_paths: - if path.exists(): - full_audio_path = path - break + # Get audio_uuid from conversation + # Validate audio chunks exist in MongoDB + chunks = await AudioChunkDocument.find( + AudioChunkDocument.conversation_id == conversation_id + ).to_list() - if not full_audio_path: + if not chunks: return JSONResponse( - status_code=422, + status_code=404, content={ - "error": "Audio file not found on disk", - "details": f"Conversation exists but audio file '{audio_path}' is missing from expected locations", - "searched_paths": [str(p) for p in possible_paths] + "error": "No audio data found for this conversation", + "details": f"Conversation '{conversation_id}' exists but has no audio chunks in MongoDB" } ) # Create new transcript version ID - import uuid version_id = str(uuid.uuid4()) - # Enqueue job chain with RQ (transcription -> speaker recognition -> cropping -> memory) - from advanced_omi_backend.workers.transcription_jobs import transcribe_full_audio_job - from advanced_omi_backend.workers.speaker_jobs import recognise_speakers_job - from advanced_omi_backend.workers.audio_jobs import process_cropping_job - from advanced_omi_backend.workers.memory_jobs import process_memory_job - from advanced_omi_backend.controllers.queue_controller import transcription_queue, memory_queue, default_queue, JOB_RESULT_TTL + # Enqueue job chain with RQ (transcription -> speaker recognition -> memory) + from 
advanced_omi_backend.workers.transcription_jobs import ( + transcribe_full_audio_job, + ) - # Job 1: Transcribe audio to text + # Job 1: Transcribe audio to text (reconstructs from MongoDB chunks) transcript_job = transcription_queue.enqueue( transcribe_full_audio_job, conversation_id, - audio_uuid, - str(full_audio_path), version_id, "reprocess", job_timeout=600, result_ttl=JOB_RESULT_TTL, job_id=f"reprocess_{conversation_id[:8]}", description=f"Transcribe audio for {conversation_id[:8]}", - meta={'audio_uuid': audio_uuid, 'conversation_id': conversation_id} + meta={'conversation_id': conversation_id} ) logger.info(f"πŸ“₯ RQ: Enqueued transcription job {transcript_job.id}") - # Job 2: Recognize speakers (depends on transcription) - speaker_job = transcription_queue.enqueue( - recognise_speakers_job, - conversation_id, - version_id, - str(full_audio_path), - "", # transcript_text - will be read from DB - [], # words - will be read from DB - depends_on=transcript_job, - job_timeout=600, - result_ttl=JOB_RESULT_TTL, - job_id=f"speaker_{conversation_id[:8]}", - description=f"Recognize speakers for {conversation_id[:8]}", - meta={'audio_uuid': audio_uuid, 'conversation_id': conversation_id} - ) - logger.info(f"πŸ“₯ RQ: Enqueued speaker recognition job {speaker_job.id} (depends on {transcript_job.id})") - - # Job 3: Audio cropping (depends on speaker recognition) - cropping_job = default_queue.enqueue( - process_cropping_job, - conversation_id, - str(full_audio_path), - depends_on=speaker_job, - job_timeout=300, - result_ttl=JOB_RESULT_TTL, - job_id=f"crop_{conversation_id[:8]}", - description=f"Crop audio for {conversation_id[:8]}", - meta={'audio_uuid': audio_uuid, 'conversation_id': conversation_id} - ) - logger.info(f"πŸ“₯ RQ: Enqueued audio cropping job {cropping_job.id} (depends on {speaker_job.id})") + # Check if speaker recognition is enabled + speaker_config = get_service_config('speaker_recognition') + speaker_enabled = speaker_config.get('enabled', True) # 
Default to True for backward compatibility + + # Job 2: Recognize speakers (conditional - only if enabled) + speaker_dependency = transcript_job # Start with transcription job + speaker_job = None + + if speaker_enabled: + speaker_job = transcription_queue.enqueue( + recognise_speakers_job, + conversation_id, + version_id, + depends_on=transcript_job, + job_timeout=600, + result_ttl=JOB_RESULT_TTL, + job_id=f"speaker_{conversation_id[:8]}", + description=f"Recognize speakers for {conversation_id[:8]}", + meta={'conversation_id': conversation_id} + ) + speaker_dependency = speaker_job # Chain for next job + logger.info(f"πŸ“₯ RQ: Enqueued speaker recognition job {speaker_job.id} (depends on {transcript_job.id})") + else: + logger.info(f"⏭️ Speaker recognition disabled, skipping speaker job for conversation {conversation_id[:8]}") - # Job 4: Extract memories (depends on cropping) + # Job 3: Extract memories + # Depends on speaker job if it was created, otherwise depends on transcription # Note: redis_client is injected by @async_job decorator, don't pass it directly memory_job = memory_queue.enqueue( process_memory_job, conversation_id, - depends_on=cropping_job, + depends_on=speaker_dependency, # Either speaker_job or transcript_job job_timeout=1800, result_ttl=JOB_RESULT_TTL, job_id=f"memory_{conversation_id[:8]}", description=f"Extract memories for {conversation_id[:8]}", - meta={'audio_uuid': audio_uuid, 'conversation_id': conversation_id} + meta={'conversation_id': conversation_id} ) - logger.info(f"πŸ“₯ RQ: Enqueued memory job {memory_job.id} (depends on {cropping_job.id})") + if speaker_job: + logger.info(f"πŸ“₯ RQ: Enqueued memory job {memory_job.id} (depends on speaker job {speaker_job.id})") + else: + logger.info(f"πŸ“₯ RQ: Enqueued memory job {memory_job.id} (depends on transcript job {transcript_job.id})") job = transcript_job # For backward compatibility with return value logger.info(f"Created transcript reprocessing job {job.id} (version: {version_id}) 
for conversation {conversation_id}") @@ -439,12 +542,9 @@ async def reprocess_memory(conversation_id: str, transcript_version_id: str, use ) # Create new memory version ID - import uuid version_id = str(uuid.uuid4()) # Enqueue memory processing job with RQ (RQ handles job tracking) - from advanced_omi_backend.workers.memory_jobs import enqueue_memory_processing - from advanced_omi_backend.models.job import JobPriority job = enqueue_memory_processing( client_id=conversation_model.client_id, @@ -469,6 +569,172 @@ async def reprocess_memory(conversation_id: str, transcript_version_id: str, use return JSONResponse(status_code=500, content={"error": "Error starting memory reprocessing"}) +async def reprocess_speakers( + conversation_id: str, + transcript_version_id: str, + user: User +): + """ + Reprocess speaker identification for a specific transcript version. + Users can only reprocess their own conversations. + + Creates NEW transcript version with same text/words but re-identified speakers. + Automatically chains memory reprocessing since speaker attribution affects meaning. + """ + try: + # 1. Find conversation and validate ownership + conversation_model = await Conversation.find_one( + Conversation.conversation_id == conversation_id + ) + if not conversation_model: + return JSONResponse( + status_code=404, + content={"error": "Conversation not found"} + ) + + # Check ownership for non-admin users + if not user.is_superuser and conversation_model.user_id != str(user.user_id): + return JSONResponse( + status_code=403, + content={"error": "Access forbidden. You can only reprocess your own conversations."} + ) + + # 2. 
Resolve source transcript version ID (handle "active" special case) + source_version_id = transcript_version_id + if source_version_id == "active": + active_version_id = conversation_model.active_transcript_version + if not active_version_id: + return JSONResponse( + status_code=404, + content={"error": "No active transcript version found"} + ) + source_version_id = active_version_id + + # 3. Find and validate the source transcript version + source_version = None + for version in conversation_model.transcript_versions: + if version.version_id == source_version_id: + source_version = version + break + + if not source_version: + return JSONResponse( + status_code=404, + content={"error": f"Transcript version '{source_version_id}' not found"} + ) + + # 4. Validate transcript has content and words + if not source_version.transcript: + return JSONResponse( + status_code=400, + content={"error": "Cannot re-diarize empty transcript. Transcript version has no text."} + ) + + if not source_version.words: + return JSONResponse( + status_code=400, + content={"error": "Cannot re-diarize transcript without word timings. Words are required for diarization."} + ) + + # 5. Check if speaker recognition is enabled + speaker_config = get_service_config('speaker_recognition') + if not speaker_config.get('enabled', True): + return JSONResponse( + status_code=400, + content={ + "error": "Speaker recognition is disabled", + "details": "Enable speaker service in config to use this feature" + } + ) + + # 6. 
Create NEW transcript version (copy text/words, empty segments) + new_version_id = str(uuid.uuid4()) + + # Add new version with copied text/words but empty segments + # Speaker job will populate segments with re-identified speakers + conversation_model.add_transcript_version( + version_id=new_version_id, + transcript=source_version.transcript, # COPY transcript text + words=source_version.words, # COPY word timings + segments=[], # Empty - will be populated by speaker job + provider=source_version.provider, + model=source_version.model, + processing_time_seconds=None, # Will be updated by job + metadata={ + "reprocessing_type": "speaker_diarization", + "source_version_id": source_version_id, + "trigger": "manual_reprocess" + }, + set_as_active=True # Set new version as active + ) + + # Save conversation with new version + await conversation_model.save() + + logger.info( + f"Created new transcript version {new_version_id} from source {source_version_id} " + f"for conversation {conversation_id}" + ) + + # 7. Enqueue speaker recognition job with NEW version_id + speaker_job = transcription_queue.enqueue( + recognise_speakers_job, + conversation_id, + new_version_id, # NEW version (not source) + job_timeout=1200, # 20 minutes + result_ttl=JOB_RESULT_TTL, + job_id=f"reprocess_speaker_{conversation_id[:12]}", + description=f"Re-diarize speakers for {conversation_id[:8]}", + meta={ + 'conversation_id': conversation_id, + 'version_id': new_version_id, + 'source_version_id': source_version_id, + 'trigger': 'reprocess' + } + ) + + logger.info( + f"Enqueued speaker reprocessing job {speaker_job.id} " + f"for new version {new_version_id}" + ) + + # 8. 
Chain memory reprocessing (speaker changes affect memory context) + memory_job = memory_queue.enqueue( + process_memory_job, + conversation_id, + depends_on=speaker_job, + job_timeout=1800, # 30 minutes + result_ttl=JOB_RESULT_TTL, + job_id=f"memory_{conversation_id[:12]}", + description=f"Extract memories for {conversation_id[:8]}", + meta={ + 'conversation_id': conversation_id, + 'trigger': 'reprocess_after_speaker' + } + ) + + logger.info( + f"Chained memory reprocessing job {memory_job.id} " + f"after speaker job {speaker_job.id}" + ) + + # 9. Return job information + return JSONResponse(content={ + "message": "Speaker reprocessing started", + "job_id": speaker_job.id, + "version_id": new_version_id, # NEW version ID + "source_version_id": source_version_id, # Original version used as source + "status": "queued" + }) + + except Exception as e: + logger.error(f"Error starting speaker reprocessing: {e}") + return JSONResponse( + status_code=500, + content={"error": "Error starting speaker reprocessing"} + ) + + async def activate_transcript_version(conversation_id: str, version_id: str, user: User): """Activate a specific transcript version. Users can only modify their own conversations.""" try: diff --git a/backends/advanced/src/advanced_omi_backend/controllers/memory_controller.py b/backends/advanced/src/advanced_omi_backend/controllers/memory_controller.py index f52167de..5abf4b36 100644 --- a/backends/advanced/src/advanced_omi_backend/controllers/memory_controller.py +++ b/backends/advanced/src/advanced_omi_backend/controllers/memory_controller.py @@ -139,33 +139,6 @@ async def delete_memory(memory_id: str, user: User): ) -async def get_memories_unfiltered(user: User, limit: int, user_id: Optional[str] = None): - """Get all memories including fallback transcript memories (for debugging). 
Users see only their own memories, admins can see all or filter by user.""" - try: - memory_service = get_memory_service() - - # Determine which user's memories to fetch - target_user_id = user.user_id - if user.is_superuser and user_id: - target_user_id = user_id - - # Execute memory retrieval directly (now async) - memories = await memory_service.get_all_memories_unfiltered(target_user_id, limit) - - return { - "memories": memories, - "count": len(memories), - "user_id": target_user_id, - "includes_fallback": True, - } - - except Exception as e: - audio_logger.error(f"Error fetching unfiltered memories: {e}", exc_info=True) - return JSONResponse( - status_code=500, content={"message": f"Error fetching unfiltered memories: {str(e)}"} - ) - - async def add_memory(content: str, user: User, source_id: Optional[str] = None): """Add a memory directly from content text. Extracts structured memories from the provided content.""" try: diff --git a/backends/advanced/src/advanced_omi_backend/controllers/queue_controller.py b/backends/advanced/src/advanced_omi_backend/controllers/queue_controller.py index 91773756..2d0577e7 100644 --- a/backends/advanced/src/advanced_omi_backend/controllers/queue_controller.py +++ b/backends/advanced/src/advanced_omi_backend/controllers/queue_controller.py @@ -17,11 +17,12 @@ import redis from rq import Queue, Worker -from rq.job import Job +from rq.job import Job, JobStatus from rq.registry import ScheduledJobRegistry, DeferredJobRegistry from advanced_omi_backend.models.job import JobPriority from advanced_omi_backend.models.conversation import Conversation +from advanced_omi_backend.config_loader import get_service_config logger = logging.getLogger(__name__) @@ -29,6 +30,52 @@ REDIS_URL = os.getenv("REDIS_URL", "redis://localhost:6379/0") redis_conn = redis.from_url(REDIS_URL) + +def get_job_status_from_rq(job: Job) -> str: + """ + Get job status using RQ's native method. 
+ + Uses job.get_status() which is the Redis Queue standard approach. + Returns RQ's standard status names. + + Returns one of: queued, started, finished, failed, deferred, scheduled, canceled, stopped + + Raises: + RuntimeError: If job status is unexpected (should never happen with RQ's method) + """ + rq_status = job.get_status() + + # RQ returns status as JobStatus enum or string + # Convert to string if it's an enum + if isinstance(rq_status, JobStatus): + status_str = rq_status.value + else: + status_str = str(rq_status) + + # Validate it's a known RQ status + valid_statuses = { + JobStatus.QUEUED.value, + JobStatus.STARTED.value, + JobStatus.FINISHED.value, + JobStatus.FAILED.value, + JobStatus.DEFERRED.value, + JobStatus.SCHEDULED.value, + JobStatus.CANCELED.value, + JobStatus.STOPPED.value, + } + + if status_str not in valid_statuses: + logger.error( + f"Job {job.id} has unexpected RQ status: {status_str}. " + f"This indicates RQ library added a new status we don't know about." + ) + raise RuntimeError( + f"Job {job.id} has unknown RQ status: {status_str}. " + f"Please update get_job_status_from_rq() to handle this new status." 
+ ) + + return status_str + # Queue name constants TRANSCRIPTION_QUEUE = "transcription" MEMORY_QUEUE = "memory" @@ -60,34 +107,34 @@ def get_queue(queue_name: str = DEFAULT_QUEUE) -> Queue: def get_job_stats() -> Dict[str, Any]: - """Get statistics about jobs in all queues matching frontend expectations.""" + """Get statistics about jobs in all queues using RQ standard status names.""" total_jobs = 0 queued_jobs = 0 - processing_jobs = 0 - completed_jobs = 0 + started_jobs = 0 # RQ standard: "started" not "processing" + finished_jobs = 0 # RQ standard: "finished" not "completed" failed_jobs = 0 - cancelled_jobs = 0 + canceled_jobs = 0 # RQ standard: "canceled" not "cancelled" deferred_jobs = 0 # Jobs waiting for dependencies (depends_on) for queue_name in QUEUE_NAMES: queue = get_queue(queue_name) queued_jobs += len(queue) - processing_jobs += len(queue.started_job_registry) - completed_jobs += len(queue.finished_job_registry) + started_jobs += len(queue.started_job_registry) + finished_jobs += len(queue.finished_job_registry) failed_jobs += len(queue.failed_job_registry) - cancelled_jobs += len(queue.canceled_job_registry) + canceled_jobs += len(queue.canceled_job_registry) deferred_jobs += len(queue.deferred_job_registry) - total_jobs = queued_jobs + processing_jobs + completed_jobs + failed_jobs + cancelled_jobs + deferred_jobs + total_jobs = queued_jobs + started_jobs + finished_jobs + failed_jobs + canceled_jobs + deferred_jobs return { "total_jobs": total_jobs, "queued_jobs": queued_jobs, - "processing_jobs": processing_jobs, - "completed_jobs": completed_jobs, + "started_jobs": started_jobs, + "finished_jobs": finished_jobs, "failed_jobs": failed_jobs, - "cancelled_jobs": cancelled_jobs, + "canceled_jobs": canceled_jobs, "deferred_jobs": deferred_jobs, "timestamp": datetime.utcnow().isoformat() } @@ -113,24 +160,32 @@ def get_jobs( Returns: Dict with jobs list and pagination metadata matching frontend expectations """ + logger.info(f"πŸ” DEBUG get_jobs: 
Filtering - queue_name={queue_name}, job_type={job_type}, client_id={client_id}") all_jobs = [] + seen_job_ids = set() # Track which job IDs we've already processed to avoid duplicates queues_to_check = [queue_name] if queue_name else QUEUE_NAMES + logger.info(f"πŸ” DEBUG get_jobs: Checking queues: {queues_to_check}") for qname in queues_to_check: queue = get_queue(qname) - # Collect jobs from all registries + # Collect jobs from all registries (using RQ standard status names) registries = [ (queue.job_ids, "queued"), - (queue.started_job_registry.get_job_ids(), "processing"), - (queue.finished_job_registry.get_job_ids(), "completed"), + (queue.started_job_registry.get_job_ids(), "started"), # RQ standard, not "processing" + (queue.finished_job_registry.get_job_ids(), "finished"), # RQ standard, not "completed" (queue.failed_job_registry.get_job_ids(), "failed"), (queue.deferred_job_registry.get_job_ids(), "deferred"), # Jobs waiting for dependencies ] for job_ids, status in registries: for job_id in job_ids: + # Skip if we've already processed this job_id (prevents duplicates across registries) + if job_id in seen_job_ids: + continue + seen_job_ids.add(job_id) + try: job = Job.fetch(job_id, connection=redis_conn) @@ -140,16 +195,23 @@ def get_jobs( # Extract just the function name (e.g., "listen_for_speech_job" from "module.listen_for_speech_job") func_name = job.func_name.split('.')[-1] if job.func_name else "unknown" + # Debug: Log job details before filtering + logger.debug(f"πŸ” DEBUG get_jobs: Job {job_id} - func_name={func_name}, full_func_name={job.func_name}, meta_client_id={job.meta.get('client_id', '') if job.meta else ''}, status={status}") + # Apply job_type filter if job_type and job_type not in func_name: + logger.debug(f"πŸ” DEBUG get_jobs: Filtered out {job_id} - job_type '{job_type}' not in func_name '{func_name}'") continue # Apply client_id filter (partial match in meta) if client_id: job_client_id = job.meta.get("client_id", "") if job.meta 
else "" if client_id not in job_client_id: + logger.debug(f"πŸ” DEBUG get_jobs: Filtered out {job_id} - client_id '{client_id}' not in job_client_id '{job_client_id}'") continue + logger.debug(f"πŸ” DEBUG get_jobs: Including job {job_id} in results") + all_jobs.append({ "job_id": job.id, "job_type": func_name, @@ -182,6 +244,8 @@ def get_jobs( paginated_jobs = all_jobs[offset:offset + limit] has_more = (offset + limit) < total_jobs + logger.info(f"πŸ” DEBUG get_jobs: Found {total_jobs} matching jobs (returning {len(paginated_jobs)} after pagination)") + return { "jobs": paginated_jobs, "pagination": { @@ -193,15 +257,15 @@ def get_jobs( } -def all_jobs_complete_for_session(session_id: str) -> bool: +def all_jobs_complete_for_client(client_id: str) -> bool: """ - Check if all jobs associated with a session are in terminal states. + Check if all jobs associated with a client are in terminal states. - Only checks jobs with audio_uuid in job.meta (no backward compatibility). + Checks jobs with client_id in job.meta. Traverses dependency chains to include dependent jobs. Args: - session_id: The audio_uuid (session ID) to check jobs for + client_id: The client device identifier to check jobs for Returns: True if all jobs are complete (or no jobs found), False if any job is still processing @@ -230,7 +294,7 @@ def is_job_complete(job): return True - # Find all jobs for this session + # Find all jobs for this client all_queues = [transcription_queue, memory_queue, audio_queue, default_queue] for queue in all_queues: registries = [ @@ -248,8 +312,8 @@ def is_job_complete(job): try: job = Job.fetch(job_id, connection=redis_conn) - # Only check jobs with audio_uuid in meta - if job.meta and job.meta.get('audio_uuid') == session_id: + # Only check jobs with client_id in meta + if job.meta and job.meta.get('client_id') == client_id: if not is_job_complete(job): return False except Exception as e: @@ -271,14 +335,16 @@ def start_streaming_jobs( 2. 
Audio persistence job - writes audio chunks to WAV file (file rotation per conversation) Args: - session_id: Stream session ID (audio_uuid) + session_id: Stream session ID (equals client_id for streaming) user_id: User identifier client_id: Client identifier Returns: Dict with job IDs: {'speech_detection': job_id, 'audio_persistence': job_id} - Note: user_email is fetched from the database when needed. + Note: + - user_email is fetched from the database when needed. + - always_persist setting is read from global config by the audio persistence job. """ from advanced_omi_backend.workers.transcription_jobs import stream_speech_detection_job from advanced_omi_backend.workers.audio_jobs import audio_streaming_persistence_job @@ -290,12 +356,22 @@ def start_streaming_jobs( user_id, client_id, job_timeout=86400, # 24 hours for all-day sessions - result_ttl=JOB_RESULT_TTL, + ttl=None, # No pre-run expiry (job can wait indefinitely in queue) + result_ttl=JOB_RESULT_TTL, # Cleanup AFTER completion + failure_ttl=86400, # Cleanup failed jobs after 24h job_id=f"speech-detect_{session_id[:12]}", description=f"Listening for speech...", - meta={'audio_uuid': session_id, 'client_id': client_id, 'session_level': True} + meta={'client_id': client_id, 'session_level': True} ) + # Log job enqueue with TTL information for debugging + actual_ttl = redis_conn.ttl(f"rq:job:{speech_job.id}") logger.info(f"πŸ“₯ RQ: Enqueued speech detection job {speech_job.id}") + logger.info( + f"πŸ” Job enqueue details: ID={speech_job.id}, " + f"job_timeout={speech_job.timeout}, result_ttl={speech_job.result_ttl}, " + f"failure_ttl={speech_job.failure_ttl}, redis_key_ttl={actual_ttl}, " + f"queue_length={transcription_queue.count}, client_id={client_id}" + ) # Store job ID for cleanup (keyed by client_id for easy WebSocket cleanup) try: @@ -307,18 +383,29 @@ def start_streaming_jobs( # Enqueue audio persistence job on dedicated audio queue # NOTE: This job handles file rotation for multiple conversations 
automatically # Runs for entire session, not tied to individual conversations + # The job reads always_persist_enabled from global config internally audio_job = audio_queue.enqueue( audio_streaming_persistence_job, session_id, user_id, client_id, job_timeout=86400, # 24 hours for all-day sessions - result_ttl=JOB_RESULT_TTL, + ttl=None, # No pre-run expiry (job can wait indefinitely in queue) + result_ttl=JOB_RESULT_TTL, # Cleanup AFTER completion + failure_ttl=86400, # Cleanup failed jobs after 24h job_id=f"audio-persist_{session_id[:12]}", description=f"Audio persistence for session {session_id[:12]}", - meta={'audio_uuid': session_id, 'session_level': True} # Mark as session-level job + meta={'client_id': client_id, 'session_level': True} # Mark as session-level job ) + # Log job enqueue with TTL information for debugging + actual_ttl = redis_conn.ttl(f"rq:job:{audio_job.id}") logger.info(f"πŸ“₯ RQ: Enqueued audio persistence job {audio_job.id} on audio queue") + logger.info( + f"πŸ” Job enqueue details: ID={audio_job.id}, " + f"job_timeout={audio_job.timeout}, result_ttl={audio_job.result_ttl}, " + f"failure_ttl={audio_job.failure_ttl}, redis_key_ttl={actual_ttl}, " + f"queue_length={audio_queue.count}, client_id={client_id}" + ) return { 'speech_detection': speech_job.id, @@ -328,151 +415,169 @@ def start_streaming_jobs( def start_post_conversation_jobs( conversation_id: str, - audio_uuid: str, - audio_file_path: str, user_id: str, - post_transcription: bool = True, transcript_version_id: Optional[str] = None, depends_on_job = None, - client_id: Optional[str] = None + client_id: Optional[str] = None, + end_reason: str = "file_upload" ) -> Dict[str, str]: """ Start post-conversation processing jobs after conversation is created. This creates the standard processing chain after a conversation is created: - 1. [Optional] Transcription job - Batch transcription (if post_transcription=True) - 2. Audio cropping job - Removes silence from audio - 3. 
Speaker recognition job - Identifies speakers in audio - 4. Memory extraction job - Extracts memories from conversation (parallel) - 5. Title/summary generation job - Generates title and summary (parallel) + 1. Speaker recognition job - Identifies speakers in audio segments + 2. Memory extraction job - Extracts memories from conversation + 3. Title/summary generation job - Generates title and summary + 4. Event dispatch job - Triggers conversation.complete plugins + + Note: Batch transcription removed - streaming conversations use streaming transcript. + For file uploads, batch transcription must be enqueued separately before calling this function. Args: conversation_id: Conversation identifier - audio_uuid: Audio UUID for job tracking - audio_file_path: Path to audio file user_id: User identifier - post_transcription: If True, run batch transcription step (for uploads) - If False, skip transcription (streaming already has it) transcript_version_id: Transcript version ID (auto-generated if None) - depends_on_job: Optional job dependency for cropping job + depends_on_job: Optional job dependency for first job (e.g., transcription for file uploads) + client_id: Client ID for UI tracking + end_reason: Reason conversation ended (e.g., 'file_upload', 'websocket_disconnect', 'user_stopped') Returns: - Dict with job IDs (transcription will be None if post_transcription=False) + Dict with job IDs for speaker_recognition, memory, title_summary, event_dispatch """ - from advanced_omi_backend.workers.transcription_jobs import transcribe_full_audio_job from advanced_omi_backend.workers.speaker_jobs import recognise_speakers_job - from advanced_omi_backend.workers.audio_jobs import process_cropping_job from advanced_omi_backend.workers.memory_jobs import process_memory_job - from advanced_omi_backend.workers.conversation_jobs import generate_title_summary_job + from advanced_omi_backend.workers.conversation_jobs import generate_title_summary_job, 
dispatch_conversation_complete_event_job version_id = transcript_version_id or str(uuid.uuid4()) # Build job metadata (include client_id if provided for UI tracking) - job_meta = {'audio_uuid': audio_uuid, 'conversation_id': conversation_id} + job_meta = {'conversation_id': conversation_id} if client_id: job_meta['client_id'] = client_id - # Step 1: Batch transcription job (ALWAYS run to get correct conversation-relative timestamps) - # Even for streaming, we need batch transcription before cropping to fix cumulative timestamps - transcribe_job_id = f"transcribe_{conversation_id[:12]}" - logger.info(f"πŸ” DEBUG: Creating transcribe job with job_id={transcribe_job_id}, conversation_id={conversation_id[:12]}, audio_uuid={audio_uuid[:12]}") - - transcription_job = transcription_queue.enqueue( - transcribe_full_audio_job, - conversation_id, - audio_uuid, - audio_file_path, - version_id, - "batch", # trigger - job_timeout=1800, # 30 minutes - result_ttl=JOB_RESULT_TTL, - depends_on=depends_on_job, - job_id=transcribe_job_id, - description=f"Transcribe conversation {conversation_id[:8]}", - meta=job_meta - ) - logger.info(f"πŸ“₯ RQ: Enqueued transcription job {transcription_job.id}, meta={transcription_job.meta}") - crop_depends_on = transcription_job - - # Step 2: Audio cropping job (depends on transcription if it ran, otherwise depends_on_job) - crop_job_id = f"crop_{conversation_id[:12]}" - logger.info(f"πŸ” DEBUG: Creating crop job with job_id={crop_job_id}, conversation_id={conversation_id[:12]}, audio_uuid={audio_uuid[:12]}") - - cropping_job = default_queue.enqueue( - process_cropping_job, - conversation_id, - audio_file_path, - job_timeout=300, # 5 minutes - result_ttl=JOB_RESULT_TTL, - depends_on=crop_depends_on, - job_id=crop_job_id, - description=f"Crop audio for conversation {conversation_id[:8]}", - meta=job_meta - ) - logger.info(f"πŸ“₯ RQ: Enqueued cropping job {cropping_job.id}, meta={cropping_job.meta}") - - # Speaker recognition depends on cropping - 
speaker_depends_on = cropping_job - - # Step 3: Speaker recognition job - speaker_job_id = f"speaker_{conversation_id[:12]}" - logger.info(f"πŸ” DEBUG: Creating speaker job with job_id={speaker_job_id}, conversation_id={conversation_id[:12]}, audio_uuid={audio_uuid[:12]}") - - speaker_job = transcription_queue.enqueue( - recognise_speakers_job, - conversation_id, - version_id, - audio_file_path, - "", # transcript_text - will be read from DB - [], # words - will be read from DB - job_timeout=1200, # 20 minutes - result_ttl=JOB_RESULT_TTL, - depends_on=speaker_depends_on, - job_id=speaker_job_id, - description=f"Speaker recognition for conversation {conversation_id[:8]}", - meta=job_meta - ) - logger.info(f"πŸ“₯ RQ: Enqueued speaker recognition job {speaker_job.id}, meta={speaker_job.meta} (depends on {speaker_depends_on.id})") - - # Step 4: Memory extraction job (parallel with title/summary) - memory_job_id = f"memory_{conversation_id[:12]}" - logger.info(f"πŸ” DEBUG: Creating memory job with job_id={memory_job_id}, conversation_id={conversation_id[:12]}, audio_uuid={audio_uuid[:12]}") - - memory_job = memory_queue.enqueue( - process_memory_job, - conversation_id, - job_timeout=900, # 15 minutes - result_ttl=JOB_RESULT_TTL, - depends_on=speaker_job, - job_id=memory_job_id, - description=f"Memory extraction for conversation {conversation_id[:8]}", - meta=job_meta - ) - logger.info(f"πŸ“₯ RQ: Enqueued memory extraction job {memory_job.id}, meta={memory_job.meta} (depends on {speaker_job.id})") + # Check if speaker recognition is enabled + speaker_config = get_service_config('speaker_recognition') + speaker_enabled = speaker_config.get('enabled', True) # Default to True for backward compatibility + + # Step 1: Speaker recognition job (conditional - only if enabled) + speaker_dependency = depends_on_job # Start with upstream dependency (transcription if file upload) + speaker_job = None + + if speaker_enabled: + speaker_job_id = f"speaker_{conversation_id[:12]}" + 
logger.info(f"πŸ” DEBUG: Creating speaker job with job_id={speaker_job_id}, conversation_id={conversation_id[:12]}") + + speaker_job = transcription_queue.enqueue( + recognise_speakers_job, + conversation_id, + version_id, + job_timeout=1200, # 20 minutes + result_ttl=JOB_RESULT_TTL, + depends_on=speaker_dependency, + job_id=speaker_job_id, + description=f"Speaker recognition for conversation {conversation_id[:8]}", + meta=job_meta + ) + speaker_dependency = speaker_job # Chain for next jobs + if depends_on_job: + logger.info(f"πŸ“₯ RQ: Enqueued speaker recognition job {speaker_job.id}, meta={speaker_job.meta} (depends on {depends_on_job.id})") + else: + logger.info(f"πŸ“₯ RQ: Enqueued speaker recognition job {speaker_job.id}, meta={speaker_job.meta} (no dependencies, starts immediately)") + else: + logger.info(f"⏭️ Speaker recognition disabled, skipping speaker job for conversation {conversation_id[:8]}") + + # Step 2: Memory extraction job (conditional - only if enabled) + # Check if memory extraction is enabled + memory_config = get_service_config('memory.extraction') + memory_enabled = memory_config.get('enabled', True) # Default to True for backward compatibility + + memory_job = None + if memory_enabled: + # Depends on speaker job if it was created, otherwise depends on upstream (transcription or nothing) + memory_job_id = f"memory_{conversation_id[:12]}" + logger.info(f"πŸ” DEBUG: Creating memory job with job_id={memory_job_id}, conversation_id={conversation_id[:12]}") + + memory_job = memory_queue.enqueue( + process_memory_job, + conversation_id, + job_timeout=900, # 15 minutes + result_ttl=JOB_RESULT_TTL, + depends_on=speaker_dependency, # Either speaker_job or upstream dependency + job_id=memory_job_id, + description=f"Memory extraction for conversation {conversation_id[:8]}", + meta=job_meta + ) + if speaker_job: + logger.info(f"πŸ“₯ RQ: Enqueued memory extraction job {memory_job.id}, meta={memory_job.meta} (depends on speaker job {speaker_job.id})") + 
elif depends_on_job: + logger.info(f"πŸ“₯ RQ: Enqueued memory extraction job {memory_job.id}, meta={memory_job.meta} (depends on {depends_on_job.id})") + else: + logger.info(f"πŸ“₯ RQ: Enqueued memory extraction job {memory_job.id}, meta={memory_job.meta} (no dependencies, starts immediately)") + else: + logger.info(f"⏭️ Memory extraction disabled, skipping memory job for conversation {conversation_id[:8]}") - # Step 5: Title/summary generation job (parallel with memory, independent) - # This ensures conversations always get titles/summaries even if memory job fails + # Step 3: Title/summary generation job + # Depends on speaker job if enabled, otherwise on upstream dependency title_job_id = f"title_summary_{conversation_id[:12]}" - logger.info(f"πŸ” DEBUG: Creating title/summary job with job_id={title_job_id}, conversation_id={conversation_id[:12]}, audio_uuid={audio_uuid[:12]}") + logger.info(f"πŸ” DEBUG: Creating title/summary job with job_id={title_job_id}, conversation_id={conversation_id[:12]}") title_summary_job = default_queue.enqueue( generate_title_summary_job, conversation_id, job_timeout=300, # 5 minutes result_ttl=JOB_RESULT_TTL, - depends_on=speaker_job, # Depends on speaker job, NOT memory job + depends_on=speaker_dependency, # Depends on speaker job if enabled, NOT memory job job_id=title_job_id, description=f"Generate title and summary for conversation {conversation_id[:8]}", meta=job_meta ) - logger.info(f"πŸ“₯ RQ: Enqueued title/summary job {title_summary_job.id}, meta={title_summary_job.meta} (depends on {speaker_job.id})") + if speaker_job: + logger.info(f"πŸ“₯ RQ: Enqueued title/summary job {title_summary_job.id}, meta={title_summary_job.meta} (depends on speaker job {speaker_job.id})") + elif depends_on_job: + logger.info(f"πŸ“₯ RQ: Enqueued title/summary job {title_summary_job.id}, meta={title_summary_job.meta} (depends on {depends_on_job.id})") + else: + logger.info(f"πŸ“₯ RQ: Enqueued title/summary job {title_summary_job.id}, 
meta={title_summary_job.meta} (no dependencies, starts immediately)") + + # Step 5: Dispatch conversation.complete event (runs after both memory and title/summary complete) + # This ensures plugins receive the event after all processing is done + event_job_id = f"event_complete_{conversation_id[:12]}" + logger.info(f"πŸ” DEBUG: Creating conversation complete event job with job_id={event_job_id}, conversation_id={conversation_id[:12]}") + + # Event job depends on memory and title/summary jobs that were actually enqueued + # Build dependency list excluding None values + event_dependencies = [] + if memory_job: + event_dependencies.append(memory_job) + if title_summary_job: + event_dependencies.append(title_summary_job) + + # Enqueue event dispatch job (may have no dependencies if all jobs were skipped) + event_dispatch_job = default_queue.enqueue( + dispatch_conversation_complete_event_job, + conversation_id, + client_id or "", + user_id, + end_reason, # Use the end_reason parameter (defaults to 'file_upload' for backward compatibility) + job_timeout=120, # 2 minutes + result_ttl=JOB_RESULT_TTL, + depends_on=event_dependencies if event_dependencies else None, # Wait for jobs that were enqueued + job_id=event_job_id, + description=f"Dispatch conversation complete event ({end_reason}) for {conversation_id[:8]}", + meta=job_meta + ) + + # Log event dispatch dependencies + if event_dependencies: + dep_ids = [job.id for job in event_dependencies] + logger.info(f"πŸ“₯ RQ: Enqueued conversation complete event job {event_dispatch_job.id}, meta={event_dispatch_job.meta} (depends on {', '.join(dep_ids)})") + else: + logger.info(f"πŸ“₯ RQ: Enqueued conversation complete event job {event_dispatch_job.id}, meta={event_dispatch_job.meta} (no dependencies, starts immediately)") return { - 'cropping': cropping_job.id, - 'transcription': transcription_job.id if transcription_job else None, - 'speaker_recognition': speaker_job.id, - 'memory': memory_job.id, - 'title_summary': 
title_summary_job.id + 'speaker_recognition': speaker_job.id if speaker_job else None, + 'memory': memory_job.id if memory_job else None, + 'title_summary': title_summary_job.id, + 'event_dispatch': event_dispatch_job.id } diff --git a/backends/advanced/src/advanced_omi_backend/controllers/session_controller.py b/backends/advanced/src/advanced_omi_backend/controllers/session_controller.py index a3836898..fe9b87cd 100644 --- a/backends/advanced/src/advanced_omi_backend/controllers/session_controller.py +++ b/backends/advanced/src/advanced_omi_backend/controllers/session_controller.py @@ -9,13 +9,62 @@ import logging import time -from typing import Dict, List, Optional +from typing import Dict, List, Optional, Literal from fastapi.responses import JSONResponse logger = logging.getLogger(__name__) +async def mark_session_complete( + redis_client, + session_id: str, + reason: Literal[ + "websocket_disconnect", + "user_stopped", + "inactivity_timeout", + "max_duration", + "all_jobs_complete" + ], +) -> None: + """ + Single source of truth for marking sessions as complete. + + This function ensures that both 'status' and 'completion_reason' are ALWAYS + set together atomically, preventing race conditions where workers check status + before completion_reason is set. 
+ + Args: + redis_client: Redis async client + session_id: Session UUID + reason: Why the session is completing (enforced by type system) + + Usage: + # WebSocket disconnect + await mark_session_complete(redis, session_id, "websocket_disconnect") + + # User manually stopped + await mark_session_complete(redis, session_id, "user_stopped") + + # Inactivity timeout + await mark_session_complete(redis, session_id, "inactivity_timeout") + + # Max duration reached + await mark_session_complete(redis, session_id, "max_duration") + + # All jobs finished + await mark_session_complete(redis, session_id, "all_jobs_complete") + """ + session_key = f"audio:session:{session_id}" + mark_time = time.time() + await redis_client.hset(session_key, mapping={ + "status": "finished", + "completed_at": str(mark_time), + "completion_reason": reason + }) + logger.info(f"βœ… Session {session_id[:12]} marked finished: {reason} [TIME: {mark_time:.3f}]") + + async def get_session_info(redis_client, session_id: str) -> Optional[Dict]: """ Get detailed information about a specific session. @@ -151,7 +200,7 @@ async def get_streaming_status(request): transcription_queue, memory_queue, default_queue, - all_jobs_complete_for_session + all_jobs_complete_for_client ) try: @@ -181,19 +230,19 @@ async def get_streaming_status(request): # Separate active and completed sessions # Check if all jobs are complete (including failed jobs) - all_jobs_done = all_jobs_complete_for_session(session_id) - - # Session is completed if: - # 1. Redis status says complete/finalized AND all jobs done, OR - # 2. All jobs are done (even if status isn't complete yet) - # This ensures sessions with failed jobs move to completed - if status in ["complete", "completed", "finalized"] or all_jobs_done: + # Note: session_id == client_id in streaming context, but using client_id explicitly + all_jobs_done = all_jobs_complete_for_client(session_obj.get("client_id")) + + # Session is finished if: + # 1. 
Redis status says finished AND all jobs done, OR + # 2. All jobs are done (even if status isn't finished yet) + # This ensures sessions with failed jobs move to finished + if status == "finished" or all_jobs_done: if all_jobs_done: - # All jobs complete - this is truly a completed session - # Update Redis status if it wasn't already marked complete - if status not in ["complete", "completed", "finalized"]: - await redis_client.hset(key, "status", "complete") - logger.info(f"βœ… Marked session {session_id} as complete (all jobs terminal)") + # All jobs finished - this is truly a finished session + # Update Redis status if it wasn't already marked finished + if status != "finished": + await mark_session_complete(redis_client, session_id, "all_jobs_complete") # Get additional session data for completed sessions session_key = f"audio:session:{session_id}" @@ -204,7 +253,7 @@ async def get_streaming_status(request): "client_id": session_obj.get("client_id", ""), "conversation_id": session_data.get(b"conversation_id", b"").decode() if session_data and b"conversation_id" in session_data else None, "has_conversation": bool(session_data and session_data.get(b"conversation_id", b"")), - "action": session_data.get(b"action", b"complete").decode() if session_data and b"action" in session_data else "complete", + "action": session_data.get(b"action", b"finished").decode() if session_data and b"action" in session_data else "finished", "reason": session_data.get(b"reason", b"").decode() if session_data and b"reason" in session_data else "", "completed_at": session_obj.get("last_chunk_at", 0), "audio_file": session_data.get(b"audio_file", b"").decode() if session_data and b"audio_file" in session_data else "", @@ -403,26 +452,26 @@ async def get_streaming_status(request): rq_stats = { "transcription_queue": { "queued": transcription_queue.count, - "processing": len(transcription_queue.started_job_registry), - "completed": len(transcription_queue.finished_job_registry), + "started": 
len(transcription_queue.started_job_registry), + "finished": len(transcription_queue.finished_job_registry), "failed": len(transcription_queue.failed_job_registry), - "cancelled": len(transcription_queue.canceled_job_registry), + "canceled": len(transcription_queue.canceled_job_registry), "deferred": len(transcription_queue.deferred_job_registry) }, "memory_queue": { "queued": memory_queue.count, - "processing": len(memory_queue.started_job_registry), - "completed": len(memory_queue.finished_job_registry), + "started": len(memory_queue.started_job_registry), + "finished": len(memory_queue.finished_job_registry), "failed": len(memory_queue.failed_job_registry), - "cancelled": len(memory_queue.canceled_job_registry), + "canceled": len(memory_queue.canceled_job_registry), "deferred": len(memory_queue.deferred_job_registry) }, "default_queue": { "queued": default_queue.count, - "processing": len(default_queue.started_job_registry), - "completed": len(default_queue.finished_job_registry), + "started": len(default_queue.started_job_registry), + "finished": len(default_queue.finished_job_registry), "failed": len(default_queue.failed_job_registry), - "cancelled": len(default_queue.canceled_job_registry), + "canceled": len(default_queue.canceled_job_registry), "deferred": len(default_queue.deferred_job_registry) } } diff --git a/backends/advanced/src/advanced_omi_backend/controllers/system_controller.py b/backends/advanced/src/advanced_omi_backend/controllers/system_controller.py index aced763f..55a4b43e 100644 --- a/backends/advanced/src/advanced_omi_backend/controllers/system_controller.py +++ b/backends/advanced/src/advanced_omi_backend/controllers/system_controller.py @@ -6,15 +6,22 @@ import os import shutil import time +import warnings from datetime import UTC, datetime +from pathlib import Path import yaml from fastapi import HTTPException from advanced_omi_backend.config import ( - load_diarization_settings_from_file, - save_diarization_settings_to_file, + 
get_diarization_settings as load_diarization_settings, + get_misc_settings as load_misc_settings, + save_misc_settings, ) +from advanced_omi_backend.config import ( + save_diarization_settings, +) +from advanced_omi_backend.config_loader import get_plugins_yml_path from advanced_omi_backend.model_registry import _find_config_path, load_models_config from advanced_omi_backend.models.user import User @@ -22,6 +29,201 @@ audio_logger = logging.getLogger("audio_processing") +async def get_config_diagnostics(): + """ + Get comprehensive configuration diagnostics. + + Returns warnings, errors, and status for all configuration components. + """ + diagnostics = { + "timestamp": datetime.now(UTC).isoformat(), + "overall_status": "healthy", + "issues": [], + "warnings": [], + "info": [], + "components": {} + } + + # Test OmegaConf configuration loading + try: + from advanced_omi_backend.config_loader import load_config + + # Capture warnings during config load + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + config = load_config(force_reload=True) + + # Check for OmegaConf warnings + for warning in w: + warning_msg = str(warning.message) + if "some elements are missing" in warning_msg.lower(): + # Extract the variable name from warning + if "variable '" in warning_msg.lower(): + var_name = warning_msg.split("'")[1] + diagnostics["warnings"].append({ + "component": "OmegaConf", + "severity": "warning", + "message": f"Environment variable '{var_name}' not set (using empty default)", + "resolution": f"Set {var_name} in .env file if needed" + }) + + diagnostics["components"]["omegaconf"] = { + "status": "healthy", + "message": "Configuration loaded successfully" + } + except Exception as e: + diagnostics["overall_status"] = "unhealthy" + diagnostics["issues"].append({ + "component": "OmegaConf", + "severity": "error", + "message": f"Failed to load configuration: {str(e)}", + "resolution": "Check config/defaults.yml and config/config.yml 
syntax" + }) + diagnostics["components"]["omegaconf"] = { + "status": "unhealthy", + "message": str(e) + } + + # Test model registry + try: + from advanced_omi_backend.model_registry import get_models_registry + + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + registry = get_models_registry() + + # Capture model loading warnings + for warning in w: + warning_msg = str(warning.message) + diagnostics["warnings"].append({ + "component": "Model Registry", + "severity": "warning", + "message": warning_msg, + "resolution": "Check model definitions in config/defaults.yml" + }) + + if registry: + diagnostics["components"]["model_registry"] = { + "status": "healthy", + "message": f"Loaded {len(registry.models)} models", + "details": { + "total_models": len(registry.models), + "defaults": dict(registry.defaults) if registry.defaults else {} + } + } + + # Check critical models + stt = registry.get_default("stt") + stt_stream = registry.get_default("stt_stream") + llm = registry.get_default("llm") + + # STT check + if stt: + if stt.api_key: + diagnostics["info"].append({ + "component": "STT (Batch)", + "message": f"Configured: {stt.name} ({stt.model_provider}) - API key present" + }) + else: + diagnostics["warnings"].append({ + "component": "STT (Batch)", + "severity": "warning", + "message": f"{stt.name} ({stt.model_provider}) - No API key configured", + "resolution": "Transcription can fail without API key" + }) + else: + diagnostics["issues"].append({ + "component": "STT (Batch)", + "severity": "error", + "message": "No batch STT model configured", + "resolution": "Set defaults.stt in config.yml" + }) + diagnostics["overall_status"] = "partial" + + # Streaming STT check + if stt_stream: + if stt_stream.api_key: + diagnostics["info"].append({ + "component": "STT (Streaming)", + "message": f"Configured: {stt_stream.name} ({stt_stream.model_provider}) - API key present" + }) + else: + diagnostics["warnings"].append({ + "component": "STT 
(Streaming)", + "severity": "warning", + "message": f"{stt_stream.name} ({stt_stream.model_provider}) - No API key configured", + "resolution": "Real-time transcription can fail without API key" + }) + else: + diagnostics["warnings"].append({ + "component": "STT (Streaming)", + "severity": "warning", + "message": "No streaming STT model configured - streaming worker disabled", + "resolution": "Set defaults.stt_stream in config.yml for WebSocket transcription" + }) + + # LLM check + if llm: + if llm.api_key: + diagnostics["info"].append({ + "component": "LLM", + "message": f"Configured: {llm.name} ({llm.model_provider}) - API key present" + }) + else: + diagnostics["warnings"].append({ + "component": "LLM", + "severity": "warning", + "message": f"{llm.name} ({llm.model_provider}) - No API key configured", + "resolution": "Memory extraction can fail without API key" + }) + + else: + diagnostics["overall_status"] = "unhealthy" + diagnostics["issues"].append({ + "component": "Model Registry", + "severity": "error", + "message": "Failed to load model registry", + "resolution": "Check config/defaults.yml for syntax errors" + }) + diagnostics["components"]["model_registry"] = { + "status": "unhealthy", + "message": "Registry failed to load" + } + except Exception as e: + diagnostics["overall_status"] = "partial" + diagnostics["issues"].append({ + "component": "Model Registry", + "severity": "error", + "message": f"Error loading registry: {str(e)}", + "resolution": "Check logs for detailed error information" + }) + diagnostics["components"]["model_registry"] = { + "status": "unhealthy", + "message": str(e) + } + + # Check environment variables + env_checks = [ + ("DEEPGRAM_API_KEY", "Required for Deepgram transcription"), + ("OPENAI_API_KEY", "Required for OpenAI LLM and embeddings"), + ("AUTH_SECRET_KEY", "Required for authentication"), + ("ADMIN_EMAIL", "Required for admin user login"), + ("ADMIN_PASSWORD", "Required for admin user login"), + ] + + for env_var, 
description in env_checks: + value = os.getenv(env_var) + if not value or value == "": + diagnostics["warnings"].append({ + "component": "Environment Variables", + "severity": "warning", + "message": f"{env_var} not set - {description}", + "resolution": f"Set {env_var} in .env file" + }) + + return diagnostics + + async def get_current_metrics(): """Get current system metrics.""" try: @@ -64,8 +266,8 @@ async def get_auth_config(): async def get_diarization_settings(): """Get current diarization settings.""" try: - # Reload from file to get latest settings - settings = load_diarization_settings_from_file() + # Get settings using OmegaConf + settings = load_diarization_settings() return { "settings": settings, "status": "success" @@ -75,7 +277,7 @@ async def get_diarization_settings(): raise e -async def save_diarization_settings(settings: dict): +async def save_diarization_settings_controller(settings: dict): """Save diarization settings.""" try: # Validate settings @@ -84,11 +286,13 @@ async def save_diarization_settings(settings: dict): "min_duration_off", "min_speakers", "max_speakers" } + # Filter to only valid keys (allow round-trip GETβ†’POST) + filtered_settings = {} for key, value in settings.items(): if key not in valid_keys: - raise HTTPException(status_code=400, detail=f"Invalid setting key: {key}") + continue # Skip unknown keys instead of rejecting - # Type validation + # Type validation for known keys only if key in ["min_speakers", "max_speakers"]: if not isinstance(value, int) or value < 1 or value > 20: raise HTTPException(status_code=400, detail=f"Invalid value for {key}: must be integer 1-20") @@ -98,34 +302,165 @@ async def save_diarization_settings(settings: dict): else: if not isinstance(value, (int, float)) or value < 0: raise HTTPException(status_code=400, detail=f"Invalid value for {key}: must be positive number") - + + filtered_settings[key] = value + + # Reject if NO valid keys provided (completely invalid request) + if not 
filtered_settings: + raise HTTPException(status_code=400, detail="No valid diarization settings provided") + # Get current settings and merge with new values - current_settings = load_diarization_settings_from_file() - current_settings.update(settings) - - # Save to file - if save_diarization_settings_to_file(current_settings): - logger.info(f"Updated and saved diarization settings: {settings}") - + current_settings = load_diarization_settings() + current_settings.update(filtered_settings) + + # Save using OmegaConf + if save_diarization_settings(current_settings): + logger.info(f"Updated and saved diarization settings: {filtered_settings}") + return { "message": "Diarization settings saved successfully", "settings": current_settings, "status": "success" } else: - # Even if file save fails, we've updated the in-memory settings - logger.warning("Settings updated in memory but file save failed") + logger.warning("Settings save failed") return { - "message": "Settings updated (file save failed)", + "message": "Settings save failed", "settings": current_settings, - "status": "partial" + "status": "error" } - + except Exception as e: logger.exception("Error saving diarization settings") raise e +async def get_misc_settings(): + """Get current miscellaneous settings.""" + try: + # Get settings using OmegaConf + settings = load_misc_settings() + return { + "settings": settings, + "status": "success" + } + except Exception as e: + logger.exception("Error getting misc settings") + raise e + + +async def save_misc_settings_controller(settings: dict): + """Save miscellaneous settings.""" + try: + # Validate settings + valid_keys = {"always_persist_enabled", "use_provider_segments"} + + # Filter to only valid keys + filtered_settings = {} + for key, value in settings.items(): + if key not in valid_keys: + continue # Skip unknown keys + + # Type validation + if not isinstance(value, bool): + raise HTTPException(status_code=400, detail=f"Invalid value for {key}: must be 
boolean") + + filtered_settings[key] = value + + # Reject if NO valid keys provided + if not filtered_settings: + raise HTTPException(status_code=400, detail="No valid misc settings provided") + + # Save using OmegaConf + if save_misc_settings(filtered_settings): + # Get updated settings + updated_settings = load_misc_settings() + logger.info(f"Updated and saved misc settings: {filtered_settings}") + + return { + "message": "Miscellaneous settings saved successfully", + "settings": updated_settings, + "status": "success" + } + else: + logger.warning("Settings save failed") + return { + "message": "Settings save failed", + "settings": load_misc_settings(), + "status": "error" + } + + except HTTPException: + raise + except Exception as e: + logger.exception("Error saving misc settings") + raise e + + +async def get_cleanup_settings_controller(user: User) -> dict: + """ + Get current cleanup settings (admin only). + + Args: + user: Authenticated admin user + + Returns: + Dict with cleanup settings + """ + from advanced_omi_backend.config import get_cleanup_settings + + return get_cleanup_settings() + + +async def save_cleanup_settings_controller( + auto_cleanup_enabled: bool, + retention_days: int, + user: User +) -> dict: + """ + Save cleanup settings (admin only). 
+ + Args: + auto_cleanup_enabled: Enable/disable automatic cleanup + retention_days: Number of days to retain soft-deleted conversations + user: Authenticated admin user + + Returns: + Updated cleanup settings + + Raises: + ValueError: If validation fails + """ + from advanced_omi_backend.config import CleanupSettings, save_cleanup_settings + + # Validation + if not isinstance(auto_cleanup_enabled, bool): + raise ValueError("auto_cleanup_enabled must be a boolean") + + if not isinstance(retention_days, int): + raise ValueError("retention_days must be an integer") + + if retention_days < 1 or retention_days > 365: + raise ValueError("retention_days must be between 1 and 365") + + # Create settings object + settings = CleanupSettings( + auto_cleanup_enabled=auto_cleanup_enabled, + retention_days=retention_days + ) + + # Save using OmegaConf + save_cleanup_settings(settings) + + logger.info(f"Admin {user.email} updated cleanup settings: auto_cleanup={auto_cleanup_enabled}, retention={retention_days}d") + + return { + "auto_cleanup_enabled": settings.auto_cleanup_enabled, + "retention_days": settings.retention_days, + "message": "Cleanup settings saved successfully" + } + + async def get_speaker_configuration(user: User): """Get current user's primary speakers configuration.""" try: @@ -555,3 +890,417 @@ async def validate_chat_config_yaml(prompt_text: str) -> dict: except Exception as e: logger.error(f"Error validating chat config: {e}") return {"valid": False, "error": f"Validation error: {str(e)}"} + + +# Plugin Configuration Management Functions + +async def get_plugins_config_yaml() -> str: + """Get plugins configuration as YAML text.""" + try: + plugins_yml_path = get_plugins_yml_path() + + # Default empty plugins config + default_config = """plugins: + # No plugins configured yet + # Example plugin configuration: + # homeassistant: + # enabled: true + # access_level: transcript + # trigger: + # type: wake_word + # wake_word: vivi + # ha_url: 
http://localhost:8123 + # ha_token: YOUR_TOKEN_HERE +""" + + if not plugins_yml_path.exists(): + return default_config + + with open(plugins_yml_path, 'r') as f: + yaml_content = f.read() + + return yaml_content + + except Exception as e: + logger.error(f"Error loading plugins config: {e}") + raise + + +async def save_plugins_config_yaml(yaml_content: str) -> dict: + """Save plugins configuration from YAML text.""" + try: + plugins_yml_path = get_plugins_yml_path() + + # Validate YAML can be parsed + try: + parsed_config = yaml.safe_load(yaml_content) + if not isinstance(parsed_config, dict): + raise ValueError("Configuration must be a YAML dictionary") + + # Validate has 'plugins' key + if 'plugins' not in parsed_config: + raise ValueError("Configuration must contain 'plugins' key") + + except yaml.YAMLError as e: + raise ValueError(f"Invalid YAML syntax: {e}") + + # Create config directory if it doesn't exist + plugins_yml_path.parent.mkdir(parents=True, exist_ok=True) + + # Backup existing config + if plugins_yml_path.exists(): + backup_path = str(plugins_yml_path) + '.backup' + shutil.copy2(plugins_yml_path, backup_path) + logger.info(f"Created plugins config backup at {backup_path}") + + # Save new config + with open(plugins_yml_path, 'w') as f: + f.write(yaml_content) + + # Hot-reload plugins (optional - may require restart) + try: + from advanced_omi_backend.services.plugin_service import get_plugin_router + plugin_router = get_plugin_router() + if plugin_router: + logger.info("Plugin configuration updated - restart backend for changes to take effect") + except Exception as reload_err: + logger.warning(f"Could not reload plugins: {reload_err}") + + logger.info("Plugins configuration updated successfully") + + return { + "success": True, + "message": "Plugins configuration updated successfully. Restart backend for changes to take effect." 
+ } + + except Exception as e: + logger.error(f"Error saving plugins config: {e}") + raise + + +async def validate_plugins_config_yaml(yaml_content: str) -> dict: + """Validate plugins configuration YAML.""" + try: + # Parse YAML + try: + parsed_config = yaml.safe_load(yaml_content) + except yaml.YAMLError as e: + return {"valid": False, "error": f"Invalid YAML syntax: {e}"} + + # Check structure + if not isinstance(parsed_config, dict): + return {"valid": False, "error": "Configuration must be a YAML dictionary"} + + if 'plugins' not in parsed_config: + return {"valid": False, "error": "Configuration must contain 'plugins' key"} + + plugins = parsed_config['plugins'] + if not isinstance(plugins, dict): + return {"valid": False, "error": "'plugins' must be a dictionary"} + + # Validate each plugin + valid_access_levels = ['transcript', 'conversation', 'memory'] + valid_trigger_types = ['wake_word', 'always', 'conditional'] + + for plugin_id, plugin_config in plugins.items(): + if not isinstance(plugin_config, dict): + return {"valid": False, "error": f"Plugin '{plugin_id}' config must be a dictionary"} + + # Check required fields + if 'enabled' in plugin_config and not isinstance(plugin_config['enabled'], bool): + return {"valid": False, "error": f"Plugin '{plugin_id}': 'enabled' must be boolean"} + + if 'access_level' in plugin_config and plugin_config['access_level'] not in valid_access_levels: + return {"valid": False, "error": f"Plugin '{plugin_id}': invalid access_level (must be one of {valid_access_levels})"} + + if 'trigger' in plugin_config: + trigger = plugin_config['trigger'] + if not isinstance(trigger, dict): + return {"valid": False, "error": f"Plugin '{plugin_id}': 'trigger' must be a dictionary"} + + if 'type' in trigger and trigger['type'] not in valid_trigger_types: + return {"valid": False, "error": f"Plugin '{plugin_id}': invalid trigger type (must be one of {valid_trigger_types})"} + + return {"valid": True, "message": "Configuration is valid"} 
+ + except Exception as e: + logger.error(f"Error validating plugins config: {e}") + return {"valid": False, "error": f"Validation error: {str(e)}"} + + +# Structured Plugin Configuration Management Functions (Form-based UI) + +async def get_plugins_metadata() -> dict: + """Get plugin metadata for form-based configuration UI. + + Returns complete metadata for all discovered plugins including: + - Plugin information (name, description, enabled status) + - Auto-generated schemas from config.yml (or explicit schema.yml) + - Current configuration with masked secrets + - Orchestration settings (events, conditions) + + Returns: + Dict with plugins list containing metadata for each plugin + """ + try: + from advanced_omi_backend.services.plugin_service import ( + discover_plugins, + get_plugin_metadata, + ) + + # Discover all available plugins + discovered_plugins = discover_plugins() + + # Load orchestration config from plugins.yml + plugins_yml_path = get_plugins_yml_path() + orchestration_configs = {} + + if plugins_yml_path.exists(): + with open(plugins_yml_path, 'r') as f: + plugins_data = yaml.safe_load(f) or {} + orchestration_configs = plugins_data.get('plugins', {}) + + # Build metadata for each plugin + plugins_metadata = [] + for plugin_id, plugin_class in discovered_plugins.items(): + # Get orchestration config (or empty dict if not configured) + orchestration_config = orchestration_configs.get(plugin_id, { + 'enabled': False, + 'events': [], + 'condition': {'type': 'always'} + }) + + # Get complete metadata including schema + metadata = get_plugin_metadata(plugin_id, plugin_class, orchestration_config) + plugins_metadata.append(metadata) + + logger.info(f"Retrieved metadata for {len(plugins_metadata)} plugins") + + return { + "plugins": plugins_metadata, + "status": "success" + } + + except Exception as e: + logger.exception("Error getting plugins metadata") + raise e + + +async def update_plugin_config_structured(plugin_id: str, config: dict) -> dict: + 
"""Update plugin configuration from structured JSON (form data). + + Updates the three-file plugin architecture: + 1. config/plugins.yml - Orchestration (enabled, events, condition) + 2. plugins/{plugin_id}/config.yml - Settings with ${ENV_VAR} references + 3. backends/advanced/.env - Actual secret values + + Args: + plugin_id: Plugin identifier + config: Structured configuration with 'orchestration', 'settings', 'env_vars' sections + + Returns: + Success message with list of updated files + """ + try: + from advanced_omi_backend.services.plugin_service import discover_plugins + import advanced_omi_backend.plugins + + # Validate plugin exists + discovered_plugins = discover_plugins() + if plugin_id not in discovered_plugins: + raise ValueError(f"Plugin '{plugin_id}' not found") + + updated_files = [] + + # 1. Update config/plugins.yml (orchestration) + if 'orchestration' in config: + plugins_yml_path = get_plugins_yml_path() + + # Load current plugins.yml + if plugins_yml_path.exists(): + with open(plugins_yml_path, 'r') as f: + plugins_data = yaml.safe_load(f) or {} + else: + plugins_data = {} + + if 'plugins' not in plugins_data: + plugins_data['plugins'] = {} + + # Update orchestration config + orchestration = config['orchestration'] + plugins_data['plugins'][plugin_id] = { + 'enabled': orchestration.get('enabled', False), + 'events': orchestration.get('events', []), + 'condition': orchestration.get('condition', {'type': 'always'}) + } + + # Create backup + if plugins_yml_path.exists(): + backup_path = str(plugins_yml_path) + '.backup' + shutil.copy2(plugins_yml_path, backup_path) + + # Create config directory if needed + plugins_yml_path.parent.mkdir(parents=True, exist_ok=True) + + # Write updated plugins.yml + with open(plugins_yml_path, 'w') as f: + yaml.dump(plugins_data, f, default_flow_style=False, sort_keys=False) + + updated_files.append(str(plugins_yml_path)) + logger.info(f"Updated orchestration config for '{plugin_id}' in {plugins_yml_path}") + + # 
2. Update plugins/{plugin_id}/config.yml (settings with env var references) + if 'settings' in config: + plugins_dir = Path(advanced_omi_backend.plugins.__file__).parent + plugin_config_path = plugins_dir / plugin_id / "config.yml" + + # Load current config.yml + if plugin_config_path.exists(): + with open(plugin_config_path, 'r') as f: + plugin_config_data = yaml.safe_load(f) or {} + else: + plugin_config_data = {} + + # Update settings (preserve ${ENV_VAR} references) + settings = config['settings'] + plugin_config_data.update(settings) + + # Create backup + if plugin_config_path.exists(): + backup_path = str(plugin_config_path) + '.backup' + shutil.copy2(plugin_config_path, backup_path) + + # Write updated config.yml + with open(plugin_config_path, 'w') as f: + yaml.dump(plugin_config_data, f, default_flow_style=False, sort_keys=False) + + updated_files.append(str(plugin_config_path)) + logger.info(f"Updated settings for '{plugin_id}' in {plugin_config_path}") + + # 3. Update .env (only changed env vars) + if 'env_vars' in config and config['env_vars']: + env_path = os.path.join(os.getcwd(), ".env") + + if not os.path.exists(env_path): + raise FileNotFoundError(f".env file not found at {env_path}") + + # Read current .env + with open(env_path, 'r') as f: + env_lines = f.readlines() + + # Create backup + backup_path = f"{env_path}.backup" + shutil.copy2(env_path, backup_path) + + # Update env vars (only if not masked) + env_vars = config['env_vars'] + updated_env_lines = [] + updated_vars = set() + + for line in env_lines: + line_updated = False + for env_var, value in env_vars.items(): + # Skip if value is masked (not actually changed) + if value == 'β€’β€’β€’β€’β€’β€’β€’β€’β€’β€’β€’β€’': + continue + + if line.strip().startswith(f"{env_var}="): + updated_env_lines.append(f"{env_var}={value}\n") + updated_vars.add(env_var) + line_updated = True + break + + if not line_updated: + updated_env_lines.append(line) + + # Add new env vars that weren't found in file + 
for env_var, value in env_vars.items(): + if value != 'β€’β€’β€’β€’β€’β€’β€’β€’β€’β€’β€’β€’' and env_var not in updated_vars: + updated_env_lines.append(f"{env_var}={value}\n") + updated_vars.add(env_var) + + # Write updated .env + if updated_vars: + with open(env_path, 'w') as f: + f.writelines(updated_env_lines) + + updated_files.append(env_path) + logger.info(f"Updated {len(updated_vars)} environment variables in {env_path}") + + return { + "success": True, + "message": f"Plugin '{plugin_id}' configuration updated successfully. Restart backend for changes to take effect.", + "updated_files": updated_files, + "requires_restart": True, + "status": "success" + } + + except Exception as e: + logger.exception(f"Error updating structured config for plugin '{plugin_id}'") + raise e + + +async def test_plugin_connection(plugin_id: str, config: dict) -> dict: + """Test plugin connection/configuration without saving. + + Calls the plugin's test_connection method if available to validate + configuration (e.g., SMTP connection, Home Assistant API). 
+ + Args: + plugin_id: Plugin identifier + config: Configuration to test (same structure as update_plugin_config_structured) + + Returns: + Test result with success status and details + """ + try: + from advanced_omi_backend.services.plugin_service import discover_plugins, expand_env_vars + + # Validate plugin exists + discovered_plugins = discover_plugins() + if plugin_id not in discovered_plugins: + raise ValueError(f"Plugin '{plugin_id}' not found") + + plugin_class = discovered_plugins[plugin_id] + + # Check if plugin supports testing + if not hasattr(plugin_class, 'test_connection'): + return { + "success": False, + "message": f"Plugin '{plugin_id}' does not support connection testing", + "status": "unsupported" + } + + # Build complete config from provided data + test_config = {} + + # Merge settings + if 'settings' in config: + test_config.update(config['settings']) + + # Add env vars (expand any ${ENV_VAR} references with test values) + if 'env_vars' in config: + for key, value in config['env_vars'].items(): + # Skip masked values + if value == 'β€’β€’β€’β€’β€’β€’β€’β€’β€’β€’β€’β€’': + # Use actual env var value + value = os.getenv(key, '') + test_config[key.lower()] = value + + # Expand any remaining env var references + test_config = expand_env_vars(test_config) + + # Call plugin's test_connection static method + result = await plugin_class.test_connection(test_config) + + logger.info(f"Test connection for '{plugin_id}': {result.get('message', 'No message')}") + + return result + + except Exception as e: + logger.exception(f"Error testing connection for plugin '{plugin_id}'") + return { + "success": False, + "message": f"Connection test failed: {str(e)}", + "status": "error" + } diff --git a/backends/advanced/src/advanced_omi_backend/controllers/websocket_controller.py b/backends/advanced/src/advanced_omi_backend/controllers/websocket_controller.py index 50ffc77f..89e5b46f 100644 --- 
a/backends/advanced/src/advanced_omi_backend/controllers/websocket_controller.py +++ b/backends/advanced/src/advanced_omi_backend/controllers/websocket_controller.py @@ -16,11 +16,14 @@ from typing import Optional from fastapi import WebSocket, WebSocketDisconnect, Query +from starlette.websockets import WebSocketState from friend_lite.decoder import OmiOpusDecoder +import redis.asyncio as redis from advanced_omi_backend.auth import websocket_auth from advanced_omi_backend.client_manager import generate_client_id, get_client_manager from advanced_omi_backend.constants import OMI_CHANNELS, OMI_SAMPLE_RATE, OMI_SAMPLE_WIDTH +from advanced_omi_backend.controllers.session_controller import mark_session_complete from advanced_omi_backend.utils.audio_utils import process_audio_chunk from advanced_omi_backend.services.audio_stream import AudioStreamProducer from advanced_omi_backend.services.audio_stream.producer import get_audio_stream_producer @@ -39,6 +42,89 @@ pending_connections: set[str] = set() +async def subscribe_to_interim_results(websocket: WebSocket, session_id: str) -> None: + """ + Subscribe to interim transcription results from Redis Pub/Sub and forward to client WebSocket. + + Runs as background task during WebSocket connection. Listens for interim and final + transcription results published by the Deepgram streaming consumer and forwards them + to the connected client for real-time transcript display. + + Args: + websocket: Connected WebSocket client + session_id: Session ID (client_id) to subscribe to + + Note: + This task runs continuously until the WebSocket disconnects or the task is cancelled. 
+ Results are published to Redis Pub/Sub channel: transcription:interim:{session_id} + """ + redis_url = os.getenv("REDIS_URL", "redis://localhost:6379/0") + + try: + # Create Redis client for Pub/Sub + redis_client = await redis.from_url(redis_url, decode_responses=True) + + # Create Pub/Sub instance + pubsub = redis_client.pubsub() + + # Subscribe to interim results channel for this session + channel = f"transcription:interim:{session_id}" + await pubsub.subscribe(channel) + + logger.info(f"πŸ“’ Subscribed to interim results channel: {channel}") + + # Listen for messages + while True: + try: + message = await pubsub.get_message(ignore_subscribe_messages=True, timeout=1.0) + + if message and message['type'] == 'message': + # Parse result data + try: + result_data = json.loads(message['data']) + + # Forward to client WebSocket + await websocket.send_json({ + "type": "interim_transcript", + "data": result_data + }) + + # Log for debugging + is_final = result_data.get("is_final", False) + text_preview = result_data.get("text", "")[:50] + result_type = "FINAL" if is_final else "interim" + logger.debug(f"βœ‰οΈ Forwarded {result_type} result to client {session_id}: {text_preview}...") + + except json.JSONDecodeError as e: + logger.error(f"Failed to parse interim result JSON: {e}") + except Exception as send_error: + logger.error(f"Failed to send interim result to client {session_id}: {send_error}") + # WebSocket might be closed, exit loop + break + + except asyncio.TimeoutError: + # No message received, continue waiting + continue + except asyncio.CancelledError: + logger.info(f"Interim results subscriber cancelled for session {session_id}") + break + except Exception as e: + logger.error(f"Error in interim results subscriber for {session_id}: {e}", exc_info=True) + break + + except Exception as e: + logger.error(f"Failed to initialize interim results subscriber for {session_id}: {e}", exc_info=True) + finally: + try: + # Unsubscribe and close connections + await 
pubsub.unsubscribe(channel) + await pubsub.close() + await redis_client.aclose() + logger.info(f"πŸ”• Unsubscribed from interim results channel: {channel}") + except Exception as cleanup_error: + logger.error(f"Error cleaning up interim results subscriber: {cleanup_error}") + + async def parse_wyoming_protocol(ws: WebSocket) -> tuple[dict, Optional[bytes]]: """Parse Wyoming protocol: JSON header line followed by optional binary payload. @@ -105,9 +191,9 @@ async def create_client_state(client_id: str, user, device_name: Optional[str] = client_id, CHUNK_DIR, user.user_id, user.email ) - # Also track in persistent mapping (for database queries) - from advanced_omi_backend.client_manager import track_client_user_relationship - track_client_user_relationship(client_id, user.user_id) + # Also track in persistent mapping (for database queries + cross-container Redis) + from advanced_omi_backend.client_manager import track_client_user_relationship_async + await track_client_user_relationship_async(client_id, user.user_id) # Register client in user model (persistent) from advanced_omi_backend.users import register_client_to_user @@ -117,35 +203,22 @@ async def create_client_state(client_id: str, user, device_name: Optional[str] = async def cleanup_client_state(client_id: str): - """Clean up and remove client state, including cancelling speech detection job and marking session complete.""" - # Cancel the speech detection job for this client - from advanced_omi_backend.controllers.queue_controller import redis_conn - from rq.job import Job + """ + Clean up and remove client state, marking session complete. + + Note: We do NOT cancel the speech detection job here because: + 1. The job needs to process all audio data that was already sent + 2. If speech was detected, it should create a conversation + 3. The job will complete naturally when it sees session status = "finalizing" + 4. The job has a grace period (15s) to wait for final transcription + 5. 
RQ's job_timeout (24h) prevents jobs from hanging forever + """ + # Note: Previously we cancelled the speech detection job here, but this prevented + # conversations from being created when WebSocket disconnects mid-recording. + # The speech detection job now monitors session status and completes naturally. import redis.asyncio as redis - try: - job_id_key = f"speech_detection_job:{client_id}" - job_id_bytes = redis_conn.get(job_id_key) - - if job_id_bytes: - job_id = job_id_bytes.decode() - logger.info(f"πŸ›‘ Cancelling speech detection job {job_id} for client {client_id}") - - try: - # Fetch and cancel the job - job = Job.fetch(job_id, connection=redis_conn) - job.cancel() - logger.info(f"βœ… Successfully cancelled speech detection job {job_id}") - except Exception as job_error: - logger.warning(f"⚠️ Failed to cancel job {job_id}: {job_error}") - - # Clean up the tracking key - redis_conn.delete(job_id_key) - logger.info(f"🧹 Cleaned up job tracking key for client {client_id}") - else: - logger.debug(f"No speech detection job found for client {client_id}") - except Exception as e: - logger.warning(f"⚠️ Error during job cancellation for client {client_id}: {e}") + logger.info(f"πŸ”„ Letting speech detection job complete naturally for client {client_id} (if running)") # Mark all active sessions for this client as complete AND delete Redis streams try: @@ -153,6 +226,10 @@ async def cleanup_client_state(client_id: str): redis_url = os.getenv("REDIS_URL", "redis://localhost:6379/0") async_redis = redis.from_url(redis_url, decode_responses=False) + # Get audio stream producer for finalization + from advanced_omi_backend.services.audio_stream.producer import get_audio_stream_producer + audio_stream_producer = get_audio_stream_producer() + # Find all session keys for this client and mark them complete pattern = f"audio:session:*" cursor = 0 @@ -165,14 +242,19 @@ async def cleanup_client_state(client_id: str): # Check if this session belongs to this client client_id_bytes 
= await async_redis.hget(key, "client_id") if client_id_bytes and client_id_bytes.decode() == client_id: - # Mark session as complete (WebSocket disconnected) - await async_redis.hset(key, mapping={ - "status": "complete", - "completed_at": str(time.time()), - "completion_reason": "websocket_disconnect" - }) session_id = key.decode().replace("audio:session:", "") - logger.info(f"πŸ“Š Marked session {session_id[:12]} as complete (WebSocket disconnect)") + + # Check session status + status_bytes = await async_redis.hget(key, "status") + status = status_bytes.decode() if status_bytes else None + + # If session is still active, finalize it first (sets status + completion_reason atomically) + if status in ["active", None]: + logger.info(f"πŸ“Š Finalizing active session {session_id[:12]} due to WebSocket disconnect") + await audio_stream_producer.finalize_session(session_id, completion_reason="websocket_disconnect") + + # Mark session as complete (WebSocket disconnected) + await mark_session_complete(async_redis, session_id, "websocket_disconnect") sessions_closed += 1 if cursor == 0: @@ -181,12 +263,32 @@ async def cleanup_client_state(client_id: str): if sessions_closed > 0: logger.info(f"βœ… Closed {sessions_closed} active session(s) for client {client_id}") - # Delete Redis Streams for this client + # Set TTL on Redis Streams for this client (allows consumer groups to finish processing) stream_pattern = f"audio:stream:{client_id}" stream_key = await async_redis.exists(stream_pattern) if stream_key: - await async_redis.delete(stream_pattern) - logger.info(f"🧹 Deleted Redis stream: {stream_pattern}") + # Check how many messages are in the stream + stream_length = await async_redis.xlen(stream_pattern) + + # Check for pending messages in consumer groups + pending_count = 0 + try: + # Check streaming-transcription consumer group for pending messages + pending_info = await async_redis.xpending(stream_pattern, "streaming-transcription") + if pending_info: + pending_count = 
pending_info.get('pending', 0) + except Exception as e: + # Consumer group might not exist yet - that's ok + logger.debug(f"No consumer group for {stream_pattern}: {e}") + + if stream_length > 0 or pending_count > 0: + logger.warning( + f"⚠️ Closing {stream_pattern} with unprocessed data: " + f"{stream_length} messages in stream, {pending_count} pending in consumer group" + ) + + await async_redis.expire(stream_pattern, 60) # 60 second TTL for consumer group fan-out + logger.info(f"⏰ Set 60s TTL on Redis stream: {stream_pattern}") else: logger.debug(f"No Redis stream found for client {client_id}") @@ -279,8 +381,9 @@ async def _initialize_streaming_session( user_id: str, user_email: str, client_id: str, - audio_format: dict -) -> None: + audio_format: dict, + websocket: Optional[WebSocket] = None +) -> Optional[asyncio.Task]: """ Initialize streaming session with Redis and enqueue processing jobs. @@ -291,15 +394,22 @@ async def _initialize_streaming_session( user_email: User email client_id: Client ID audio_format: Audio format dict from audio-start event + websocket: Optional WebSocket connection to launch interim results subscriber + + Returns: + Interim results subscriber task if websocket provided and session initialized, None otherwise """ + application_logger.info( + f"πŸ”΄ BACKEND: _initialize_streaming_session called for {client_id}" + ) + if hasattr(client_state, 'stream_session_id'): application_logger.debug(f"Session already initialized for {client_id}") - return + return None - # Initialize stream session - client_state.stream_session_id = str(uuid.uuid4()) - client_state.stream_chunk_count = 0 - client_state.stream_audio_format = audio_format + # Initialize stream session - use client_id as session_id for predictable lookup + # All other session metadata goes to Redis (single source of truth) + client_state.stream_session_id = client_state.client_id application_logger.info(f"πŸ†” Created stream session: {client_state.stream_session_id}") # Determine 
transcription provider from config.yml @@ -313,21 +423,31 @@ async def _initialize_streaming_session( if not stt_model: raise ValueError("No default STT model configured in config.yml (defaults.stt)") - provider = stt_model.model_provider.lower() - if provider not in ["deepgram", "parakeet"]: - raise ValueError(f"Unsupported STT provider: {provider}. Expected: deepgram or parakeet") + # Use model_provider for session tracking (generic, not validated against hardcoded list) + provider = stt_model.model_provider.lower() if stt_model.model_provider else stt_model.name application_logger.info(f"πŸ“‹ Using STT provider: {provider} (model: {stt_model.name})") - - # Initialize session tracking in Redis + + # Initialize session tracking in Redis (SINGLE SOURCE OF TRUTH for session metadata) + # This includes user_email, connection info, audio format, chunk counters, job IDs, etc. + connection_id = f"ws_{client_id}_{int(time.time())}" await audio_stream_producer.init_session( session_id=client_state.stream_session_id, user_id=user_id, client_id=client_id, + user_email=user_email, + connection_id=connection_id, mode="streaming", provider=provider ) + # Store audio format in Redis session (not in ClientState) + from advanced_omi_backend.services.audio_stream.producer import get_audio_stream_producer + import json + session_key = f"audio:session:{client_state.stream_session_id}" + redis_client = audio_stream_producer.redis_client + await redis_client.hset(session_key, "audio_format", json.dumps(audio_format)) + # Enqueue streaming jobs (speech detection + audio persistence) from advanced_omi_backend.controllers.queue_controller import start_streaming_jobs @@ -337,8 +457,25 @@ async def _initialize_streaming_session( client_id=client_id ) - client_state.speech_detection_job_id = job_ids['speech_detection'] - client_state.audio_persistence_job_id = job_ids['audio_persistence'] + # Store job IDs in Redis session (not in ClientState) + await 
audio_stream_producer.update_session_job_ids( + session_id=client_state.stream_session_id, + speech_detection_job_id=job_ids['speech_detection'], + audio_persistence_job_id=job_ids['audio_persistence'] + ) + + # Note: Placeholder conversation creation is handled by the audio persistence job, + # which reads the always_persist_enabled setting from global config. + + # Launch interim results subscriber if WebSocket provided + subscriber_task = None + if websocket: + subscriber_task = asyncio.create_task( + subscribe_to_interim_results(websocket, client_state.stream_session_id) + ) + application_logger.info(f"πŸ“‘ Launched interim results subscriber for session {client_state.stream_session_id}") + + return subscriber_task async def _finalize_streaming_session( @@ -377,8 +514,8 @@ async def _finalize_streaming_session( # Send end-of-session signal to workers await audio_stream_producer.send_session_end_signal(session_id) - # Mark session as finalizing - await audio_stream_producer.finalize_session(session_id) + # Mark session as finalizing with user_stopped reason (audio-stop event) + await audio_stream_producer.finalize_session(session_id, completion_reason="user_stopped") # NOTE: Finalize job disabled - open_conversation_job now handles everything # The open_conversation_job will: @@ -399,11 +536,10 @@ async def _finalize_streaming_session( f"βœ… Session {session_id[:12]} marked as finalizing - open_conversation_job will handle cleanup" ) - # Clear session state - for attr in ['stream_session_id', 'stream_chunk_count', 'stream_audio_format', - 'speech_detection_job_id', 'audio_persistence_job_id']: - if hasattr(client_state, attr): - delattr(client_state, attr) + # Clear session state from ClientState (only stream_session_id is stored there now) + # All other session metadata lives in Redis (single source of truth) + if hasattr(client_state, 'stream_session_id'): + delattr(client_state, 'stream_session_id') except Exception as finalize_error: 
application_logger.error( @@ -439,14 +575,18 @@ async def _publish_audio_to_stream( application_logger.warning(f"⚠️ Received audio chunk before session initialized for {client_id}") return - # Increment chunk count and format chunk ID - client_state.stream_chunk_count += 1 - chunk_id = f"{client_state.stream_chunk_count:05d}" + session_id = client_state.stream_session_id + + # Increment chunk count in Redis (single source of truth) and format chunk ID + session_key = f"audio:session:{session_id}" + redis_client = audio_stream_producer.redis_client + chunk_count = await redis_client.hincrby(session_key, "chunks_published", 1) + chunk_id = f"{chunk_count:05d}" # Publish to Redis Stream using producer await audio_stream_producer.add_audio_chunk( audio_data=audio_data, - session_id=client_state.stream_session_id, + session_id=session_id, chunk_id=chunk_id, user_id=user_id, client_id=client_id, @@ -516,8 +656,9 @@ async def _handle_streaming_mode_audio( audio_format: dict, user_id: str, user_email: str, - client_id: str -) -> None: + client_id: str, + websocket: Optional[WebSocket] = None +) -> Optional[asyncio.Task]: """ Handle audio chunk in streaming mode. 
@@ -529,16 +670,22 @@ async def _handle_streaming_mode_audio( user_id: User ID user_email: User email client_id: Client ID + websocket: Optional WebSocket connection to launch interim results subscriber + + Returns: + Interim results subscriber task if websocket provided and session initialized, None otherwise """ # Initialize session if needed + subscriber_task = None if not hasattr(client_state, 'stream_session_id'): - await _initialize_streaming_session( + subscriber_task = await _initialize_streaming_session( client_state, audio_stream_producer, user_id, user_email, client_id, - audio_format + audio_format, + websocket=websocket # Pass WebSocket to launch interim results subscriber ) # Publish to Redis Stream @@ -553,6 +700,8 @@ async def _handle_streaming_mode_audio( audio_format.get("width", 2) ) + return subscriber_task + async def _handle_batch_mode_audio( client_state, @@ -561,7 +710,7 @@ async def _handle_batch_mode_audio( client_id: str ) -> None: """ - Handle audio chunk in batch mode - accumulate in memory. + Handle audio chunk in batch mode with rolling 30-minute limit. 
Args: client_state: Client state object @@ -573,14 +722,53 @@ async def _handle_batch_mode_audio( if not hasattr(client_state, 'batch_audio_chunks'): client_state.batch_audio_chunks = [] client_state.batch_audio_format = audio_format + client_state.batch_audio_bytes = 0 # Track total bytes + client_state.batch_chunks_processed = 0 # Track how many batches processed application_logger.info(f"πŸ“¦ Started batch audio accumulation for {client_id}") # Accumulate audio client_state.batch_audio_chunks.append(audio_data) + client_state.batch_audio_bytes += len(audio_data) application_logger.debug( f"πŸ“¦ Accumulated chunk #{len(client_state.batch_audio_chunks)} ({len(audio_data)} bytes) for {client_id}" ) + # Calculate duration: sample_rate * width * channels = bytes/second + sample_rate = audio_format.get("rate", 16000) + width = audio_format.get("width", 2) + channels = audio_format.get("channels", 1) + bytes_per_second = sample_rate * width * channels + + accumulated_seconds = client_state.batch_audio_bytes / bytes_per_second + MAX_BATCH_SECONDS = 30 * 60 # 30 minutes + + # Check if we've hit the 30-minute limit + if accumulated_seconds >= MAX_BATCH_SECONDS: + application_logger.warning( + f"⚠️ Batch accumulation reached 30-minute limit " + f"({accumulated_seconds:.1f}s, {client_state.batch_audio_bytes / 1024 / 1024:.1f} MB). " + f"Processing batch #{client_state.batch_chunks_processed + 1}..." + ) + + # Process this batch (will create conversation and transcribe) + await _process_rolling_batch( + client_state, + user_id=client_state.user_id, # Need to store these on session start + user_email=client_state.user_email, + client_id=client_state.client_id, + batch_number=client_state.batch_chunks_processed + 1 + ) + + # Clear buffer for next batch + client_state.batch_audio_chunks = [] + client_state.batch_audio_bytes = 0 + client_state.batch_chunks_processed += 1 + + application_logger.info( + f"βœ… Rolled batch #{client_state.batch_chunks_processed}. 
" + f"Starting fresh accumulation for next 30 minutes." + ) + async def _handle_audio_chunk( client_state, @@ -589,8 +777,9 @@ async def _handle_audio_chunk( audio_format: dict, user_id: str, user_email: str, - client_id: str -) -> None: + client_id: str, + websocket: Optional[WebSocket] = None +) -> Optional[asyncio.Task]: """ Route audio chunk to appropriate mode handler (streaming or batch). @@ -602,39 +791,102 @@ async def _handle_audio_chunk( user_id: User ID user_email: User email client_id: Client ID + websocket: Optional WebSocket connection to launch interim results subscriber + + Returns: + Interim results subscriber task if websocket provided and streaming mode, None otherwise """ recording_mode = getattr(client_state, 'recording_mode', 'batch') if recording_mode == "streaming": - await _handle_streaming_mode_audio( + return await _handle_streaming_mode_audio( client_state, audio_stream_producer, audio_data, - audio_format, user_id, user_email, client_id + audio_format, user_id, user_email, client_id, + websocket=websocket ) else: await _handle_batch_mode_audio( client_state, audio_data, audio_format, client_id ) + return None async def _handle_audio_session_start( client_state, audio_format: dict, - client_id: str + client_id: str, + websocket: Optional[WebSocket] = None ) -> tuple[bool, str]: """ - Handle audio-start event - set mode and switch to audio streaming. + Handle audio-start event - validate mode and set recording mode. 
Args: client_state: Client state object audio_format: Audio format dict with mode client_id: Client ID + websocket: Optional WebSocket connection (for WebUI error messages) Returns: (audio_streaming_flag, recording_mode) """ + from advanced_omi_backend.services.transcription import is_transcription_available + recording_mode = audio_format.get("mode", "batch") + + application_logger.info( + f"πŸ”΄ BACKEND: Received audio-start for {client_id} - " + f"mode={recording_mode}, full format={audio_format}" + ) + + # Store on client state for later use client_state.recording_mode = recording_mode + # VALIDATION: Check if streaming mode is available + if recording_mode == "streaming": + if not is_transcription_available("streaming"): + error_msg = ( + "Streaming transcription not available. " + "Please use Batch mode or configure a streaming STT provider (defaults.stt_stream in config.yml)." + ) + + application_logger.warning( + f"⚠️ Streaming mode requested but stt_stream not configured for {client_id}" + ) + + # Send error to WebSocket client (for WebUI display) + if websocket and websocket.client_state == WebSocketState.CONNECTED: + try: + error_response = { + "type": "error", + "error": "streaming_not_configured", + "message": error_msg, + "code": 400 + } + await websocket.send_json(error_response) + application_logger.info(f"πŸ“€ Sent streaming error to WebUI client {client_id}") + + # Close the websocket connection after sending error + await websocket.close(code=1008, reason="Streaming transcription not configured") + application_logger.info(f"πŸ”Œ Closed WebSocket connection for {client_id} due to streaming config error") + + # Raise ValueError to exit the handler completely + raise ValueError(error_msg) + except ValueError: + # Re-raise ValueError to exit handler + raise + except Exception as e: + application_logger.error(f"Failed to send error to client: {e}") + # Still raise ValueError to exit handler + raise ValueError(error_msg) + + # For OMI devices (no 
websocket), fall back to batch mode silently + if not websocket: + application_logger.warning( + f"πŸ”„ OMI device {client_id} requested streaming but falling back to batch mode" + ) + recording_mode = "batch" + client_state.recording_mode = recording_mode + application_logger.info( f"πŸŽ™οΈ Audio session started for {client_id} - " f"Format: {audio_format.get('rate')}Hz, " @@ -682,6 +934,99 @@ async def _handle_audio_session_stop( return False # Switch back to control mode +async def _process_rolling_batch( + client_state, + user_id: str, + user_email: str, + client_id: str, + batch_number: int +) -> None: + """ + Process accumulated batch audio as a rolling segment. + + Creates conversation titled "Recording Part {batch_number}" and enqueues transcription. + + Args: + client_state: Client state with batch_audio_chunks + user_id: User ID + user_email: User email + client_id: Client ID + batch_number: Sequential batch number (1, 2, 3...) + """ + if not hasattr(client_state, 'batch_audio_chunks') or not client_state.batch_audio_chunks: + application_logger.warning(f"⚠️ No audio chunks to process for rolling batch") + return + + try: + from advanced_omi_backend.models.conversation import create_conversation + from advanced_omi_backend.utils.audio_chunk_utils import convert_audio_to_chunks + + # Combine chunks + complete_audio = b''.join(client_state.batch_audio_chunks) + application_logger.info( + f"πŸ“¦ Rolling batch #{batch_number}: Combined {len(client_state.batch_audio_chunks)} chunks " + f"into {len(complete_audio)} bytes" + ) + + # Get audio format + audio_format = getattr(client_state, 'batch_audio_format', {}) + sample_rate = audio_format.get("rate", 16000) + width = audio_format.get("width", 2) + channels = audio_format.get("channels", 1) + + # Create conversation with batch number in title + conversation = create_conversation( + user_id=user_id, + client_id=client_id, + title=f"Recording Part {batch_number}", + summary="Rolling batch processing..." 
+ ) + await conversation.insert() + conversation_id = conversation.conversation_id # Get the auto-generated ID + + # Convert to MongoDB chunks + num_chunks = await convert_audio_to_chunks( + conversation_id=conversation_id, + audio_data=complete_audio, + sample_rate=sample_rate, + channels=channels, + sample_width=width + ) + + # Enqueue transcription job + from advanced_omi_backend.controllers.queue_controller import ( + transcription_queue, + JOB_RESULT_TTL + ) + from advanced_omi_backend.workers.transcription_jobs import transcribe_full_audio_job + + version_id = str(uuid.uuid4()) + transcribe_job_id = f"transcribe_rolling_{conversation_id[:12]}_{batch_number}" + + transcription_job = transcription_queue.enqueue( + transcribe_full_audio_job, + conversation_id, + version_id, + f"rolling_batch_{batch_number}", # trigger + job_timeout=1800, # 30 minutes + result_ttl=JOB_RESULT_TTL, + job_id=transcribe_job_id, + description=f"Transcribe rolling batch #{batch_number} {conversation_id[:8]}", + meta={'conversation_id': conversation_id, 'client_id': client_id, 'batch_number': batch_number} + ) + + application_logger.info( + f"βœ… Rolling batch #{batch_number} created conversation {conversation_id}, " + f"enqueued transcription job {transcription_job.id}" + ) + + except Exception as e: + application_logger.error( + f"❌ Failed to process rolling batch #{batch_number}: {e}", + exc_info=True + ) + + async def _process_batch_audio_complete( client_state, user_id: str, @@ -702,8 +1047,8 @@ async def _process_batch_audio_complete( return try: - from advanced_omi_backend.utils.audio_utils import write_audio_file from advanced_omi_backend.models.conversation import create_conversation + from advanced_omi_backend.utils.audio_chunk_utils import convert_audio_to_chunks # Combine all chunks complete_audio = b''.join(client_state.batch_audio_chunks) @@ -711,57 +1056,92 @@ async def _process_batch_audio_complete( f"πŸ“¦ Batch mode: Combined {len(client_state.batch_audio_chunks)} 
chunks into {len(complete_audio)} bytes" ) - # Generate audio UUID and timestamp - audio_uuid = str(uuid.uuid4()) + # Timestamp for logging timestamp = int(time.time() * 1000) - # Write audio file and create AudioFile entry - relative_audio_path, file_path, duration = await write_audio_file( - raw_audio_data=complete_audio, - audio_uuid=audio_uuid, - source="websocket", - client_id=client_id, - user_id=user_id, - user_email=user_email, - timestamp=timestamp, - validate=False # PCM data, not WAV - ) + # Get audio format from batch metadata (set during audio-start) + audio_format = getattr(client_state, 'batch_audio_format', {}) + sample_rate = audio_format.get('rate', OMI_SAMPLE_RATE) + sample_width = audio_format.get('width', OMI_SAMPLE_WIDTH) + channels = audio_format.get('channels', OMI_CHANNELS) + + # Calculate audio duration + duration = len(complete_audio) / (sample_rate * sample_width * channels) application_logger.info( - f"βœ… Batch mode: Wrote audio file {relative_audio_path} ({duration:.1f}s)" + f"βœ… Batch mode: Processing audio ({duration:.1f}s)" ) # Create conversation immediately for batch audio (conversation_id auto-generated) version_id = str(uuid.uuid4()) conversation = create_conversation( - audio_uuid=audio_uuid, user_id=user_id, client_id=client_id, title="Batch Recording", summary="Processing batch audio..." 
) - conversation.audio_path = relative_audio_path await conversation.insert() conversation_id = conversation.conversation_id # Get the auto-generated ID application_logger.info(f"πŸ“ Batch mode: Created conversation {conversation_id}") - # Enqueue post-conversation processing job chain - from advanced_omi_backend.controllers.queue_controller import start_post_conversation_jobs + # Convert audio directly to MongoDB chunks (no disk intermediary) + try: + num_chunks = await convert_audio_to_chunks( + conversation_id=conversation_id, + audio_data=complete_audio, + sample_rate=sample_rate, + channels=channels, + sample_width=sample_width, + ) + application_logger.info( + f"πŸ“¦ Batch mode: Converted to {num_chunks} MongoDB chunks " + f"(conversation {conversation_id[:12]})" + ) + except Exception as chunk_error: + application_logger.error( + f"Failed to convert batch audio to chunks: {chunk_error}", + exc_info=True + ) + # Continue anyway - transcription job will handle it + + # Enqueue batch transcription job first (file uploads need transcription) + from advanced_omi_backend.controllers.queue_controller import ( + start_post_conversation_jobs, + transcription_queue, + JOB_RESULT_TTL + ) + from advanced_omi_backend.workers.transcription_jobs import transcribe_full_audio_job + + version_id = str(uuid.uuid4()) + transcribe_job_id = f"transcribe_{conversation_id[:12]}" + + transcription_job = transcription_queue.enqueue( + transcribe_full_audio_job, + conversation_id, + version_id, + "batch", # trigger + job_timeout=1800, # 30 minutes + result_ttl=JOB_RESULT_TTL, + job_id=transcribe_job_id, + description=f"Transcribe batch audio {conversation_id[:8]}", + meta={'conversation_id': conversation_id, 'client_id': client_id} + ) + application_logger.info(f"πŸ“₯ Batch mode: Enqueued transcription job {transcription_job.id}") + + # Enqueue post-conversation processing job chain (depends on transcription) job_ids = start_post_conversation_jobs( conversation_id=conversation_id, - 
audio_uuid=audio_uuid, - audio_file_path=file_path, user_id=None, # Will be read from conversation in DB by jobs - post_transcription=True, # Run batch transcription for uploads + depends_on_job=transcription_job, # Wait for transcription to complete client_id=client_id # Pass client_id for UI tracking ) application_logger.info( f"βœ… Batch mode: Enqueued job chain for {conversation_id} - " - f"transcription ({job_ids['transcription']}) β†’ " + f"transcription ({transcription_job.id}) β†’ " f"speaker ({job_ids['speaker_recognition']}) β†’ " f"memory ({job_ids['memory']})" ) @@ -788,6 +1168,7 @@ async def handle_omi_websocket( client_id = None client_state = None + interim_subscriber_task = None try: # Setup connection (accept, auth, create client state) @@ -813,14 +1194,22 @@ async def handle_omi_websocket( if header["type"] == "audio-start": # Handle audio session start + application_logger.info(f"πŸ”΄ BACKEND: Received audio-start in OMI MODE for {client_id} (header={header})") application_logger.info(f"πŸŽ™οΈ OMI audio session started for {client_id}") - await _initialize_streaming_session( + + # Store user context on client state + client_state.user_id = user.user_id + client_state.user_email = user.email + client_state.client_id = client_id + + interim_subscriber_task = await _initialize_streaming_session( client_state, audio_stream_producer, user.user_id, user.email, client_id, - header.get("data", {"rate": OMI_SAMPLE_RATE, "width": OMI_SAMPLE_WIDTH, "channels": OMI_CHANNELS}) + header.get("data", {"rate": OMI_SAMPLE_RATE, "width": OMI_SAMPLE_WIDTH, "channels": OMI_CHANNELS}), + websocket=ws # Pass WebSocket to launch interim results subscriber ) elif header["type"] == "audio-chunk" and payload: @@ -883,6 +1272,16 @@ async def handle_omi_websocket( except Exception as e: application_logger.error(f"❌ WebSocket error for client {client_id}: {e}", exc_info=True) finally: + # Cancel interim results subscriber task if running + if interim_subscriber_task and not 
interim_subscriber_task.done(): + interim_subscriber_task.cancel() + try: + await interim_subscriber_task + except asyncio.CancelledError: + application_logger.info(f"Interim subscriber task cancelled for {client_id}") + except Exception as task_error: + application_logger.error(f"Error cancelling interim subscriber task: {task_error}") + # Clean up pending connection tracking pending_connections.discard(pending_client_id) @@ -909,6 +1308,7 @@ async def handle_pcm_websocket( client_id = None client_state = None + interim_subscriber_task = None try: # Setup connection (accept, auth, create client state) @@ -935,13 +1335,35 @@ async def handle_pcm_websocket( application_logger.debug(f"βœ… Received message type: {header.get('type')} for {client_id}") if header["type"] == "audio-start": + application_logger.info(f"πŸ”΄ BACKEND: Received audio-start in CONTROL MODE for {client_id}") application_logger.debug(f"πŸŽ™οΈ Processing audio-start for {client_id}") - # Handle audio session start using helper function + + # Store user context on client state for rolling batch processing + client_state.user_id = user.user_id + client_state.user_email = user.email + client_state.client_id = client_id + + # Handle audio session start using helper function (pass websocket for error handling) audio_streaming, recording_mode = await _handle_audio_session_start( client_state, header.get("data", {}), - client_id + client_id, + websocket=ws # Pass websocket for WebUI error display ) + + # Initialize streaming session + if recording_mode == "streaming": + application_logger.info(f"πŸ”΄ BACKEND: Initializing streaming session for {client_id}") + interim_subscriber_task = await _initialize_streaming_session( + client_state, + audio_stream_producer, + user.user_id, + user.email, + client_id, + header.get("data", {}), + websocket=ws + ) + continue # Continue to audio streaming mode elif header["type"] == "ping": @@ -1011,15 +1433,19 @@ async def handle_pcm_websocket( # Route to appropriate 
mode handler audio_format = control_header.get("data", {}) - await _handle_audio_chunk( + task = await _handle_audio_chunk( client_state, audio_stream_producer, audio_data, audio_format, user.user_id, user.email, - client_id + client_id, + websocket=ws ) + # Store subscriber task if it was created (first streaming chunk) + if task and not interim_subscriber_task: + interim_subscriber_task = task else: application_logger.warning(f"Expected binary payload for audio-chunk, got: {payload_msg.keys()}") else: @@ -1044,15 +1470,19 @@ async def handle_pcm_websocket( # Route to appropriate mode handler with default format default_format = {"rate": 16000, "width": 2, "channels": 1} - await _handle_audio_chunk( + task = await _handle_audio_chunk( client_state, audio_stream_producer, audio_data, default_format, user.user_id, user.email, - client_id + client_id, + websocket=ws ) + # Store subscriber task if it was created (first streaming chunk) + if task and not interim_subscriber_task: + interim_subscriber_task = task else: application_logger.warning(f"Unexpected message format in streaming mode: {message.keys()}") @@ -1115,6 +1545,16 @@ async def handle_pcm_websocket( f"❌ PCM WebSocket error for client {client_id}: {e}", exc_info=True ) finally: + # Cancel interim results subscriber task if running + if interim_subscriber_task and not interim_subscriber_task.done(): + interim_subscriber_task.cancel() + try: + await interim_subscriber_task + except asyncio.CancelledError: + application_logger.info(f"Interim subscriber task cancelled for {client_id}") + except Exception as task_error: + application_logger.error(f"Error cancelling interim subscriber task: {task_error}") + # Clean up pending connection tracking pending_connections.discard(pending_client_id) diff --git a/backends/advanced/src/advanced_omi_backend/cron.py b/backends/advanced/src/advanced_omi_backend/cron.py new file mode 100644 index 00000000..161ceb31 --- /dev/null +++ 
b/backends/advanced/src/advanced_omi_backend/cron.py @@ -0,0 +1,121 @@ +""" +Annotation cron scheduler for AI-powered suggestion surfacing. + +This scheduler runs background jobs to: +1. Surface AI suggestions for potential transcript/memory errors (daily) +2. Fine-tune error detection models using user feedback (weekly) + +Configuration via environment variables: +- MONGODB_URI: MongoDB connection string +- DEV_MODE: When true, uses 1-minute intervals for testing + +Usage: + uv run python -m advanced_omi_backend.cron +""" + +import asyncio +import logging +import os +from datetime import datetime, timezone + +from beanie import init_beanie +from motor.motor_asyncio import AsyncIOMotorClient + +from advanced_omi_backend.models.annotation import Annotation +from advanced_omi_backend.models.conversation import Conversation +from advanced_omi_backend.models.user import User +from advanced_omi_backend.workers.annotation_jobs import ( + finetune_hallucination_model, + surface_error_suggestions, +) + +# Configure logging +logging.basicConfig( + level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s" +) +logger = logging.getLogger(__name__) + +# Configuration +MONGODB_URI = os.getenv("MONGODB_URI", "mongodb://mongo:27017") +DEV_MODE = os.getenv("DEV_MODE", "false").lower() == "true" + +# Intervals (1 minute in dev, normal in production) +if DEV_MODE: + SUGGESTION_INTERVAL = 60 # 1 minute for dev testing + TRAINING_INTERVAL = 60 # 1 minute for dev testing + logger.info("πŸ”§ DEV_MODE enabled - using 1-minute intervals for testing") +else: + SUGGESTION_INTERVAL = 24 * 60 * 60 # Daily + TRAINING_INTERVAL = 7 * 24 * 60 * 60 # Weekly + logger.info("πŸ“… Production mode - using daily/weekly intervals") + + +async def init_db(): + """Initialize database connection""" + try: + client = AsyncIOMotorClient(MONGODB_URI) + await init_beanie( + database=client.chronicle, + document_models=[Annotation, Conversation, User], + ) + logger.info("βœ… Database 
connection initialized") + except Exception as e: + logger.error(f"❌ Failed to initialize database: {e}") + raise + + +async def run_scheduler(): + """Main scheduler loop""" + await init_db() + logger.info("πŸ• Annotation cron scheduler started") + logger.info(f" - Suggestion interval: {SUGGESTION_INTERVAL}s") + logger.info(f" - Training interval: {TRAINING_INTERVAL}s") + + last_suggestion_run = datetime.now(timezone.utc) + last_training_run = datetime.now(timezone.utc) + + while True: + try: + now = datetime.now(timezone.utc) + + # Daily: Surface AI suggestions + if (now - last_suggestion_run).total_seconds() >= SUGGESTION_INTERVAL: + logger.info(f"πŸ€– Running suggestion surfacing at {now}") + try: + await surface_error_suggestions() + last_suggestion_run = now + logger.info("βœ… Suggestion surfacing completed") + except Exception as e: + logger.error(f"❌ Suggestion job failed: {e}", exc_info=True) + + # Weekly: Fine-tune model + if (now - last_training_run).total_seconds() >= TRAINING_INTERVAL: + logger.info(f"πŸŽ“ Running model fine-tuning at {now}") + try: + await finetune_hallucination_model() + last_training_run = now + logger.info("βœ… Model fine-tuning completed") + except Exception as e: + logger.error(f"❌ Training job failed: {e}", exc_info=True) + + # Sleep for check interval + await asyncio.sleep(60) # Check every minute + + except KeyboardInterrupt: + logger.info("β›” Scheduler stopped by user") + break + except Exception as e: + logger.error(f"❌ Unexpected error in scheduler loop: {e}", exc_info=True) + # Continue running despite errors + await asyncio.sleep(60) + + +if __name__ == "__main__": + logger.info("πŸš€ Starting annotation cron scheduler...") + try: + asyncio.run(run_scheduler()) + except KeyboardInterrupt: + logger.info("πŸ‘‹ Annotation cron scheduler stopped") + except Exception as e: + logger.error(f"πŸ’₯ Fatal error: {e}", exc_info=True) + exit(1) diff --git a/backends/advanced/src/advanced_omi_backend/database.py 
b/backends/advanced/src/advanced_omi_backend/database.py index ae7650b0..1b214b6d 100644 --- a/backends/advanced/src/advanced_omi_backend/database.py +++ b/backends/advanced/src/advanced_omi_backend/database.py @@ -14,7 +14,7 @@ # MongoDB Configuration MONGODB_URI = os.getenv("MONGODB_URI", "mongodb://mongo:27017") -MONGODB_DATABASE = os.getenv("MONGODB_DATABASE", "friend-lite") +MONGODB_DATABASE = os.getenv("MONGODB_DATABASE", "chronicle") mongo_client = AsyncIOMotorClient( MONGODB_URI, diff --git a/backends/advanced/src/advanced_omi_backend/main.py b/backends/advanced/src/advanced_omi_backend/main.py index df51e1cc..ee60696f 100644 --- a/backends/advanced/src/advanced_omi_backend/main.py +++ b/backends/advanced/src/advanced_omi_backend/main.py @@ -2,7 +2,7 @@ """ Unified Omi-audio service - * Accepts Opus packets over a WebSocket (`/ws`) or PCM over a WebSocket (`/ws_pcm`). + * Accepts audio over a unified WebSocket endpoint (`/ws`) with codec parameter (pcm or opus). * Uses a central queue to decouple audio ingestion from processing. * A saver consumer buffers PCM and writes 30-second WAV chunks to `./data/audio_chunks/`. * A transcription consumer sends each chunk to a Wyoming ASR service. 
@@ -16,6 +16,7 @@ """ import logging + import uvicorn from advanced_omi_backend.app_factory import create_app diff --git a/backends/advanced/src/advanced_omi_backend/middleware/app_middleware.py b/backends/advanced/src/advanced_omi_backend/middleware/app_middleware.py index eafeffec..069d5239 100644 --- a/backends/advanced/src/advanced_omi_backend/middleware/app_middleware.py +++ b/backends/advanced/src/advanced_omi_backend/middleware/app_middleware.py @@ -56,12 +56,11 @@ class RequestLoggingMiddleware(BaseHTTPMiddleware): "/auth/jwt/logout", "/auth/cookie/logout", "/ws", - "/ws_omi", - "/ws_pcm", "/mcp", "/health", "/auth/health", "/readiness", + "/api/queue/dashboard", # Auto-refresh endpoint, too noisy } # Binary content types to exclude diff --git a/backends/advanced/src/advanced_omi_backend/model_registry.py b/backends/advanced/src/advanced_omi_backend/model_registry.py index 18f464ae..382674da 100644 --- a/backends/advanced/src/advanced_omi_backend/model_registry.py +++ b/backends/advanced/src/advanced_omi_backend/model_registry.py @@ -4,12 +4,11 @@ definitions (LLM, embeddings, etc.) in a provider-agnostic way. Now using Pydantic for robust validation and type safety. +Environment variable resolution is handled by OmegaConf in the config module. """ from __future__ import annotations -import os -import re import yaml from pathlib import Path from typing import Any, Dict, List, Optional @@ -17,74 +16,9 @@ import logging from pydantic import BaseModel, Field, field_validator, model_validator, ConfigDict, ValidationError -def _resolve_env(value: Any) -> Any: - """Resolve ``${VAR:-default}`` patterns inside a single value. - - This helper is intentionally minimal: it only operates on strings and leaves - all other types unchanged. Patterns of the form ``${VAR}`` or - ``${VAR:-default}`` are expanded using ``os.getenv``: - - - If the environment variable **VAR** is set, its value is used. - - Otherwise the optional ``default`` is used (or ``\"\"`` if omitted). 
- - Examples: - >>> os.environ.get("OLLAMA_MODEL") - >>> _resolve_env("${OLLAMA_MODEL:-llama3.1:latest}") - 'llama3.1:latest' - - >>> os.environ["OLLAMA_MODEL"] = "llama3.2:latest" - >>> _resolve_env("${OLLAMA_MODEL:-llama3.1:latest}") - 'llama3.2:latest' - - >>> _resolve_env("Bearer ${OPENAI_API_KEY:-}") - 'Bearer ' # when OPENAI_API_KEY is not set - - Note: - Use :func:`_deep_resolve_env` to apply this logic to an entire - nested config structure (dicts/lists) loaded from YAML. - """ - if not isinstance(value, str): - return value - - pattern = re.compile(r"\$\{([^}:]+)(?::-(.*?))?\}") - - def repl(match: re.Match[str]) -> str: - var, default = match.group(1), match.group(2) - return os.getenv(var, default or "") - - return pattern.sub(repl, value) - - -def _deep_resolve_env(data: Any) -> Any: - """Recursively resolve environment variables in nested structures. - - This walks arbitrary Python structures produced by ``yaml.safe_load`` and - applies :func:`_resolve_env` to every string it finds. Dictionaries and - lists are traversed deeply; scalars are passed through unchanged. - - Examples: - >>> os.environ["OPENAI_MODEL"] = "gpt-4o-mini" - >>> cfg = { - ... "models": [ - ... {"model_name": "${OPENAI_MODEL:-gpt-4o-mini}"}, - ... {"model_url": "${OPENAI_BASE_URL:-https://api.openai.com/v1}"} - ... ] - ... } - >>> resolved = _deep_resolve_env(cfg) - >>> resolved["models"][0]["model_name"] - 'gpt-4o-mini' - >>> resolved["models"][1]["model_url"] - 'https://api.openai.com/v1' - - This is what :func:`load_models_config` uses immediately after loading - ``config.yml`` so that all ``${VAR:-default}`` placeholders are resolved - before Pydantic validation and model registry construction. 
- """ - if isinstance(data, dict): - return {k: _deep_resolve_env(v) for k, v in data.items()} - if isinstance(data, list): - return [_deep_resolve_env(v) for v in data] - return _resolve_env(data) +# Import config merging for defaults.yml + config.yml integration +# OmegaConf handles environment variable resolution (${VAR:-default} syntax) +from advanced_omi_backend.config import get_config class ModelDef(BaseModel): @@ -250,73 +184,47 @@ def list_model_types(self) -> List[str]: def _find_config_path() -> Path: - """Find config.yml in expected locations. - - Search order: - 1. CONFIG_FILE environment variable - 2. Current working directory - 3. /app/config.yml (Docker container) - 4. Walk up from module directory - - Returns: - Path to config.yml (may not exist) """ - # ENV override - cfg_env = os.getenv("CONFIG_FILE") - if cfg_env and Path(cfg_env).exists(): - return Path(cfg_env) - - # Common locations (container vs repo root) - candidates = [Path("config.yml"), Path("/app/config.yml")] + Find config.yml using canonical path from config module. - # Also walk up from current file's parents defensively - try: - for parent in Path(__file__).resolve().parents: - c = parent / "config.yml" - if c.exists(): - return c - except Exception: - pass + DEPRECATED: Use advanced_omi_backend.config.get_config_yml_path() directly. + Kept for backward compatibility. - for c in candidates: - if c.exists(): - return c - - # Last resort: return /app/config.yml path (may not exist yet) - return Path("/app/config.yml") + Returns: + Path to config.yml + """ + from advanced_omi_backend.config import get_config_yml_path + return get_config_yml_path() def load_models_config(force_reload: bool = False) -> Optional[AppModels]: - """Load model configuration from config.yml. - - This function loads and parses the config.yml file, resolves environment - variables, validates model definitions using Pydantic, and caches the result. 
- + """Load model configuration from merged defaults.yml + config.yml. + + This function loads defaults.yml and config.yml, merges them with user overrides, + validates model definitions using Pydantic, and caches the result. + Environment variables are resolved by OmegaConf during config loading. + Args: force_reload: If True, reload from disk even if already cached - + Returns: AppModels instance with validated configuration, or None if config not found - + Raises: ValidationError: If config.yml has invalid model definitions - yaml.YAMLError: If config.yml has invalid YAML syntax """ global _REGISTRY if _REGISTRY is not None and not force_reload: return _REGISTRY - cfg_path = _find_config_path() - if not cfg_path.exists(): + # Get merged configuration (defaults + user config) + # OmegaConf resolves environment variables automatically + try: + raw = get_config(force_reload=force_reload) + except Exception as e: + logging.error(f"Failed to load merged configuration: {e}") return None - # Load and parse YAML - with cfg_path.open("r") as f: - raw = yaml.safe_load(f) or {} - - # Resolve environment variables - raw = _deep_resolve_env(raw) - # Extract sections defaults = raw.get("defaults", {}) or {} model_list = raw.get("models", []) or [] diff --git a/backends/advanced/src/advanced_omi_backend/models/annotation.py b/backends/advanced/src/advanced_omi_backend/models/annotation.py new file mode 100644 index 00000000..b2a986a5 --- /dev/null +++ b/backends/advanced/src/advanced_omi_backend/models/annotation.py @@ -0,0 +1,175 @@ +""" +Unified annotation system for Chronicle. + +Supports annotations for memories, transcripts, and future content types. +Enables both user edits and AI-powered suggestions. 
+""" + +from enum import Enum +from typing import Optional +from datetime import datetime, timezone +import uuid + +from beanie import Document, Indexed +from pydantic import BaseModel, Field + + +class AnnotationType(str, Enum): + """Type of content being annotated.""" + MEMORY = "memory" + TRANSCRIPT = "transcript" + DIARIZATION = "diarization" # Speaker identification corrections + + +class AnnotationSource(str, Enum): + """Origin of the annotation.""" + USER = "user" # User-created edit + MODEL_SUGGESTION = "model_suggestion" # AI-generated suggestion + + +class AnnotationStatus(str, Enum): + """Lifecycle status of annotation.""" + PENDING = "pending" # Waiting for user review (suggestions) + ACCEPTED = "accepted" # Applied to content + REJECTED = "rejected" # User dismissed suggestion + + +class Annotation(Document): + """ + Unified annotation model for all content types. + + Supports both user edits and AI-powered suggestions across + memories, transcripts, and future content types (chat, action items, etc.). + + Design: Polymorphic model with type-specific fields based on annotation_type. 
+ """ + + # Identity + id: str = Field(default_factory=lambda: str(uuid.uuid4())) + + # Classification + annotation_type: AnnotationType + user_id: Indexed(str) + source: AnnotationSource = Field(default=AnnotationSource.USER) + status: AnnotationStatus = Field(default=AnnotationStatus.ACCEPTED) + + # Content + original_text: str = "" # Text before correction (not used for diarization) + corrected_text: str = "" # Text after correction (not used for diarization) + + # Polymorphic References (based on annotation_type) + # For MEMORY annotations: + memory_id: Optional[str] = None + + # For TRANSCRIPT annotations: + conversation_id: Optional[str] = None + segment_index: Optional[int] = None + + # For DIARIZATION annotations: + original_speaker: Optional[str] = None # Speaker label before correction + corrected_speaker: Optional[str] = None # Speaker label after correction + segment_start_time: Optional[float] = None # Time offset for reference + + # Processed tracking (applies to ALL annotation types) + processed: bool = Field(default=False) # Whether annotation has been applied/sent to training + processed_at: Optional[datetime] = None # When annotation was processed + processed_by: Optional[str] = None # What processed it (manual, cron, apply, training, etc.) 
+ + # Timestamps (Python 3.12+ compatible) + created_at: datetime = Field( + default_factory=lambda: datetime.now(timezone.utc) + ) + updated_at: datetime = Field( + default_factory=lambda: datetime.now(timezone.utc) + ) + + class Settings: + name = "annotations" + # Create indexes on commonly queried fields + # Note: Enum fields and Optional fields don't use Indexed() wrapper + indexes = [ + "annotation_type", # Query by type (memory vs transcript vs diarization) + "user_id", # User-scoped queries + "status", # Filter by status (pending/accepted/rejected) + "memory_id", # Lookup annotations for specific memory + "conversation_id", # Lookup annotations for specific conversation + "processed", # Query unprocessed annotations + ] + + def is_memory_annotation(self) -> bool: + """Check if this is a memory annotation.""" + return self.annotation_type == AnnotationType.MEMORY + + def is_transcript_annotation(self) -> bool: + """Check if this is a transcript annotation.""" + return self.annotation_type == AnnotationType.TRANSCRIPT + + def is_diarization_annotation(self) -> bool: + """Check if this is a diarization annotation.""" + return self.annotation_type == AnnotationType.DIARIZATION + + def is_pending_suggestion(self) -> bool: + """Check if this is a pending AI suggestion.""" + return ( + self.source == AnnotationSource.MODEL_SUGGESTION + and self.status == AnnotationStatus.PENDING + ) + + +# Pydantic Request/Response Models + + +class AnnotationCreateBase(BaseModel): + """Base model for annotation creation.""" + original_text: str = "" # Optional for diarization + corrected_text: str = "" # Optional for diarization + status: AnnotationStatus = AnnotationStatus.ACCEPTED + + +class MemoryAnnotationCreate(AnnotationCreateBase): + """Create memory annotation request.""" + memory_id: str + original_text: str # Required for memory annotations + corrected_text: str # Required for memory annotations + + +class TranscriptAnnotationCreate(AnnotationCreateBase): + """Create 
transcript annotation request.""" + conversation_id: str + segment_index: int + original_text: str # Required for transcript annotations + corrected_text: str # Required for transcript annotations + + +class DiarizationAnnotationCreate(BaseModel): + """Create diarization annotation request.""" + conversation_id: str + segment_index: int + original_speaker: str + corrected_speaker: str + segment_start_time: Optional[float] = None + status: AnnotationStatus = AnnotationStatus.ACCEPTED + + +class AnnotationResponse(BaseModel): + """Annotation response for API.""" + id: str + annotation_type: AnnotationType + user_id: str + memory_id: Optional[str] = None + conversation_id: Optional[str] = None + segment_index: Optional[int] = None + original_text: str = "" + corrected_text: str = "" + original_speaker: Optional[str] = None + corrected_speaker: Optional[str] = None + segment_start_time: Optional[float] = None + processed: bool = False + processed_at: Optional[datetime] = None + processed_by: Optional[str] = None + status: AnnotationStatus + source: AnnotationSource + created_at: datetime + + class Config: + from_attributes = True # Pydantic v2 compatibility diff --git a/backends/advanced/src/advanced_omi_backend/models/audio_chunk.py b/backends/advanced/src/advanced_omi_backend/models/audio_chunk.py new file mode 100644 index 00000000..cea20ef7 --- /dev/null +++ b/backends/advanced/src/advanced_omi_backend/models/audio_chunk.py @@ -0,0 +1,158 @@ +""" +Audio chunk models for MongoDB-based audio storage. + +This module contains the AudioChunkDocument model for storing Opus-compressed +audio chunks in MongoDB. Each chunk represents a 10-second segment of audio +from a conversation. +""" + +from datetime import datetime +from typing import Optional +from pydantic import ConfigDict, Field, field_serializer +from beanie import Document, Indexed +from bson import Binary + + +class AudioChunkDocument(Document): + """ + MongoDB document representing a 10-second audio chunk. 
+ + Audio chunks are stored in Opus-compressed format for ~94% storage reduction + compared to raw PCM. Chunks are sequentially numbered and can be reconstructed + into complete WAV files for playback or batch processing. + + Storage Format: + - Encoding: Opus (24kbps VBR, optimized for speech) + - Chunk Duration: 10 seconds (configurable) + - Original Format: 16kHz, 16-bit, mono PCM + - Compression Ratio: ~0.047 (94% reduction) + + Indexes: + - (conversation_id, chunk_index): Primary query pattern for reconstruction + - conversation_id: Conversation lookup and counting + - created_at: Maintenance and cleanup operations + """ + + # Pydantic v2 configuration + model_config = ConfigDict(arbitrary_types_allowed=True) + + # Primary identifiers + conversation_id: Indexed(str) = Field( + description="Parent conversation ID (UUID format)" + ) + chunk_index: int = Field( + description="Sequential chunk number (0-based)", + ge=0 + ) + + # Audio data + audio_data: bytes = Field( + description="Opus-encoded audio bytes (stored as BSON Binary in MongoDB)" + ) + + # Size tracking + original_size: int = Field( + description="Original PCM size in bytes (before compression)", + gt=0 + ) + compressed_size: int = Field( + description="Opus-encoded size in bytes (after compression)", + gt=0 + ) + + # Time boundaries + start_time: float = Field( + description="Start time in seconds from conversation start", + ge=0.0 + ) + end_time: float = Field( + description="End time in seconds from conversation start", + gt=0.0 + ) + duration: float = Field( + description="Chunk duration in seconds (typically 10.0)", + gt=0.0 + ) + + # Audio format + sample_rate: int = Field( + default=16000, + description="Original PCM sample rate (Hz)" + ) + channels: int = Field( + default=1, + description="Number of audio channels (1=mono, 2=stereo)" + ) + + # Optional analysis + has_speech: Optional[bool] = Field( + default=None, + description="Voice Activity Detection result (if available)" + ) + + # 
Metadata + created_at: datetime = Field( + default_factory=datetime.utcnow, + description="Chunk creation timestamp" + ) + + # Soft delete fields + deleted: bool = Field( + default=False, + description="Whether this chunk was soft-deleted" + ) + deleted_at: Optional[datetime] = Field( + default=None, + description="When the chunk was marked as deleted" + ) + + @field_serializer('audio_data') + def serialize_audio_data(self, v: bytes) -> Binary: + """ + Convert bytes to BSON Binary for MongoDB storage. + + MongoDB returns BSON Binary as plain bytes during deserialization, + but expects Binary type for serialization to ensure proper binary data handling. + """ + if isinstance(v, bytes): + return Binary(v) + return v + + class Settings: + """Beanie document settings.""" + name = "audio_chunks" + + indexes = [ + # Primary query: Retrieve chunks in order for a conversation + [("conversation_id", 1), ("chunk_index", 1)], + + # Conversation lookup and counting + "conversation_id", + + # Maintenance queries (cleanup, monitoring) + "created_at", + + # Soft delete filtering + "deleted" + ] + + @property + def compression_ratio(self) -> float: + """Calculate compression ratio (compressed/original).""" + if self.original_size == 0: + return 0.0 + return self.compressed_size / self.original_size + + @property + def storage_savings_percent(self) -> float: + """Calculate storage savings as percentage.""" + return (1 - self.compression_ratio) * 100 + + def __repr__(self) -> str: + """Human-readable representation.""" + return ( + f"AudioChunk(conversation={self.conversation_id[:8]}..., " + f"index={self.chunk_index}, " + f"duration={self.duration:.1f}s, " + f"compression={self.compression_ratio:.3f})" + ) diff --git a/backends/advanced/src/advanced_omi_backend/models/audio_file.py b/backends/advanced/src/advanced_omi_backend/models/audio_file.py deleted file mode 100644 index e1e2c09a..00000000 --- a/backends/advanced/src/advanced_omi_backend/models/audio_file.py +++ /dev/null @@ 
-1,67 +0,0 @@ -""" -AudioFile models for Chronicle backend. - -This module contains the Beanie Document model for audio_chunks collection, -which stores ALL audio files (both with and without speech). This is the -storage layer - all audio gets stored here with its metadata. - -Note: Named AudioFile (not AudioChunk) to avoid confusion with wyoming.audio.AudioChunk -which is the in-memory streaming audio data structure. -""" - -from datetime import datetime -from typing import Dict, List, Optional, Any -from pydantic import BaseModel, Field - -from beanie import Document, Indexed - - -class AudioFile(Document): - """ - Audio file model representing persisted audio files in MongoDB. - - The audio_chunks collection stores ALL raw audio files (both with and without speech). - This is just for audio file storage and metadata. If speech is detected, a - Conversation document is created which contains transcripts and memories. - - This is different from wyoming.audio.AudioChunk which is for streaming audio data. 
- """ - - # Core identifiers - audio_uuid: Indexed(str, unique=True) = Field(description="Unique audio identifier") - source: Indexed(str) = Field( - default="upload", - description="Source of the audio (upload, gdrive, etc.)" - ) - audio_path: str = Field(description="Path to raw audio file") - client_id: Indexed(str) = Field(description="Client device identifier") - timestamp: Indexed(int) = Field(description="Unix timestamp in milliseconds") - - # User information - user_id: Indexed(str) = Field(description="User who owns this audio") - user_email: Optional[str] = Field(None, description="User email") - - # Audio processing - cropped_audio_path: Optional[str] = Field(None, description="Path to cropped audio (speech only)") - - # Speech-driven conversation linking - conversation_id: Optional[str] = Field( - None, - description="Link to Conversation if speech was detected" - ) - has_speech: bool = Field(default=False, description="Whether speech was detected") - speech_analysis: Dict[str, Any] = Field( - default_factory=dict, - description="Speech detection results" - ) - - - - class Settings: - name = "audio_chunks" - indexes = [ - "audio_uuid", - "client_id", - "user_id", - "timestamp", - ] \ No newline at end of file diff --git a/backends/advanced/src/advanced_omi_backend/models/conversation.py b/backends/advanced/src/advanced_omi_backend/models/conversation.py index 01dd5d96..e4446f0f 100644 --- a/backends/advanced/src/advanced_omi_backend/models/conversation.py +++ b/backends/advanced/src/advanced_omi_backend/models/conversation.py @@ -7,24 +7,18 @@ from datetime import datetime from typing import Dict, List, Optional, Any, Union -from pydantic import BaseModel, Field, model_validator, computed_field +from pydantic import BaseModel, Field, model_validator, computed_field, field_validator from enum import Enum import uuid from beanie import Document, Indexed +from pymongo import IndexModel class Conversation(Document): """Complete conversation model with 
versioned processing.""" - # Nested Enums - class TranscriptProvider(str, Enum): - """Supported transcription providers.""" - DEEPGRAM = "deepgram" - MISTRAL = "mistral" - PARAKEET = "parakeet" - SPEECH_DETECTION = "speech_detection" # Legacy value - UNKNOWN = "unknown" # Fallback value + # Nested Enums - Note: TranscriptProvider accepts any string value for flexibility class MemoryProvider(str, Enum): """Supported memory providers.""" @@ -49,6 +43,13 @@ class EndReason(str, Enum): UNKNOWN = "unknown" # Unknown or legacy reason # Nested Models + class Word(BaseModel): + """Individual word with timestamp in a transcript.""" + word: str = Field(description="Word text") + start: float = Field(description="Start time in seconds") + end: float = Field(description="End time in seconds") + confidence: Optional[float] = Field(None, description="Confidence score (0-1)") + class SpeakerSegment(BaseModel): """Individual speaker segment in a transcript.""" start: float = Field(description="Start time in seconds") @@ -56,14 +57,22 @@ class SpeakerSegment(BaseModel): text: str = Field(description="Transcript text for this segment") speaker: str = Field(description="Speaker identifier") confidence: Optional[float] = Field(None, description="Confidence score (0-1)") + words: List["Conversation.Word"] = Field(default_factory=list, description="Word-level timestamps for this segment") class TranscriptVersion(BaseModel): """Version of a transcript with processing metadata.""" version_id: str = Field(description="Unique version identifier") transcript: Optional[str] = Field(None, description="Full transcript text") - segments: List["Conversation.SpeakerSegment"] = Field(default_factory=list, description="Speaker segments") - provider: Optional["Conversation.TranscriptProvider"] = Field(None, description="Transcription provider used") - model: Optional[str] = Field(None, description="Model used (e.g., nova-3, voxtral-mini-2507)") + words: List["Conversation.Word"] = Field( + 
default_factory=list, + description="Word-level timestamps for entire transcript" + ) + segments: List["Conversation.SpeakerSegment"] = Field( + default_factory=list, + description="Speaker segments (filled by speaker recognition)" + ) + provider: Optional[str] = Field(None, description="Transcription provider used (deepgram, parakeet, etc.)") + model: Optional[str] = Field(None, description="Model used (e.g., nova-3, parakeet)") created_at: datetime = Field(description="When this version was created") processing_time_seconds: Optional[float] = Field(None, description="Time taken to process") metadata: Dict[str, Any] = Field(default_factory=dict, description="Additional provider-specific metadata") @@ -81,13 +90,32 @@ class MemoryVersion(BaseModel): # Core identifiers conversation_id: Indexed(str, unique=True) = Field(default_factory=lambda: str(uuid.uuid4()), description="Unique conversation identifier") - audio_uuid: Indexed(str) = Field(description="Session/audio identifier (for tracking audio files)") user_id: Indexed(str) = Field(description="User who owns this conversation") client_id: Indexed(str) = Field(description="Client device identifier") - # Audio file reference - audio_path: Optional[str] = Field(None, description="Path to audio file (relative to CHUNK_DIR)") - cropped_audio_path: Optional[str] = Field(None, description="Path to cropped audio file (relative to CHUNK_DIR)") + # External file tracking (for deduplication of imported files) + external_source_id: Optional[str] = Field( + None, + description="External file identifier (e.g., Google Drive file_id) for deduplication" + ) + external_source_type: Optional[str] = Field( + None, + description="Type of external source (gdrive, dropbox, s3, etc.)" + ) + + # MongoDB chunk-based audio storage (new system) + audio_chunks_count: Optional[int] = Field( + None, + description="Total number of 10-second audio chunks stored in MongoDB" + ) + audio_total_duration: Optional[float] = Field( + None, + 
description="Total audio duration in seconds (sum of all chunks)" + ) + audio_compression_ratio: Optional[float] = Field( + None, + description="Compression ratio (compressed_size / original_size), typically ~0.047 for Opus" + ) # Creation metadata created_at: Indexed(datetime) = Field(default_factory=datetime.utcnow, description="When the conversation was created") @@ -97,6 +125,16 @@ class MemoryVersion(BaseModel): deletion_reason: Optional[str] = Field(None, description="Reason for deletion (no_meaningful_speech, audio_file_not_ready, etc.)") deleted_at: Optional[datetime] = Field(None, description="When the conversation was marked as deleted") + # Always persist audio flag and processing status + processing_status: Optional[str] = Field( + None, + description="Processing status: pending_transcription, transcription_failed, completed" + ) + always_persist: bool = Field( + default=False, + description="Flag indicating conversation was created for audio persistence" + ) + # Conversation completion tracking end_reason: Optional["Conversation.EndReason"] = Field(None, description="Reason why the conversation ended") completed_at: Optional[datetime] = Field(None, description="When the conversation was completed/closed") @@ -228,12 +266,35 @@ def memory_version_count(self) -> int: """Get count of memory versions.""" return len(self.memory_versions) + @computed_field + @property + def active_transcript_version_number(self) -> Optional[int]: + """Get 1-based version number of the active transcript version.""" + if not self.active_transcript_version: + return None + for i, version in enumerate(self.transcript_versions): + if version.version_id == self.active_transcript_version: + return i + 1 + return None + + @computed_field + @property + def active_memory_version_number(self) -> Optional[int]: + """Get 1-based version number of the active memory version.""" + if not self.active_memory_version: + return None + for i, version in enumerate(self.memory_versions): + if 
version.version_id == self.active_memory_version: + return i + 1 + return None + def add_transcript_version( self, version_id: str, transcript: str, - segments: List["Conversation.SpeakerSegment"], - provider: "Conversation.TranscriptProvider", + words: Optional[List["Conversation.Word"]] = None, + segments: Optional[List["Conversation.SpeakerSegment"]] = None, + provider: str = None, # Provider name from config.yml (deepgram, parakeet, etc.) model: Optional[str] = None, processing_time_seconds: Optional[float] = None, metadata: Optional[Dict[str, Any]] = None, @@ -243,7 +304,8 @@ def add_transcript_version( new_version = Conversation.TranscriptVersion( version_id=version_id, transcript=transcript, - segments=segments, + words=words or [], + segments=segments or [], provider=provider, model=model, created_at=datetime.now(), @@ -310,13 +372,13 @@ class Settings: "conversation_id", "user_id", "created_at", - [("user_id", 1), ("created_at", -1)] # Compound index for user queries + [("user_id", 1), ("created_at", -1)], # Compound index for user queries + IndexModel([("external_source_id", 1)], sparse=True) # Sparse index for deduplication ] # Factory function for creating conversations def create_conversation( - audio_uuid: str, user_id: str, client_id: str, conversation_id: Optional[str] = None, @@ -324,12 +386,13 @@ def create_conversation( summary: Optional[str] = None, transcript: Optional[str] = None, segments: Optional[List["Conversation.SpeakerSegment"]] = None, + external_source_id: Optional[str] = None, + external_source_type: Optional[str] = None, ) -> Conversation: """ Factory function to create a new conversation. 
Args: - audio_uuid: Unique identifier for the audio session user_id: User who owns this conversation client_id: Client device identifier conversation_id: Optional unique conversation identifier (auto-generated if not provided) @@ -337,26 +400,25 @@ def create_conversation( summary: Optional conversation summary transcript: Optional transcript text segments: Optional speaker segments + external_source_id: Optional external file ID for deduplication (e.g., Google Drive file_id) + external_source_type: Optional external source type (gdrive, dropbox, etc.) Returns: Conversation instance """ # Build the conversation data conv_data = { - "audio_uuid": audio_uuid, "user_id": user_id, "client_id": client_id, "created_at": datetime.now(), "title": title, "summary": summary, - "transcript": transcript or "", - "segments": segments or [], "transcript_versions": [], "active_transcript_version": None, "memory_versions": [], "active_memory_version": None, - "memories": [], - "memory_count": 0 + "external_source_id": external_source_id, + "external_source_type": external_source_type, } # Only set conversation_id if provided, otherwise let the model auto-generate it diff --git a/backends/advanced/src/advanced_omi_backend/models/job.py b/backends/advanced/src/advanced_omi_backend/models/job.py index b295782c..5d906865 100644 --- a/backends/advanced/src/advanced_omi_backend/models/job.py +++ b/backends/advanced/src/advanced_omi_backend/models/job.py @@ -35,15 +35,16 @@ async def _ensure_beanie_initialized(): from motor.motor_asyncio import AsyncIOMotorClient from beanie import init_beanie from advanced_omi_backend.models.conversation import Conversation - from advanced_omi_backend.models.audio_file import AudioFile - from advanced_omi_backend.models.user import User + from advanced_omi_backend.models.audio_chunk import AudioChunkDocument + from advanced_omi_backend.models.user import User + from advanced_omi_backend.models.waveform import WaveformData from pymongo.errors import 
ConfigurationError - + # Get MongoDB URI from environment mongodb_uri = os.getenv("MONGODB_URI", "mongodb://localhost:27017") # Create MongoDB client - mongodb_database = os.getenv("MONGODB_DATABASE", "friend-lite") + mongodb_database = os.getenv("MONGODB_DATABASE", "chronicle") client = AsyncIOMotorClient(mongodb_uri) try: database = client.get_default_database(mongodb_database) @@ -54,7 +55,7 @@ async def _ensure_beanie_initialized(): # Initialize Beanie await init_beanie( database=database, - document_models=[User, Conversation, AudioFile], + document_models=[User, Conversation, AudioChunkDocument, WaveformData], ) _beanie_initialized = True diff --git a/backends/advanced/src/advanced_omi_backend/models/user.py b/backends/advanced/src/advanced_omi_backend/models/user.py index b0ced195..7291f9bb 100644 --- a/backends/advanced/src/advanced_omi_backend/models/user.py +++ b/backends/advanced/src/advanced_omi_backend/models/user.py @@ -16,6 +16,7 @@ class UserCreate(BaseUserCreate): """Schema for creating new users.""" display_name: Optional[str] = None + notification_email: Optional[EmailStr] = None is_superuser: Optional[bool] = False @@ -23,6 +24,7 @@ class UserRead(BaseUser[PydanticObjectId]): """Schema for reading user data.""" display_name: Optional[str] = None + notification_email: Optional[EmailStr] = None registered_clients: dict[str, dict] = Field(default_factory=dict) primary_speakers: list[dict] = Field(default_factory=list) @@ -31,6 +33,7 @@ class UserUpdate(BaseUserUpdate): """Schema for updating user data.""" display_name: Optional[str] = None + notification_email: Optional[EmailStr] = None is_superuser: Optional[bool] = None def create_update_dict(self): @@ -38,6 +41,8 @@ def create_update_dict(self): update_dict = super().create_update_dict() if self.display_name is not None: update_dict["display_name"] = self.display_name + if self.notification_email is not None: + update_dict["notification_email"] = self.notification_email return update_dict def 
create_update_dict_superuser(self): @@ -45,6 +50,8 @@ def create_update_dict_superuser(self): update_dict = super().create_update_dict_superuser() if self.display_name is not None: update_dict["display_name"] = self.display_name + if self.notification_email is not None: + update_dict["notification_email"] = self.notification_email return update_dict @@ -58,6 +65,7 @@ class User(BeanieBaseUser, Document): ) display_name: Optional[str] = None + notification_email: Optional[EmailStr] = None # Client tracking for audio devices registered_clients: dict[str, dict] = Field(default_factory=dict) # Speaker processing filter configuration diff --git a/backends/advanced/src/advanced_omi_backend/models/waveform.py b/backends/advanced/src/advanced_omi_backend/models/waveform.py new file mode 100644 index 00000000..caf6fd49 --- /dev/null +++ b/backends/advanced/src/advanced_omi_backend/models/waveform.py @@ -0,0 +1,47 @@ +""" +Waveform visualization data model for conversations. + +This module provides the WaveformData model for storing pre-computed +waveform visualization data, enabling UI to display audio waveforms +without real-time decoding. 
+""" + +from datetime import datetime +from typing import List, Optional + +from beanie import Document, Indexed +from pydantic import Field + + +class WaveformData(Document): + """Pre-computed waveform visualization for conversations.""" + + # Link to parent conversation + conversation_id: Indexed(str) = Field( + description="Parent conversation ID (unique per conversation)" + ) + + # Waveform amplitude data + samples: List[float] = Field( + description="Amplitude samples normalized to [-1.0, 1.0] range" + ) + sample_rate: int = Field( + description="Samples per second (e.g., 10 = 1 sample per 100ms)" + ) + + # Metadata + duration_seconds: float = Field(description="Total audio duration in seconds") + created_at: datetime = Field( + default_factory=datetime.utcnow, + description="When this waveform was generated" + ) + processing_time_seconds: Optional[float] = Field( + None, + description="Time taken to generate waveform" + ) + + class Settings: + name = "waveforms" + indexes = [ + "conversation_id", # Unique lookup by conversation + ] diff --git a/backends/advanced/src/advanced_omi_backend/plugins/__init__.py b/backends/advanced/src/advanced_omi_backend/plugins/__init__.py new file mode 100644 index 00000000..3ccea7dc --- /dev/null +++ b/backends/advanced/src/advanced_omi_backend/plugins/__init__.py @@ -0,0 +1,18 @@ +""" +Chronicle plugin system for multi-level pipeline extension. 
+ +Plugins can hook into different stages of the processing pipeline: +- transcript: When new transcript segment arrives +- conversation: When conversation processing completes +- memory: After memory extraction finishes + +Trigger types control when plugins execute: +- wake_word: Only when transcript starts with specified wake word +- always: Execute on every invocation at access level +- conditional: Execute based on custom condition (future) +""" + +from .base import BasePlugin, PluginContext, PluginResult +from .router import PluginRouter + +__all__ = ['BasePlugin', 'PluginContext', 'PluginResult', 'PluginRouter'] diff --git a/backends/advanced/src/advanced_omi_backend/plugins/base.py b/backends/advanced/src/advanced_omi_backend/plugins/base.py new file mode 100644 index 00000000..dbd13301 --- /dev/null +++ b/backends/advanced/src/advanced_omi_backend/plugins/base.py @@ -0,0 +1,145 @@ +""" +Base plugin classes for Chronicle multi-level plugin architecture. + +Provides: +- PluginContext: Context passed to plugin execution +- PluginResult: Result from plugin execution +- BasePlugin: Abstract base class for all plugins +""" +from abc import ABC, abstractmethod +from typing import Optional, Dict, Any, List +from dataclasses import dataclass, field + + +@dataclass +class PluginContext: + """Context passed to plugin execution""" + user_id: str + event: str # Event name (e.g., "transcript.streaming", "conversation.complete") + data: Dict[str, Any] # Event-specific data + metadata: Dict[str, Any] = field(default_factory=dict) + + +@dataclass +class PluginResult: + """Result from plugin execution""" + success: bool + data: Optional[Dict[str, Any]] = None + message: Optional[str] = None + should_continue: bool = True # Whether to continue normal processing + + +class BasePlugin(ABC): + """ + Base class for all Chronicle plugins. 
+ + Plugins can hook into different stages of the processing pipeline: + - transcript: When new transcript segment arrives + - conversation: When conversation processing completes + - memory: When memory extraction finishes + + Subclasses should: + 1. Set SUPPORTED_ACCESS_LEVELS to list which levels they support + 2. Implement initialize() for plugin initialization + 3. Implement the appropriate callback methods (on_transcript, on_conversation_complete, on_memory_processed) + 4. Optionally implement cleanup() for resource cleanup + """ + + # Subclasses declare which access levels they support + SUPPORTED_ACCESS_LEVELS: List[str] = [] + + def __init__(self, config: Dict[str, Any]): + """ + Initialize plugin with configuration. + + Args: + config: Plugin configuration from config/plugins.yml + Contains: enabled, events, condition, and plugin-specific config + """ + import logging + logger = logging.getLogger(__name__) + + self.config = config + self.enabled = config.get('enabled', False) + + # NEW terminology with backward compatibility + self.events = config.get('events') or config.get('subscriptions', []) + self.condition = config.get('condition') or config.get('trigger', {'type': 'always'}) + + # Deprecation warnings + plugin_name = config.get('name', 'unknown') + if 'subscriptions' in config: + logger.warning(f"Plugin '{plugin_name}': 'subscriptions' is deprecated, use 'events' instead") + if 'trigger' in config: + logger.warning(f"Plugin '{plugin_name}': 'condition' is deprecated, use 'condition' instead") + if 'access_level' in config: + logger.warning(f"Plugin '{plugin_name}': 'access_level' is deprecated and ignored") + + @abstractmethod + async def initialize(self): + """ + Initialize plugin resources (connect to services, etc.) + + Called during application startup after plugin registration. + Raise an exception if initialization fails. + """ + pass + + async def cleanup(self): + """ + Clean up plugin resources. + + Called during application shutdown. 
+ Override if your plugin needs cleanup (closing connections, etc.) + """ + pass + + # Access-level specific methods (implement only what you need) + + async def on_transcript(self, context: PluginContext) -> Optional[PluginResult]: + """ + Called when new transcript segment arrives. + + Context data contains: + - transcript: str - The transcript text + - segment_id: str - Unique segment identifier + - conversation_id: str - Current conversation ID + + For wake_word conditions, router adds: + - command: str - Command with wake word stripped + - original_transcript: str - Full transcript + + Returns: + PluginResult with success status, optional message, and should_continue flag + """ + pass + + async def on_conversation_complete(self, context: PluginContext) -> Optional[PluginResult]: + """ + Called when conversation processing completes. + + Context data contains: + - conversation: dict - Full conversation data + - transcript: str - Complete transcript + - duration: float - Conversation duration + - conversation_id: str - Conversation identifier + + Returns: + PluginResult with success status, optional message, and should_continue flag + """ + pass + + async def on_memory_processed(self, context: PluginContext) -> Optional[PluginResult]: + """ + Called after memory extraction finishes. 
+ + Context data contains: + - memories: list - Extracted memories + - conversation: dict - Source conversation + - memory_count: int - Number of memories created + - conversation_id: str - Conversation identifier + + Returns: + PluginResult with success status, optional message, and should_continue flag + """ + pass diff --git a/backends/advanced/src/advanced_omi_backend/plugins/email_summarizer/README.md b/backends/advanced/src/advanced_omi_backend/plugins/email_summarizer/README.md new file mode 100644 index 00000000..f1a21a52 --- /dev/null +++ b/backends/advanced/src/advanced_omi_backend/plugins/email_summarizer/README.md @@ -0,0 +1,276 @@ +# Email Summarizer Plugin + +Automatically sends email summaries when conversations complete. + +## Features + +- πŸ“§ **Automatic Email Delivery**: Sends emails when conversations finish +- πŸ€– **LLM-Powered Summaries**: Uses your configured LLM to generate intelligent summaries +- 🎨 **Beautiful HTML Emails**: Professional-looking emails with proper formatting +- πŸ“± **Plain Text Fallback**: Ensures compatibility with all email clients +- ⚑ **Async Processing**: Non-blocking email sending +- πŸ”’ **Secure SMTP**: TLS/SSL encryption support + +## How It Works + +1. User completes a conversation (via OMI device or file upload) +2. Plugin receives `conversation.complete` event +3. Retrieves user email from database +4. Generates LLM summary (2-3 sentences) +5. Formats beautiful HTML and plain text emails +6. Sends email via configured SMTP server + +## Configuration Architecture + +Chronicle uses a clean three-file separation for plugin configuration: + +1. **`backends/advanced/.env`** - Secrets only (SMTP credentials, API keys) + - Gitignored for security + - Never commit to version control + +2. **`plugins/email_summarizer/config.yml`** - Plugin-specific settings + - Email content options (subject prefix, max sentences, etc.) 
+ - References environment variables using `${VAR_NAME}` syntax + - Defaults work for most users - typically no editing needed + +3. **`config/plugins.yml`** - Orchestration only + - `enabled` flag + - Event subscriptions + - Trigger conditions + +This separation keeps secrets secure and configuration organized. See [`plugin-configuration.md`](../../../Docs/plugin-configuration.md) for details. + +## Configuration + +### Step 1: Get SMTP Credentials + +#### For Gmail (Recommended for Testing): + +1. **Enable 2-Factor Authentication** on your Google account +2. Go to Google Account β†’ Security β†’ 2-Step Verification +3. Scroll down to **App passwords** +4. Generate an app password for "Mail" +5. Copy the 16-character password (no spaces) + +#### For Other Providers: + +- **Outlook/Hotmail**: smtp.office365.com:587 +- **Yahoo**: smtp.mail.yahoo.com:587 +- **Custom SMTP**: Use your provider's settings + +### Step 2: Configure Environment Variables + +Add to `backends/advanced/.env`: + +```bash +# Email Summarizer Plugin +SMTP_HOST=smtp.gmail.com +SMTP_PORT=587 +SMTP_USERNAME=your-email@gmail.com +SMTP_PASSWORD=your-app-password-here # Gmail App Password (16 chars, no spaces) +SMTP_USE_TLS=true +FROM_EMAIL=noreply@chronicle.ai +FROM_NAME=Chronicle AI +``` + +### Step 3: Enable Plugin + +Add to `config/plugins.yml` (orchestration only): + +```yaml +plugins: + email_summarizer: + enabled: true + events: + - conversation.complete + condition: + type: always +``` + +**That's it!** Plugin-specific settings are already configured in: +- **`plugins/email_summarizer/config.yml`** - Email content options (subject prefix, max sentences, etc.) +- **SMTP credentials** are automatically read from `.env` via environment variable references + +You typically don't need to edit `config.yml` - the defaults work for most users. If you want to customize email content settings, see the Configuration Options section below. 
+ +### Step 4: Restart Backend + +```bash +cd backends/advanced +docker compose restart +``` + +## Configuration Options + +All configuration options below are in **`plugins/email_summarizer/config.yml`** and have sensible defaults. You typically don't need to modify these unless you want to customize email content. + +| Option | Type | Default | Description | +|--------|------|---------|-------------| +| `smtp_host` | string | `smtp.gmail.com` | SMTP server hostname | +| `smtp_port` | integer | `587` | SMTP server port (587 for TLS, 465 for SSL) | +| `smtp_username` | string | **Required** | SMTP authentication username | +| `smtp_password` | string | **Required** | SMTP authentication password | +| `smtp_use_tls` | boolean | `true` | Use STARTTLS encryption | +| `from_email` | string | **Required** | Sender email address | +| `from_name` | string | `Chronicle AI` | Sender display name | +| `subject_prefix` | string | `Conversation Summary` | Email subject prefix | +| `summary_max_sentences` | integer | `3` | Maximum sentences in LLM summary | +| `include_conversation_id` | boolean | `true` | Show conversation ID in email | +| `include_duration` | boolean | `true` | Show conversation duration | + +## Email Template + +### Subject Line +``` +Conversation Summary - Jan 15, 2025 at 10:30 AM +``` + +### Email Body +``` +πŸ“‹ SUMMARY +[LLM-generated 2-3 sentence summary of key points] + +πŸ“ FULL TRANSCRIPT +[Complete conversation transcript] + +πŸ“Š METADATA +Duration: 5m 30s +Conversation ID: 507f1f77bc... +``` + +## Testing + +### Test SMTP Connection + +```bash +cd backends/advanced +uv run python -m advanced_omi_backend.services.email_service +``` + +This will: +- Test SMTP connectivity +- Send a test email to your SMTP username +- Verify configuration + +### Test Plugin Integration + +1. Start the backend with plugin enabled +2. Upload a test audio file or use OMI device +3. Wait for conversation to complete +4. 
Check your email inbox + +## Troubleshooting + +### "Authentication failed" + +**For Gmail:** +- Make sure you're using an **App Password**, not your regular password +- Enable 2-Factor Authentication first +- App password should be 16 characters (xxxx xxxx xxxx xxxx) + +**For other providers:** +- Verify username and password are correct +- Check if "less secure apps" needs to be enabled + +### "Connection timeout" + +- Check `smtp_host` and `smtp_port` are correct +- Verify firewall allows outbound SMTP connections +- Try port 465 with SSL instead of 587 with TLS + +### "No email received" + +- Check user has email configured in database +- Look for plugin logs: `docker compose logs -f chronicle-backend | grep EmailSummarizer` +- Verify plugin is enabled in `plugins.yml` +- Check spam/junk folder + +### "Empty summary" or "LLM error" + +- Verify LLM service is configured and running +- Check LLM API keys are valid +- Plugin will fall back to truncated transcript if LLM fails + +## πŸ”’ Security Best Practices + +### NEVER Commit Secrets to Version Control + +Always use environment variable references in configuration files: + +```yaml +# plugins/email_summarizer/config.yml +smtp_password: ${SMTP_PASSWORD} # Reference to environment variable +``` + +```bash +# backends/advanced/.env (gitignored) +SMTP_PASSWORD=xnetcqctkkfgzllh # Actual secret stored safely +``` + +### How Configuration Works + +The plugin system automatically: +- βœ… Loads settings from `plugins/email_summarizer/config.yml` +- βœ… Expands `${ENV_VAR}` references from `backends/advanced/.env` +- βœ… Merges orchestration settings (enabled, events) from `config/plugins.yml` +- βœ… Prevents accidental secret commits (only .env has secrets, and it's gitignored) + +**Always use the setup wizard** instead of manual configuration: +```bash +uv run python backends/advanced/src/advanced_omi_backend/plugins/email_summarizer/setup.py +``` + +### Additional Security Tips + +1. 
**Never commit SMTP passwords** to git (use .env only) +2. **Use environment variable references** (`${SMTP_PASSWORD}`) in YAML files +3. **Enable TLS/SSL** for encrypted SMTP connections +4. **Gmail App Passwords** are safer than account passwords +5. **Rotate credentials** periodically +6. **Review commits** before pushing to ensure no hardcoded secrets + +## Development + +### File Structure + +``` +plugins/email_summarizer/ +β”œβ”€β”€ __init__.py # Plugin exports +β”œβ”€β”€ plugin.py # Main plugin logic +β”œβ”€β”€ templates.py # Email HTML/text templates +└── README.md # This file +``` + +### Key Methods + +- `on_conversation_complete()` - Main event handler +- `_get_user_email()` - Fetch user email from database +- `_generate_summary()` - Generate LLM summary with fallback +- `_format_subject()` - Format email subject line + +### Dependencies + +- `advanced_omi_backend.database` - MongoDB access +- `advanced_omi_backend.llm_client` - LLM generation +- `advanced_omi_backend.services.email_service` - SMTP email sending + +## Future Enhancements + +- [ ] Email templates customization +- [ ] User preference for email frequency +- [ ] Unsubscribe link +- [ ] Email digests (daily/weekly summaries) +- [ ] Rich formatting for action items +- [ ] Attachment support (audio files) +- [ ] Multiple recipient support +- [ ] Email open tracking + +## Support + +- **Issues**: [GitHub Issues](https://github.com/chronicle-ai/chronicle/issues) +- **Discussions**: [GitHub Discussions](https://github.com/chronicle-ai/chronicle/discussions) +- **Documentation**: [Chronicle Docs](https://github.com/chronicle-ai/chronicle) + +## License + +MIT License - see project LICENSE file for details. 
diff --git a/backends/advanced/src/advanced_omi_backend/plugins/email_summarizer/__init__.py b/backends/advanced/src/advanced_omi_backend/plugins/email_summarizer/__init__.py new file mode 100644 index 00000000..525acd51 --- /dev/null +++ b/backends/advanced/src/advanced_omi_backend/plugins/email_summarizer/__init__.py @@ -0,0 +1,9 @@ +""" +Email Summarizer Plugin for Chronicle. + +Automatically sends email summaries when conversations complete. +""" + +from .plugin import EmailSummarizerPlugin + +__all__ = ['EmailSummarizerPlugin'] diff --git a/backends/advanced/src/advanced_omi_backend/plugins/email_summarizer/config.yml b/backends/advanced/src/advanced_omi_backend/plugins/email_summarizer/config.yml new file mode 100644 index 00000000..9f4ed8f6 --- /dev/null +++ b/backends/advanced/src/advanced_omi_backend/plugins/email_summarizer/config.yml @@ -0,0 +1,23 @@ +# Email Summarizer Plugin Configuration +# +# This file contains non-secret configuration for the email summarizer plugin. +# Secrets (SMTP credentials) are stored in backends/advanced/.env +# Plugin orchestration (enabled, events) is in config/plugins.yml + +# Email content settings +subject_prefix: "Conversation Summary" +summary_max_sentences: 3 +include_conversation_id: true +include_duration: true + +# SMTP Configuration (reads from .env) +# These use environment variable references ${VAR_NAME} +smtp_host: ${SMTP_HOST} +smtp_port: ${SMTP_PORT:-587} +smtp_username: ${SMTP_USERNAME} +smtp_password: ${SMTP_PASSWORD} +smtp_use_tls: ${SMTP_USE_TLS:-true} + +# Email sender configuration +from_email: ${FROM_EMAIL} +from_name: ${FROM_NAME:-Chronicle AI} diff --git a/backends/advanced/src/advanced_omi_backend/plugins/email_summarizer/email_service.py b/backends/advanced/src/advanced_omi_backend/plugins/email_summarizer/email_service.py new file mode 100644 index 00000000..b51de0b5 --- /dev/null +++ b/backends/advanced/src/advanced_omi_backend/plugins/email_summarizer/email_service.py @@ -0,0 +1,237 @@ +""" 
+SMTP Email Service for Chronicle. + +Provides email sending functionality via SMTP protocol with support for: +- HTML and plain text emails +- TLS/SSL encryption +- Gmail and other SMTP providers +- Async implementation +""" +import asyncio +import logging +import smtplib +from email.mime.multipart import MIMEMultipart +from email.mime.text import MIMEText +from typing import Any, Dict, Optional + +from advanced_omi_backend.utils.logging_utils import mask_dict + +logger = logging.getLogger(__name__) + + +class SMTPEmailService: + """SMTP email service for sending emails via SMTP protocol.""" + + def __init__(self, config: Dict[str, Any]): + """ + Initialize SMTP email service with configuration. + + Args: + config: SMTP configuration containing: + - smtp_host: SMTP server hostname + - smtp_port: SMTP server port (default: 587) + - smtp_username: SMTP username + - smtp_password: SMTP password + - smtp_use_tls: Whether to use TLS (default: True) + - from_email: Sender email address + - from_name: Sender display name (default: 'Chronicle AI') + """ + self.host = config.get('smtp_host') + self.port = config.get('smtp_port', 587) + self.username = config.get('smtp_username') + self.password = config.get('smtp_password') + self.use_tls = config.get('smtp_use_tls', True) + self.from_email = config.get('from_email') + self.from_name = config.get('from_name', 'Chronicle AI') + + # Validate required configuration + if not all([self.host, self.username, self.password, self.from_email]): + raise ValueError( + "SMTP configuration incomplete. 
Required: smtp_host, smtp_username, " + "smtp_password, from_email" + ) + + # Log configuration with masked secrets + masked_config = mask_dict(config) + logger.info( + f"SMTP Email Service initialized: {self.username}@{self.host}:{self.port} " + f"(TLS: {self.use_tls})" + ) + logger.debug(f"SMTP config: {masked_config}") + + async def send_email( + self, + to_email: str, + subject: str, + body_text: str, + body_html: Optional[str] = None + ) -> bool: + """ + Send email via SMTP with HTML/text support. + + Args: + to_email: Recipient email address + subject: Email subject line + body_text: Plain text email body + body_html: Optional HTML email body + + Returns: + True if email sent successfully, False otherwise + """ + try: + # Create message container + msg = MIMEMultipart('alternative') + msg['Subject'] = subject + msg['From'] = f"{self.from_name} <{self.from_email}>" + msg['To'] = to_email + + # Attach plain text version + text_part = MIMEText(body_text, 'plain') + msg.attach(text_part) + + # Attach HTML version if provided + if body_html: + html_part = MIMEText(body_html, 'html') + msg.attach(html_part) + + # Send email asynchronously (run in thread pool to avoid blocking) + await asyncio.to_thread(self._send_smtp, msg, to_email) + + logger.info(f"βœ… Email sent successfully to {to_email}: {subject}") + return True + + except Exception as e: + logger.error(f"Failed to send email to {to_email}: {e}", exc_info=True) + return False + + def _send_smtp(self, msg: MIMEMultipart, to_email: str) -> None: + """ + Internal method to send email via SMTP (blocking). 
+ + Args: + msg: MIME message to send + to_email: Recipient email address + + Raises: + Exception: If SMTP sending fails + """ + # Connect to SMTP server + if self.use_tls: + # Use STARTTLS (most common for port 587) + smtp_server = smtplib.SMTP(self.host, self.port, timeout=30) + smtp_server.ehlo() + smtp_server.starttls() + smtp_server.ehlo() + else: + # Direct connection (for port 465 SSL or no encryption) + smtp_server = smtplib.SMTP(self.host, self.port, timeout=30) + + try: + # Login and send + smtp_server.login(self.username, self.password) + smtp_server.send_message(msg) + logger.debug(f"SMTP send completed for {to_email}") + finally: + smtp_server.quit() + + async def test_connection(self) -> bool: + """ + Test SMTP connectivity and authentication. + + Returns: + True if connection successful, False otherwise + """ + try: + await asyncio.to_thread(self._test_smtp_connection) + logger.info(f"βœ… SMTP connection test successful: {self.username}@{self.host}") + return True + except Exception as e: + logger.error(f"SMTP connection test failed: {e}", exc_info=True) + return False + + def _test_smtp_connection(self) -> None: + """ + Internal method to test SMTP connection (blocking). + + Raises: + Exception: If connection fails + """ + try: + if self.use_tls: + smtp_server = smtplib.SMTP(self.host, self.port, timeout=10) + smtp_server.ehlo() + smtp_server.starttls() + smtp_server.ehlo() + else: + smtp_server = smtplib.SMTP(self.host, self.port, timeout=10) + + try: + smtp_server.login(self.username, self.password) + logger.debug("SMTP authentication successful") + finally: + smtp_server.quit() + except smtplib.SMTPAuthenticationError as e: + # Note: Error message from smtplib should not contain password, but be cautious + raise Exception(f"SMTP Authentication failed for {self.username}. Check credentials. For Gmail, use an App Password instead of your regular password. 
Error: {str(e)}") + except smtplib.SMTPConnectError as e: + raise Exception(f"Failed to connect to SMTP server {self.host}:{self.port}. Check host and port. Error: {str(e)}") + except smtplib.SMTPServerDisconnected as e: + raise Exception(f"SMTP server disconnected unexpectedly. Check TLS settings (port 587 needs TLS, port 465 needs SSL). Error: {str(e)}") + except TimeoutError as e: + raise Exception(f"Connection to {self.host}:{self.port} timed out. Check firewall/network settings. Error: {str(e)}") + except Exception as e: + raise Exception(f"SMTP connection test failed: {type(e).__name__}: {str(e)}") + + +# Test script for development/debugging +async def main(): + """Test the SMTP email service.""" + import os + + from dotenv import load_dotenv + + load_dotenv() + + config = { + 'smtp_host': os.getenv('SMTP_HOST', 'smtp.gmail.com'), + 'smtp_port': int(os.getenv('SMTP_PORT', 587)), + 'smtp_username': os.getenv('SMTP_USERNAME'), + 'smtp_password': os.getenv('SMTP_PASSWORD'), + 'smtp_use_tls': os.getenv('SMTP_USE_TLS', 'true').lower() == 'true', + 'from_email': os.getenv('FROM_EMAIL', 'noreply@chronicle.ai'), + 'from_name': os.getenv('FROM_NAME', 'Chronicle AI'), + } + + try: + service = SMTPEmailService(config) + + # Test connection + print("Testing SMTP connection...") + if await service.test_connection(): + print("βœ… Connection test passed") + else: + print("❌ Connection test failed") + return + + # Send test email + test_email = config['smtp_username'] # Send to self + print(f"\nSending test email to {test_email}...") + + success = await service.send_email( + to_email=test_email, + subject="Chronicle Email Service Test", + body_text="This is a test email from Chronicle Email Service.\n\nIf you received this, the email service is working correctly!", + body_html="
<h2>Chronicle Email Service Test</h2><p>This is a test email from Chronicle Email Service.</p><p>If you received this, the email service is working correctly!</p>
" + ) + + if success: + print("βœ… Test email sent successfully") + else: + print("❌ Failed to send test email") + + except Exception as e: + print(f"❌ Error: {e}") + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/backends/advanced/src/advanced_omi_backend/plugins/email_summarizer/plugin.py b/backends/advanced/src/advanced_omi_backend/plugins/email_summarizer/plugin.py new file mode 100644 index 00000000..a61a915d --- /dev/null +++ b/backends/advanced/src/advanced_omi_backend/plugins/email_summarizer/plugin.py @@ -0,0 +1,399 @@ +""" +Email Summarizer Plugin for Chronicle. + +Automatically sends email summaries when conversations complete. +""" +import logging +from datetime import datetime +from typing import Any, Dict, List, Optional + +from advanced_omi_backend.database import get_database +from advanced_omi_backend.llm_client import async_generate +from advanced_omi_backend.utils.logging_utils import mask_dict + +from ..base import BasePlugin, PluginContext, PluginResult +from .email_service import SMTPEmailService +from .templates import format_html_email, format_text_email + +logger = logging.getLogger(__name__) + + +class EmailSummarizerPlugin(BasePlugin): + """ + Plugin for sending email summaries when conversations complete. + + Subscribes to conversation.complete events and: + 1. Retrieves user email from database + 2. Generates LLM summary of the conversation + 3. Formats HTML and plain text emails + 4. 
Sends email via SMTP + + Configuration (config/plugins.yml): + enabled: true + events: + - conversation.complete + condition: + type: always + smtp_host: smtp.gmail.com + smtp_port: 587 + smtp_username: ${SMTP_USERNAME} + smtp_password: ${SMTP_PASSWORD} + smtp_use_tls: true + from_email: noreply@chronicle.ai + from_name: Chronicle AI + subject_prefix: "Conversation Summary" + summary_max_sentences: 3 + """ + + SUPPORTED_ACCESS_LEVELS: List[str] = ['conversation'] + + name = "Email Summarizer" + description = "Sends email summaries when conversations complete" + + def __init__(self, config: Dict[str, Any]): + """ + Initialize Email Summarizer plugin. + + Args: + config: Plugin configuration from config/plugins.yml + """ + super().__init__(config) + + self.subject_prefix = config.get('subject_prefix', 'Conversation Summary') + self.summary_max_sentences = config.get('summary_max_sentences', 3) + self.include_conversation_id = config.get('include_conversation_id', True) + self.include_duration = config.get('include_duration', True) + + # Email service will be initialized in initialize() + self.email_service: Optional[SMTPEmailService] = None + + # MongoDB database handle + self.db = None + + async def initialize(self): + """ + Initialize plugin resources. + + Sets up SMTP email service and MongoDB connection. 
+ + Raises: + ValueError: If SMTP configuration is incomplete + Exception: If email service initialization fails + """ + if not self.enabled: + logger.info("Email Summarizer plugin is disabled, skipping initialization") + return + + logger.info("Initializing Email Summarizer plugin...") + + # Initialize SMTP email service + try: + smtp_config = { + 'smtp_host': self.config.get('smtp_host'), + 'smtp_port': self.config.get('smtp_port', 587), + 'smtp_username': self.config.get('smtp_username'), + 'smtp_password': self.config.get('smtp_password'), + 'smtp_use_tls': self.config.get('smtp_use_tls', True), + 'from_email': self.config.get('from_email'), + 'from_name': self.config.get('from_name', 'Chronicle AI'), + } + + self.email_service = SMTPEmailService(smtp_config) + + # Test SMTP connection + logger.info("Testing SMTP connectivity...") + if await self.email_service.test_connection(): + logger.info("βœ… SMTP connection test successful") + else: + raise Exception("SMTP connection test failed") + + except Exception as e: + logger.error(f"Failed to initialize email service: {e}") + raise + + # Get MongoDB database handle + self.db = get_database() + logger.info("βœ… Email Summarizer plugin initialized successfully") + + async def cleanup(self): + """Clean up plugin resources.""" + logger.info("Email Summarizer plugin cleanup complete") + + async def on_conversation_complete(self, context: PluginContext) -> Optional[PluginResult]: + """ + Send email summary when conversation completes. 
+ + Args: + context: Plugin context with conversation data + - conversation: dict - Full conversation data + - transcript: str - Complete transcript + - duration: float - Conversation duration + - conversation_id: str - Conversation identifier + + Returns: + PluginResult with success status and message + """ + try: + logger.info(f"Processing conversation complete event for user: {context.user_id}") + + # Extract conversation data + conversation = context.data.get('conversation', {}) + transcript = context.data.get('transcript', '') + duration = context.data.get('duration', 0) + conversation_id = context.data.get('conversation_id', 'unknown') + created_at = conversation.get('created_at') + + # Validate transcript exists + if not transcript or transcript.strip() == '': + logger.warning(f"Empty transcript for conversation {conversation_id}, skipping email") + return PluginResult( + success=False, + message="Skipped: Empty transcript" + ) + + # Get user email from database + user_email = await self._get_user_email(context.user_id) + if not user_email: + logger.warning(f"No email found for user {context.user_id}, cannot send summary") + return PluginResult( + success=False, + message=f"No email configured for user {context.user_id}" + ) + + # Generate LLM summary + summary = await self._generate_summary(transcript) + + # Format email subject and body + subject = self._format_subject(created_at) + body_html = format_html_email( + summary=summary, + transcript=transcript, + conversation_id=conversation_id, + duration=duration, + created_at=created_at + ) + body_text = format_text_email( + summary=summary, + transcript=transcript, + conversation_id=conversation_id, + duration=duration, + created_at=created_at + ) + + # Send email + success = await self.email_service.send_email( + to_email=user_email, + subject=subject, + body_text=body_text, + body_html=body_html + ) + + if success: + logger.info(f"βœ… Email summary sent to {user_email} for conversation {conversation_id}") 
+ return PluginResult( + success=True, + message=f"Email sent to {user_email}", + data={'recipient': user_email, 'conversation_id': conversation_id} + ) + else: + logger.error(f"Failed to send email to {user_email}") + return PluginResult( + success=False, + message=f"Failed to send email to {user_email}" + ) + + except Exception as e: + logger.error(f"Error in email summarizer plugin: {e}", exc_info=True) + return PluginResult( + success=False, + message=f"Error: {str(e)}" + ) + + async def _get_user_email(self, user_id: str) -> Optional[str]: + """ + Get notification email from user. + + Args: + user_id: User identifier (MongoDB ObjectId) + + Returns: + User's notification_email, or None if not set + """ + try: + from bson import ObjectId + + # Query users collection + user = await self.db['users'].find_one({'_id': ObjectId(user_id)}) + + if not user: + logger.warning(f"User {user_id} not found") + return None + + notification_email = user.get('notification_email') + + if not notification_email: + logger.warning(f"User {user_id} has no notification_email set") + return None + + logger.debug(f"Sending notification to {notification_email} for user {user_id}") + return notification_email + + except Exception as e: + logger.error(f"Error fetching user email: {e}", exc_info=True) + return None + + async def _generate_summary(self, transcript: str) -> str: + """ + Generate LLM summary of the conversation. + + Args: + transcript: Full conversation transcript + + Returns: + Generated summary (2-3 sentences) + """ + try: + prompt = ( + f"Summarize this conversation in {self.summary_max_sentences} sentences or less. " + f"Focus on key points, main topics discussed, and any action items or decisions. 
" + f"Be concise and clear.\n\n" + f"Conversation:\n{transcript}" + ) + + logger.debug("Generating LLM summary...") + summary = await async_generate(prompt) + + if not summary or summary.strip() == '': + raise ValueError("LLM returned empty summary") + + logger.info("βœ… LLM summary generated successfully") + return summary.strip() + + except Exception as e: + logger.error(f"Failed to generate LLM summary: {e}", exc_info=True) + # Fallback: return first 300 characters of transcript + logger.warning("Using fallback: truncated transcript") + return transcript[:300] + "..." if len(transcript) > 300 else transcript + + def _format_subject(self, created_at: Optional[datetime] = None) -> str: + """ + Format email subject line. + + Args: + created_at: Conversation creation timestamp + + Returns: + Formatted subject line + """ + if created_at: + date_str = created_at.strftime("%b %d, %Y at %I:%M %p") + return f"{self.subject_prefix} - {date_str}" + else: + return self.subject_prefix + + @staticmethod + async def test_connection(config: Dict[str, Any]) -> Dict[str, Any]: + """ + Test SMTP connection with provided configuration. + + This static method tests the SMTP connection without fully initializing the plugin. + Used by the form-based configuration UI to validate settings before saving. + + Args: + config: Configuration dictionary with SMTP settings + + Returns: + Dict with success status, message, and optional details + + Example: + >>> result = await EmailSummarizerPlugin.test_connection({ + ... 'smtp_host': 'smtp.gmail.com', + ... 'smtp_port': 587, + ... 'smtp_username': 'user@gmail.com', + ... 'smtp_password': 'password', + ... 'smtp_use_tls': True, + ... 'from_email': 'noreply@example.com', + ... 'from_name': 'Test' + ... 
}) + >>> result['success'] + True + """ + import time + + try: + # Validate required config fields + required_fields = ['smtp_host', 'smtp_username', 'smtp_password', 'from_email'] + missing_fields = [field for field in required_fields if not config.get(field)] + + if missing_fields: + return { + "success": False, + "message": f"Missing required fields: {', '.join(missing_fields)}", + "status": "error" + } + + # Build SMTP config + smtp_config = { + 'smtp_host': config.get('smtp_host'), + 'smtp_port': config.get('smtp_port', 587), + 'smtp_username': config.get('smtp_username'), + 'smtp_password': config.get('smtp_password'), + 'smtp_use_tls': config.get('smtp_use_tls', True), + 'from_email': config.get('from_email'), + 'from_name': config.get('from_name', 'Chronicle AI'), + } + + # Log config with masked secrets for debugging + logger.debug(f"SMTP config for testing: {mask_dict(smtp_config)}") + + # Create temporary email service instance + email_service = SMTPEmailService(smtp_config) + + # Test connection + logger.info(f"Testing SMTP connection to {smtp_config['smtp_host']}...") + start_time = time.time() + + connection_success = await email_service.test_connection() + connection_time_ms = int((time.time() - start_time) * 1000) + + if connection_success: + return { + "success": True, + "message": f"Successfully connected to SMTP server at {smtp_config['smtp_host']}", + "status": "success", + "details": { + "smtp_host": smtp_config['smtp_host'], + "smtp_port": smtp_config['smtp_port'], + "connection_time_ms": connection_time_ms, + "use_tls": smtp_config['smtp_use_tls'] + } + } + else: + return { + "success": False, + "message": "SMTP connection test failed", + "status": "error" + } + + except Exception as e: + logger.error(f"SMTP connection test failed: {e}", exc_info=True) + error_msg = str(e) + + # Provide helpful hints based on error type + hints = [] + if "Authentication" in error_msg or "535" in error_msg: + hints.append("For Gmail: Enable 2FA and create an 
App Password at https://myaccount.google.com/apppasswords") + hints.append("Verify your username and password are correct") + elif "Connection" in error_msg or "timeout" in error_msg.lower(): + hints.append("Check your SMTP host and port settings") + hints.append("Verify firewall/network allows outbound SMTP connections") + elif "TLS" in error_msg or "SSL" in error_msg: + hints.append("For port 587: Enable TLS") + hints.append("For port 465: Disable TLS (uses implicit SSL)") + + return { + "success": False, + "message": f"Connection test failed: {error_msg}", + "status": "error", + "hints": hints + } diff --git a/backends/advanced/src/advanced_omi_backend/plugins/email_summarizer/setup.py b/backends/advanced/src/advanced_omi_backend/plugins/email_summarizer/setup.py new file mode 100755 index 00000000..728ae607 --- /dev/null +++ b/backends/advanced/src/advanced_omi_backend/plugins/email_summarizer/setup.py @@ -0,0 +1,200 @@ +#!/usr/bin/env python3 +""" +Email Summarizer Plugin Setup Wizard + +Configures SMTP credentials and plugin settings. +Follows Chronicle's clean configuration architecture: +- Secrets β†’ backends/advanced/.env +- Non-secret settings β†’ plugins/email_summarizer/config.yml +- Orchestration β†’ config/plugins.yml +""" + +import shutil +import sys +from datetime import datetime +from pathlib import Path + +import yaml +from dotenv import set_key +from rich.console import Console +from rich.prompt import Confirm + +# Add repo root to path for setup_utils import +project_root = Path(__file__).resolve().parents[6] +sys.path.insert(0, str(project_root)) + +from setup_utils import ( + prompt_with_existing_masked, + prompt_value +) + +console = Console() + + +def update_plugins_yml_orchestration(): + """ + Update config/plugins.yml with orchestration settings only. + Plugin-specific settings are in plugins/email_summarizer/config.yml. + This follows Chronicle's three-file configuration architecture. 
+ """ + plugins_yml_path = project_root / "config" / "plugins.yml" + + # Load existing or create from template + if plugins_yml_path.exists(): + with open(plugins_yml_path, 'r') as f: + config = yaml.safe_load(f) or {} + else: + # Copy from template + template_path = project_root / "config" / "plugins.yml.template" + if template_path.exists(): + with open(template_path, 'r') as f: + config = yaml.safe_load(f) or {} + else: + config = {'plugins': {}} + + # Ensure structure exists + if 'plugins' not in config: + config['plugins'] = {} + + # Only orchestration settings in config/plugins.yml + # Plugin-specific settings are in plugins/email_summarizer/config.yml + plugin_config = { + 'enabled': False, # Let user enable manually or prompt + 'events': ['conversation.complete'], + 'condition': {'type': 'always'} + } + + # Update or create plugin entry + config['plugins']['email_summarizer'] = plugin_config + + # Backup existing file + if plugins_yml_path.exists(): + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + backup_path = plugins_yml_path.parent / f"plugins.yml.backup.{timestamp}" + shutil.copy(plugins_yml_path, backup_path) + console.print(f"[dim]Backed up existing plugins.yml to {backup_path.name}[/dim]") + + # Write updated config + plugins_yml_path.parent.mkdir(parents=True, exist_ok=True) + with open(plugins_yml_path, 'w') as f: + yaml.dump(config, f, default_flow_style=False, sort_keys=False) + + console.print("[green]βœ… Updated config/plugins.yml (orchestration only)[/green]") + + return plugins_yml_path + + +def main(): + """Interactive setup for Email Summarizer plugin""" + console.print("\nπŸ“§ [bold cyan]Email Summarizer Plugin Setup[/bold cyan]") + console.print("This plugin sends email summaries when conversations complete.\n") + + # Path to main backend .env file + env_path = str(project_root / "backends" / "advanced" / ".env") + + # SMTP Configuration + console.print("[bold]SMTP Configuration[/bold]") + console.print("[dim]For Gmail: Use App 
Password (Settings > Security > 2FA > App Passwords)[/dim]\n") + + smtp_host = prompt_with_existing_masked( + prompt_text="SMTP Host", + env_file_path=env_path, + env_key="SMTP_HOST", + placeholders=['your-smtp-host-here'], + is_password=False, + default="smtp.gmail.com" + ) + + smtp_port = prompt_value("SMTP Port", default="587") + + smtp_username = prompt_with_existing_masked( + prompt_text="SMTP Username (your email)", + env_file_path=env_path, + env_key="SMTP_USERNAME", + placeholders=['your-email@example.com'], + is_password=False + ) + + smtp_password = prompt_with_existing_masked( + prompt_text="SMTP Password (App Password)", + env_file_path=env_path, + env_key="SMTP_PASSWORD", + placeholders=['your-password-here', 'your-app-password-here'], + is_password=True # Shows masked existing value + ) + + # Remove spaces from app password (Google adds spaces when copying) + smtp_password = smtp_password.replace(" ", "") + + smtp_use_tls = prompt_value("Use TLS? (true/false)", default="true") + + # Email sender configuration + from_email = prompt_with_existing_masked( + prompt_text="From Email", + env_file_path=env_path, + env_key="FROM_EMAIL", + placeholders=['noreply@example.com'], + is_password=False, + default=smtp_username # Default to SMTP username + ) + + from_name = prompt_value("From Name", default="Chronicle AI") + + # Save secrets to .env + console.print("\nπŸ’Ύ [bold]Saving credentials to .env...[/bold]") + + set_key(env_path, "SMTP_HOST", smtp_host) + set_key(env_path, "SMTP_PORT", smtp_port) + set_key(env_path, "SMTP_USERNAME", smtp_username) + set_key(env_path, "SMTP_PASSWORD", smtp_password) + set_key(env_path, "SMTP_USE_TLS", smtp_use_tls) + set_key(env_path, "FROM_EMAIL", from_email) + set_key(env_path, "FROM_NAME", from_name) + + console.print("[green]βœ… SMTP credentials saved to backends/advanced/.env[/green]") + + # Auto-update plugins.yml with orchestration settings only + console.print("\nπŸ“ [bold]Updating plugin configuration...[/bold]") + 
plugins_yml_path = update_plugins_yml_orchestration() + + # Prompt to enable plugin + enable_now = Confirm.ask("\nEnable email_summarizer plugin now?", default=True) + if enable_now: + with open(plugins_yml_path, 'r') as f: + config = yaml.safe_load(f) + config['plugins']['email_summarizer']['enabled'] = True + with open(plugins_yml_path, 'w') as f: + yaml.dump(config, f, default_flow_style=False, sort_keys=False) + console.print("[green]βœ… Plugin enabled in config/plugins.yml[/green]") + + console.print("\n[bold cyan]βœ… Email Summarizer configured successfully![/bold cyan]") + console.print("\n[bold]Configuration saved to:[/bold]") + console.print(" β€’ [green]backends/advanced/.env[/green] - SMTP credentials (secrets)") + console.print(" β€’ [green]config/plugins.yml[/green] - Plugin orchestration (enabled, events)") + console.print(" β€’ [green]plugins/email_summarizer/config.yml[/green] - Plugin settings (already configured)") + console.print() + + if not enable_now: + console.print("[bold]To enable later:[/bold]") + console.print(" Edit config/plugins.yml and set: enabled: true") + console.print() + + console.print("[bold]Restart backend to apply:[/bold]") + console.print(" [dim]cd backends/advanced && docker compose restart[/dim]") + console.print() + console.print("[yellow]⚠️ SECURITY: Never commit secrets to git![/yellow]") + console.print("[yellow] β€’ Secrets go in backends/advanced/.env (gitignored)[/yellow]") + console.print("[yellow] β€’ Config files use ${ENV_VAR} references only[/yellow]") + + +if __name__ == '__main__': + try: + main() + except KeyboardInterrupt: + console.print("\n[yellow]Setup cancelled by user[/yellow]") + sys.exit(1) + except Exception as e: + console.print(f"\n[red]Error during setup: {e}[/red]") + import traceback + traceback.print_exc() + sys.exit(1) diff --git a/backends/advanced/src/advanced_omi_backend/plugins/email_summarizer/templates.py b/backends/advanced/src/advanced_omi_backend/plugins/email_summarizer/templates.py 
new file mode 100644 index 00000000..9f99e5cb --- /dev/null +++ b/backends/advanced/src/advanced_omi_backend/plugins/email_summarizer/templates.py @@ -0,0 +1,258 @@ +""" +Email templates for the Email Summarizer plugin. + +Provides HTML and plain text email templates. +""" +import html +from datetime import datetime +from typing import Optional + + +def format_duration(seconds: float) -> str: + """ + Format duration in seconds to human-readable format. + + Args: + seconds: Duration in seconds + + Returns: + Formatted duration (e.g., "5m 30s", "1h 15m") + """ + hours = int(seconds // 3600) + minutes = int((seconds % 3600) // 60) + secs = int(seconds % 60) + + if hours > 0: + return f"{hours}h {minutes}m" + elif minutes > 0: + return f"{minutes}m {secs}s" + else: + return f"{secs}s" + + +def format_html_email( + summary: str, + transcript: str, + conversation_id: str, + duration: float, + created_at: Optional[datetime] = None +) -> str: + """ + Format HTML email template. + + Args: + summary: LLM-generated summary + transcript: Full conversation transcript + conversation_id: Conversation identifier + duration: Conversation duration in seconds + created_at: Conversation creation timestamp + + Returns: + HTML email body + """ + formatted_duration = format_duration(duration) + date_str = created_at.strftime("%B %d, %Y at %I:%M %p") if created_at else "N/A" + + # Escape HTML to prevent XSS attacks + summary_escaped = html.escape(summary, quote=True) + transcript_escaped = html.escape(transcript, quote=True) + + # Format transcript with line breaks (after escaping) + transcript_html = transcript_escaped.replace('\n', '
<br>')

+    return f"""<!DOCTYPE html>
+<html>
+<head>
+  <meta charset="utf-8">
+</head>
+<body style="font-family: Arial, sans-serif; color: #333333; margin: 0; padding: 0;">
+  <div style="max-width: 600px; margin: 0 auto; padding: 20px;">
+
+    <div style="background-color: #4F46E5; color: #ffffff; padding: 20px; border-radius: 8px 8px 0 0;">
+      <h1 style="margin: 0; font-size: 20px;">πŸŽ™οΈ Conversation Summary</h1>
+      <p style="margin: 4px 0 0; font-size: 14px;">{date_str}</p>
+    </div>
+
+    <div style="border: 1px solid #e5e7eb; border-top: none; padding: 20px; border-radius: 0 0 8px 8px;">
+      <h2 style="font-size: 16px; margin-top: 0;">πŸ“‹ Summary</h2>
+      <p style="font-size: 14px; line-height: 1.6;">{summary_escaped}</p>
+
+      <h2 style="font-size: 16px;">πŸ“ Full Transcript</h2>
+      <p style="font-size: 13px; line-height: 1.6; color: #555555;">{transcript_html}</p>
+    </div>
+
+  </div>
+</body>
+</html>
+ + + + +""" + + +def format_text_email( + summary: str, + transcript: str, + conversation_id: str, + duration: float, + created_at: Optional[datetime] = None +) -> str: + """ + Format plain text email template. + + Args: + summary: LLM-generated summary + transcript: Full conversation transcript + conversation_id: Conversation identifier + duration: Conversation duration in seconds + created_at: Conversation creation timestamp + + Returns: + Plain text email body + """ + formatted_duration = format_duration(duration) + date_str = created_at.strftime("%B %d, %Y at %I:%M %p") if created_at else "N/A" + + return f""" +πŸŽ™οΈ CONVERSATION SUMMARY +{date_str} + +═══════════════════════════════════════════════════════════ + +πŸ“‹ SUMMARY + +{summary} + +─────────────────────────────────────────────────────────── + +πŸ“ FULL TRANSCRIPT + +{transcript} + +═══════════════════════════════════════════════════════════ + +πŸ“Š METADATA + +Duration: {formatted_duration} +Conversation ID: {conversation_id} + +─────────────────────────────────────────────────────────── + +Sent by Chronicle AI +Your personal AI memory system +https://github.com/chronicle-ai/chronicle +""" diff --git a/backends/advanced/src/advanced_omi_backend/plugins/homeassistant/__init__.py b/backends/advanced/src/advanced_omi_backend/plugins/homeassistant/__init__.py new file mode 100644 index 00000000..11b831e9 --- /dev/null +++ b/backends/advanced/src/advanced_omi_backend/plugins/homeassistant/__init__.py @@ -0,0 +1,9 @@ +""" +Home Assistant plugin for Chronicle. + +Allows control of Home Assistant devices via natural language wake word commands. 
+""" + +from .plugin import HomeAssistantPlugin + +__all__ = ['HomeAssistantPlugin'] diff --git a/backends/advanced/src/advanced_omi_backend/plugins/homeassistant/command_parser.py b/backends/advanced/src/advanced_omi_backend/plugins/homeassistant/command_parser.py new file mode 100644 index 00000000..cc73626d --- /dev/null +++ b/backends/advanced/src/advanced_omi_backend/plugins/homeassistant/command_parser.py @@ -0,0 +1,97 @@ +""" +LLM-based command parser for Home Assistant integration. + +This module provides structured command parsing using LLM to extract +intent, target entities/areas, and parameters from natural language. +""" + +from dataclasses import dataclass, field +from typing import Any, Dict, Optional + + +@dataclass +class ParsedCommand: + """Structured representation of a parsed Home Assistant command.""" + + action: str + """Action to perform (e.g., turn_on, turn_off, set_brightness, toggle)""" + + target_type: str + """Type of target (area, entity, all_in_area)""" + + target: str + """Target identifier (area name or entity name)""" + + entity_type: Optional[str] = None + """Entity domain filter (e.g., light, switch, fan) - None means all types""" + + parameters: Dict[str, Any] = field(default_factory=dict) + """Additional parameters (e.g., brightness_pct=50, color='red')""" + + +# LLM System Prompt for Command Parsing +COMMAND_PARSER_SYSTEM_PROMPT = """You are a smart home command parser for Home Assistant. + +Extract structured information from natural language commands. 
+Return ONLY valid JSON in this exact format (no markdown, no code blocks, no explanation): + +{ + "action": "turn_off", + "target_type": "area", + "target": "study", + "entity_type": "light", + "parameters": {} +} + +ACTIONS (choose one): +- turn_on: Turn on entities +- turn_off: Turn off entities +- toggle: Toggle entity state +- set_brightness: Set brightness level +- set_color: Set color + +TARGET_TYPE (choose one): +- area: Targeting all entities of a type in an area (e.g., "study lights") +- all_in_area: Targeting ALL entities in an area (e.g., "everything in study") +- entity: Targeting a specific entity by name (e.g., "desk lamp") + +ENTITY_TYPE (optional, use null if not specified): +- light: Light entities +- switch: Switch entities +- fan: Fan entities +- cover: Covers/blinds +- null: All entity types (when target_type is "all_in_area") + +PARAMETERS (optional, empty dict if none): +- brightness_pct: Brightness percentage (0-100) +- color: Color name (e.g., "red", "blue", "warm white") + +EXAMPLES: + +Command: "turn off study lights" +Response: {"action": "turn_off", "target_type": "area", "target": "study", "entity_type": "light", "parameters": {}} + +Command: "turn off everything in study" +Response: {"action": "turn_off", "target_type": "all_in_area", "target": "study", "entity_type": null, "parameters": {}} + +Command: "turn on desk lamp" +Response: {"action": "turn_on", "target_type": "entity", "target": "desk lamp", "entity_type": null, "parameters": {}} + +Command: "set study lights to 50%" +Response: {"action": "set_brightness", "target_type": "area", "target": "study", "entity_type": "light", "parameters": {"brightness_pct": 50}} + +Command: "turn on living room fan" +Response: {"action": "turn_on", "target_type": "area", "target": "living room", "entity_type": "fan", "parameters": {}} + +Command: "turn off all lights" +Response: {"action": "turn_off", "target_type": "entity", "target": "all", "entity_type": "light", "parameters": {}} + 
+Command: "toggle hallway light" +Response: {"action": "toggle", "target_type": "entity", "target": "hallway light", "entity_type": null, "parameters": {}} + +Remember: +1. Return ONLY the JSON object, no markdown formatting +2. Use lowercase for action, target_type, target, entity_type +3. Use null (not "null" string) for missing entity_type +4. Always include all 5 fields: action, target_type, target, entity_type, parameters +""" diff --git a/backends/advanced/src/advanced_omi_backend/plugins/homeassistant/config.yml b/backends/advanced/src/advanced_omi_backend/plugins/homeassistant/config.yml new file mode 100644 index 00000000..eb477aa5 --- /dev/null +++ b/backends/advanced/src/advanced_omi_backend/plugins/homeassistant/config.yml @@ -0,0 +1,13 @@ +# Home Assistant Plugin Configuration +# +# This file contains non-secret configuration for the Home Assistant plugin. +# Secrets (HA_TOKEN) are stored in backends/advanced/.env +# Plugin orchestration (enabled, events, condition) is in config/plugins.yml + +# Home Assistant server configuration +ha_url: ${HA_URL} +ha_token: ${HA_TOKEN} + +# Command configuration +wake_word: ${HA_WAKE_WORD:-vivi} +timeout: ${HA_TIMEOUT:-30} diff --git a/backends/advanced/src/advanced_omi_backend/plugins/homeassistant/entity_cache.py b/backends/advanced/src/advanced_omi_backend/plugins/homeassistant/entity_cache.py new file mode 100644 index 00000000..e8624f1b --- /dev/null +++ b/backends/advanced/src/advanced_omi_backend/plugins/homeassistant/entity_cache.py @@ -0,0 +1,133 @@ +""" +Entity cache for Home Assistant integration. + +This module provides caching and lookup functionality for Home Assistant areas and entities. 
+""" + +from dataclasses import dataclass, field +from datetime import datetime +from typing import Dict, List, Optional +import logging + +logger = logging.getLogger(__name__) + + +@dataclass +class EntityCache: + """Cache for Home Assistant areas and entities.""" + + areas: List[str] = field(default_factory=list) + """List of area names (e.g., ["study", "living_room"])""" + + area_entities: Dict[str, List[str]] = field(default_factory=dict) + """Map of area names to entity IDs (e.g., {"study": ["light.tubelight_3"]})""" + + entity_details: Dict[str, Dict] = field(default_factory=dict) + """Full entity state data keyed by entity_id""" + + last_refresh: datetime = field(default_factory=datetime.now) + """Timestamp of last cache refresh""" + + def find_entity_by_name(self, name: str) -> Optional[str]: + """ + Find entity ID by fuzzy name matching. + + Matching priority: + 1. Exact friendly_name match (case-insensitive) + 2. Partial friendly_name match (case-insensitive) + 3. Entity ID match (e.g., "tubelight_3" β†’ "light.tubelight_3") + + Args: + name: Entity name to search for + + Returns: + Entity ID if found, None otherwise + """ + name_lower = name.lower().strip() + + # Step 1: Exact friendly_name match + for entity_id, details in self.entity_details.items(): + friendly_name = details.get('attributes', {}).get('friendly_name', '') + if friendly_name.lower() == name_lower: + logger.debug(f"Exact match: {name} β†’ {entity_id} (friendly_name: {friendly_name})") + return entity_id + + # Step 2: Partial friendly_name match + for entity_id, details in self.entity_details.items(): + friendly_name = details.get('attributes', {}).get('friendly_name', '') + if name_lower in friendly_name.lower(): + logger.debug(f"Partial match: {name} β†’ {entity_id} (friendly_name: {friendly_name})") + return entity_id + + # Step 3: Entity ID match (try adding common domains) + common_domains = ['light', 'switch', 'fan', 'cover'] + for domain in common_domains: + candidate_id = 
f"{domain}.{name_lower.replace(' ', '_')}" + if candidate_id in self.entity_details: + logger.debug(f"Entity ID match: {name} β†’ {candidate_id}") + return candidate_id + + logger.warning(f"No entity found matching: {name}") + return None + + def get_entities_in_area( + self, + area: str, + entity_type: Optional[str] = None + ) -> List[str]: + """ + Get all entities in an area, optionally filtered by domain. + + Args: + area: Area name (case-insensitive) + entity_type: Entity domain filter (e.g., "light", "switch") + + Returns: + List of entity IDs in the area + """ + area_lower = area.lower().strip() + + # Find matching area (case-insensitive) + matching_area = None + for area_name in self.areas: + if area_name.lower() == area_lower: + matching_area = area_name + break + + if not matching_area: + logger.warning(f"Area not found: {area}") + return [] + + # Get entities in area + entities = self.area_entities.get(matching_area, []) + + # Filter by entity type if specified + if entity_type: + entity_type_lower = entity_type.lower() + entities = [ + e for e in entities + if e.split('.')[0] == entity_type_lower + ] + + logger.debug( + f"Found {len(entities)} entities in area '{matching_area}'" + + (f" (type: {entity_type})" if entity_type else "") + ) + + return entities + + def get_cache_age_seconds(self) -> float: + """Get cache age in seconds.""" + return (datetime.now() - self.last_refresh).total_seconds() + + def is_stale(self, max_age_seconds: int = 3600) -> bool: + """ + Check if cache is stale. 
+ + Args: + max_age_seconds: Maximum cache age before considering stale (default: 1 hour) + + Returns: + True if cache is older than max_age_seconds + """ + return self.get_cache_age_seconds() > max_age_seconds diff --git a/backends/advanced/src/advanced_omi_backend/plugins/homeassistant/mcp_client.py b/backends/advanced/src/advanced_omi_backend/plugins/homeassistant/mcp_client.py new file mode 100644 index 00000000..42ede8dc --- /dev/null +++ b/backends/advanced/src/advanced_omi_backend/plugins/homeassistant/mcp_client.py @@ -0,0 +1,421 @@ +""" +MCP client for communicating with Home Assistant's MCP Server. + +Home Assistant exposes an MCP server at /api/mcp that provides tools +for controlling smart home devices. +""" + +import json +import logging +from typing import Any, Dict, List, Optional + +import httpx + +logger = logging.getLogger(__name__) + + +class MCPError(Exception): + """MCP protocol error""" + pass + + +class HAMCPClient: + """ + MCP Client for Home Assistant's /api/mcp endpoint. + + Implements the Model Context Protocol for communicating with + Home Assistant's built-in MCP server. + """ + + def __init__(self, base_url: str, token: str, timeout: int = 30): + """ + Initialize the MCP client. + + Args: + base_url: Base URL of Home Assistant (e.g., http://localhost:8123) + token: Long-lived access token for authentication + timeout: Request timeout in seconds + + """ + self.base_url = base_url.rstrip('/') + self.mcp_url = f"{self.base_url}/api/mcp" + self.token = token + self.timeout = timeout + self.client = httpx.AsyncClient(timeout=timeout) + self._request_id = 0 + + async def close(self): + """Close the HTTP client""" + await self.client.aclose() + + def _next_request_id(self) -> int: + """Generate next request ID""" + self._request_id += 1 + return self._request_id + + async def _send_mcp_request(self, method: str, params: Optional[Dict] = None) -> Dict[str, Any]: + """ + Send MCP protocol request to Home Assistant. 
+ + Args: + method: MCP method name (e.g., "tools/list", "tools/call") + params: Optional method parameters + + Returns: + Response data from MCP server + + Raises: + MCPError: If request fails or returns an error + """ + payload = { + "jsonrpc": "2.0", + "id": self._next_request_id(), + "method": method + } + + if params: + payload["params"] = params + + headers = { + "Authorization": f"Bearer {self.token}", + "Content-Type": "application/json" + } + + try: + logger.debug(f"MCP Request: {method} with params: {params}") + response = await self.client.post( + self.mcp_url, + json=payload, + headers=headers + ) + response.raise_for_status() + + data = response.json() + + # Check for JSON-RPC error + if "error" in data: + error = data["error"] + raise MCPError(f"MCP Error {error.get('code')}: {error.get('message')}") + + return data.get("result", {}) + + except httpx.HTTPStatusError as e: + logger.error(f"HTTP error calling MCP endpoint: {e.response.status_code}") + raise MCPError(f"HTTP {e.response.status_code}: {e.response.text}") + except httpx.RequestError as e: + logger.error(f"Request error calling MCP endpoint: {e}") + raise MCPError(f"Request failed: {e}") + except Exception as e: + logger.error(f"Unexpected error calling MCP endpoint: {e}") + raise MCPError(f"Unexpected error: {e}") + + async def list_tools(self) -> List[Dict[str, Any]]: + """ + Get list of available MCP tools from Home Assistant. + + Returns: + List of tool definitions with schema + + Example tool: + { + "name": "turn_on", + "description": "Turn on a light or switch", + "inputSchema": { + "type": "object", + "properties": { + "entity_id": {"type": "string"} + } + } + } + """ + result = await self._send_mcp_request("tools/list") + tools = result.get("tools", []) + logger.info(f"Retrieved {len(tools)} tools from Home Assistant MCP") + return tools + + async def call_tool(self, tool_name: str, arguments: Dict[str, Any]) -> Dict[str, Any]: + """ + Execute a tool via MCP. 
+ + Args: + tool_name: Name of the tool to call (e.g., "turn_on", "turn_off") + arguments: Tool arguments (e.g., {"entity_id": "light.hall_light"}) + + Returns: + Tool execution result + + Raises: + MCPError: If tool execution fails + + Example: + >>> await client.call_tool("turn_off", {"entity_id": "light.hall_light"}) + {"success": True} + """ + params = { + "name": tool_name, + "arguments": arguments + } + + logger.info(f"Calling MCP tool '{tool_name}' with args: {arguments}") + result = await self._send_mcp_request("tools/call", params) + + # MCP tool results are wrapped in content blocks + content = result.get("content", []) + if content and isinstance(content, list): + # Extract text content from first block + first_block = content[0] + if isinstance(first_block, dict) and first_block.get("type") == "text": + return {"result": first_block.get("text"), "success": True} + + return result + + async def test_connection(self) -> bool: + """ + Test connection to Home Assistant MCP server. + + Returns: + True if connection successful, False otherwise + """ + try: + tools = await self.list_tools() + logger.info(f"MCP connection test successful ({len(tools)} tools available)") + return True + except Exception as e: + logger.error(f"MCP connection test failed: {e}") + return False + + async def _render_template(self, template: str) -> Any: + """ + Render a Home Assistant template using the Template API. 
+ + Args: + template: Jinja2 template string (e.g., "{{ areas() }}") + + Returns: + Rendered template result (parsed as JSON if possible) + + Raises: + MCPError: If template rendering fails + + Example: + >>> await client._render_template("{{ areas() }}") + ["study", "living_room", "bedroom"] + """ + headers = { + "Authorization": f"Bearer {self.token}", + "Content-Type": "application/json" + } + + payload = {"template": template} + + try: + logger.debug(f"Rendering template: {template}") + response = await self.client.post( + f"{self.base_url}/api/template", + json=payload, + headers=headers + ) + response.raise_for_status() + + result = response.text.strip() + + # Try to parse as JSON (for lists, dicts) + if result.startswith('[') or result.startswith('{'): + try: + return json.loads(result) + except json.JSONDecodeError: + logger.warning(f"Failed to parse template result as JSON: {result}") + return result + + return result + + except httpx.HTTPStatusError as e: + logger.error(f"HTTP error rendering template: {e.response.status_code}") + raise MCPError(f"HTTP {e.response.status_code}: {e.response.text}") + except httpx.RequestError as e: + logger.error(f"Request error rendering template: {e}") + raise MCPError(f"Request failed: {e}") + + async def fetch_areas(self) -> List[str]: + """ + Fetch all areas from Home Assistant using Template API. + + Returns: + List of area names + + Example: + >>> await client.fetch_areas() + ["study", "living_room", "bedroom"] + """ + template = "{{ areas() | to_json }}" + areas = await self._render_template(template) + + if isinstance(areas, list): + logger.info(f"Fetched {len(areas)} areas from Home Assistant") + return areas + else: + logger.warning(f"Unexpected areas format: {type(areas)}") + return [] + + async def fetch_area_entities(self, area_name: str) -> List[str]: + """ + Fetch all entity IDs in a specific area. 
+ + Args: + area_name: Name of the area + + Returns: + List of entity IDs in the area + + Example: + >>> await client.fetch_area_entities("study") + ["light.tubelight_3", "switch.desk_fan"] + """ + template = f"{{{{ area_entities('{area_name}') | to_json }}}}" + entities = await self._render_template(template) + + if isinstance(entities, list): + logger.info(f"Fetched {len(entities)} entities from area '{area_name}'") + return entities + else: + logger.warning(f"Unexpected entities format for area '{area_name}': {type(entities)}") + return [] + + async def fetch_entity_states(self) -> Dict[str, Dict]: + """ + Fetch all entity states from Home Assistant. + + Returns: + Dict mapping entity_id to state data (includes attributes, area_id) + + Example: + >>> await client.fetch_entity_states() + { + "light.tubelight_3": { + "state": "on", + "attributes": {"friendly_name": "Study Light", ...}, + "area_id": "study" + } + } + """ + headers = { + "Authorization": f"Bearer {self.token}", + "Content-Type": "application/json" + } + + try: + logger.debug("Fetching all entity states") + response = await self.client.get( + f"{self.base_url}/api/states", + headers=headers + ) + response.raise_for_status() + + states = response.json() + entity_details = {} + + # Enrich with area information + for state in states: + entity_id = state.get('entity_id') + if entity_id: + # Get area_id using Template API + try: + area_template = f"{{{{ area_id('{entity_id}') }}}}" + area_id = await self._render_template(area_template) + state['area_id'] = area_id if area_id else None + except Exception as e: + logger.debug(f"Failed to get area for {entity_id}: {e}") + state['area_id'] = None + + entity_details[entity_id] = state + + logger.info(f"Fetched {len(entity_details)} entity states") + return entity_details + + except httpx.HTTPStatusError as e: + logger.error(f"HTTP error fetching states: {e.response.status_code}") + raise MCPError(f"HTTP {e.response.status_code}: {e.response.text}") + except 
httpx.RequestError as e: + logger.error(f"Request error fetching states: {e}") + raise MCPError(f"Request failed: {e}") + + async def call_service( + self, + domain: str, + service: str, + entity_ids: List[str], + **parameters + ) -> Dict[str, Any]: + """ + Call a Home Assistant service directly via REST API. + + Args: + domain: Service domain (e.g., "light", "switch") + service: Service name (e.g., "turn_on", "turn_off") + entity_ids: List of entity IDs to target + **parameters: Additional service parameters (e.g., brightness_pct=50) + + Returns: + Service call response + + Example: + >>> await client.call_service("light", "turn_on", ["light.study"], brightness_pct=50) + [{"entity_id": "light.study", "state": "on"}] + """ + headers = { + "Authorization": f"Bearer {self.token}", + "Content-Type": "application/json" + } + + payload = { + "entity_id": entity_ids, + **parameters + } + + service_url = f"{self.base_url}/api/services/{domain}/{service}" + + try: + logger.info(f"Calling service {domain}.{service} for {len(entity_ids)} entities") + logger.debug(f"Service payload: {payload}") + + response = await self.client.post( + service_url, + json=payload, + headers=headers + ) + response.raise_for_status() + + result = response.json() + logger.info(f"Service call successful: {domain}.{service}") + return result + + except httpx.HTTPStatusError as e: + logger.error(f"HTTP error calling service: {e.response.status_code}") + raise MCPError(f"HTTP {e.response.status_code}: {e.response.text}") + except httpx.RequestError as e: + logger.error(f"Request error calling service: {e}") + raise MCPError(f"Request failed: {e}") + + async def discover_entities(self) -> Dict[str, Dict]: + """ + Discover available entities from MCP tools. + + Parses the available tools to build an index of entities + that can be controlled. 
+ + Returns: + Dict mapping entity_id to metadata + """ + tools = await self.list_tools() + entities = {} + + for tool in tools: + # Extract entity information from tool schemas + # This will depend on how HA MCP structures its tools + # For now, we'll just log what we find + logger.debug(f"Tool: {tool.get('name')} - {tool.get('description')}") + + # TODO: Parse tool schemas to extract entity_id information + # For now, return empty dict - will be populated based on actual HA MCP response + + return entities diff --git a/backends/advanced/src/advanced_omi_backend/plugins/homeassistant/plugin.py b/backends/advanced/src/advanced_omi_backend/plugins/homeassistant/plugin.py new file mode 100644 index 00000000..13683194 --- /dev/null +++ b/backends/advanced/src/advanced_omi_backend/plugins/homeassistant/plugin.py @@ -0,0 +1,692 @@ +""" +Home Assistant plugin for Chronicle. + +Enables control of Home Assistant devices through natural language commands +triggered by a wake word. +""" + +import json +import logging +from typing import Any, Dict, List, Optional + +from ..base import BasePlugin, PluginContext, PluginResult +from .entity_cache import EntityCache +from .mcp_client import HAMCPClient, MCPError + +logger = logging.getLogger(__name__) + + +class HomeAssistantPlugin(BasePlugin): + """ + Plugin for controlling Home Assistant devices via wake word commands. + + Example: + User says: "Vivi, turn off the hall lights" + -> Wake word "vivi" detected by router + -> Command "turn off the hall lights" passed to on_transcript() + -> Plugin parses command and calls HA MCP to execute + -> Returns: PluginResult with "I've turned off the hall light" + """ + + SUPPORTED_ACCESS_LEVELS: List[str] = ['transcript'] + + name = "Home Assistant" + description = "Wake word device control with Home Assistant integration" + + def __init__(self, config: Dict[str, Any]): + """ + Initialize Home Assistant plugin. 
+ + Args: + config: Plugin configuration with keys: + - ha_url: Home Assistant URL + - ha_token: Long-lived access token + - wake_word: Wake word for triggering commands (handled by router) + - enabled: Whether plugin is enabled + - access_level: Should be 'transcript' + - trigger: Should be {'type': 'wake_word', 'wake_word': '...'} + """ + super().__init__(config) + self.mcp_client: Optional[HAMCPClient] = None + self.available_tools: List[Dict] = [] + self.entities: Dict[str, Dict] = {} + + # Entity cache for area-based commands + self.entity_cache: Optional[EntityCache] = None + self.cache_initialized = False + + # Configuration + self.ha_url = config.get('ha_url', 'http://localhost:8123') + self.ha_token = config.get('ha_token', '') + self.wake_word = config.get('wake_word', 'vivi') + self.timeout = config.get('timeout', 30) + + async def initialize(self): + """ + Initialize the Home Assistant plugin. + + Connects to Home Assistant MCP server and discovers available tools. + + Raises: + MCPError: If connection or discovery fails + """ + if not self.enabled: + logger.info("Home Assistant plugin is disabled, skipping initialization") + return + + if not self.ha_token: + raise ValueError("Home Assistant token is required") + + logger.info(f"Initializing Home Assistant plugin (URL: {self.ha_url})") + + # Create MCP client (used for REST API calls, not MCP protocol) + self.mcp_client = HAMCPClient( + base_url=self.ha_url, + token=self.ha_token, + timeout=self.timeout + ) + + # Test basic API connectivity with Template API + try: + logger.info("Testing Home Assistant API connectivity...") + test_result = await self.mcp_client._render_template("{{ 1 + 1 }}") + if str(test_result).strip() != "2": + raise ValueError(f"Unexpected template result: {test_result}") + logger.info("Home Assistant API connection successful") + except Exception as e: + raise MCPError(f"Failed to connect to Home Assistant API: {e}") + + logger.info("Home Assistant plugin initialized 
successfully") + + async def on_transcript(self, context: PluginContext) -> Optional[PluginResult]: + """ + Execute Home Assistant command from wake word transcript. + + Called by the router when a wake word is detected in the transcript. + The router has already stripped the wake word and extracted the command. + + Args: + context: PluginContext containing: + - user_id: User ID who issued the command + - access_level: 'transcript' + - data: Dict with: + - command: str - Command with wake word already stripped + - original_transcript: str - Full transcript with wake word + - transcript: str - Original transcript + - segment_id: str - Unique segment identifier + - conversation_id: str - Current conversation ID + - metadata: Optional additional metadata + + Returns: + PluginResult with: + - success: True if command executed + - message: User-friendly response + - data: Dict with action details + - should_continue: False to stop normal processing + + Example: + Context data: + { + 'command': 'turn off study lights', + 'original_transcript': 'vivi turn off study lights', + 'conversation_id': 'conv_123' + } + + Returns: + PluginResult( + success=True, + message="I've turned off 1 light in study", + data={'action': 'turn_off', 'entity_ids': ['light.tubelight_3']}, + should_continue=False + ) + """ + command = context.data.get('command', '') + + if not command: + return PluginResult( + success=False, + message="No command provided", + should_continue=True + ) + + if not self.mcp_client: + logger.error("MCP client not initialized") + return PluginResult( + success=False, + message="Sorry, Home Assistant is not connected", + should_continue=True + ) + + try: + # Step 1: Parse command using hybrid LLM + fallback parsing + logger.info(f"Processing HA command: '{command}'") + parsed = await self._parse_command_hybrid(command) + + if not parsed: + return PluginResult( + success=False, + message="Sorry, I couldn't understand that command", + should_continue=True + ) + + # Step 
2: Resolve entities from parsed command
+            try:
+                entity_ids = await self._resolve_entities(parsed)
+            except ValueError as e:
+                logger.warning(f"Entity resolution failed: {e}")
+                return PluginResult(
+                    success=False,
+                    message=str(e),
+                    should_continue=True
+                )
+
+            # Step 3: Determine service and domain
+            # Extract domain from first entity (all should have same domain for area-based)
+            domain = entity_ids[0].split('.')[0] if entity_ids else 'light'
+
+            # Map action to service name
+            service_map = {
+                'turn_on': 'turn_on',
+                'turn_off': 'turn_off',
+                'toggle': 'toggle',
+                'set_brightness': 'turn_on', # brightness uses turn_on with params
+                'set_color': 'turn_on' # color uses turn_on with params
+            }
+            service = service_map.get(parsed.action, 'turn_on')
+
+            # Step 4: Call Home Assistant service
+            logger.info(
+                f"Calling {domain}.{service} for {len(entity_ids)} entities: {entity_ids}"
+            )
+
+            result = await self.mcp_client.call_service(
+                domain=domain,
+                service=service,
+                entity_ids=entity_ids,
+                **parsed.parameters
+            )
+
+            # Step 5: Format user-friendly response
+            entity_type_name = parsed.entity_type or domain
+            if parsed.target_type == 'area':
+                message = (
+                    f"I've {parsed.action.replace('turn_', 'turned ').replace('toggle', 'toggled').replace('_', ' ')} {len(entity_ids)} "
+                    f"{entity_type_name}{'s' if len(entity_ids) != 1 else ''} "
+                    f"in {parsed.target}"
+                )
+            elif parsed.target_type == 'all_in_area':
+                message = (
+                    f"I've {parsed.action.replace('turn_', 'turned ').replace('toggle', 'toggled').replace('_', ' ')} {len(entity_ids)} "
+                    f"entities in {parsed.target}"
+                )
+            else:
+                message = f"I've {parsed.action.replace('turn_', 'turned ').replace('toggle', 'toggled').replace('_', ' ')} {parsed.target}"
+
+            logger.info(f"HA command executed successfully: {message}")
+
+            return PluginResult(
+                success=True,
+                data={
+                    'action': parsed.action,
+                    'entity_ids': entity_ids,
+                    'target_type': parsed.target_type,
+                    'target': parsed.target,
+                    'ha_result': result
+                },
+                message=message,
+                should_continue=False # Stop normal processing - HA command handled
+            )
+
+        except MCPError as e:
+            logger.error(f"Home Assistant API error: {e}", 
exc_info=True) + return PluginResult( + success=False, + message=f"Sorry, Home Assistant couldn't execute that: {e}", + should_continue=True + ) + except Exception as e: + logger.error(f"Command execution failed: {e}", exc_info=True) + return PluginResult( + success=False, + message="Sorry, something went wrong while executing that command", + should_continue=True + ) + + async def cleanup(self): + """Clean up resources""" + if self.mcp_client: + await self.mcp_client.close() + logger.info("Closed Home Assistant MCP client") + + async def _ensure_cache_initialized(self): + """Ensure entity cache is initialized. Lazy-load on first use.""" + if not self.cache_initialized: + logger.info("Entity cache not initialized, refreshing...") + await self._refresh_cache() + self.cache_initialized = True + + async def _refresh_cache(self): + """ + Refresh the entity cache from Home Assistant. + + Fetches: + - All areas + - Entities in each area + - Entity state details + """ + if not self.mcp_client: + logger.error("Cannot refresh cache: MCP client not initialized") + return + + try: + logger.info("Refreshing entity cache from Home Assistant...") + + # Fetch all areas + areas = await self.mcp_client.fetch_areas() + logger.debug(f"Fetched {len(areas)} areas: {areas}") + + # Fetch entities for each area + area_entities = {} + for area in areas: + entities = await self.mcp_client.fetch_area_entities(area) + area_entities[area] = entities + logger.debug(f"Area '{area}': {len(entities)} entities") + + # Fetch all entity states + entity_details = await self.mcp_client.fetch_entity_states() + logger.debug(f"Fetched {len(entity_details)} entity states") + + # Create cache + from datetime import datetime + self.entity_cache = EntityCache( + areas=areas, + area_entities=area_entities, + entity_details=entity_details, + last_refresh=datetime.now() + ) + + logger.info( + f"Entity cache refreshed: {len(areas)} areas, " + f"{len(entity_details)} entities" + ) + + except Exception as e: + 
logger.error(f"Failed to refresh entity cache: {e}", exc_info=True) + raise + + async def _parse_command_with_llm(self, command: str) -> Optional['ParsedCommand']: + """ + Parse command using LLM with structured system prompt. + + Args: + command: Natural language command (wake word already stripped) + + Returns: + ParsedCommand if parsing succeeds, None otherwise + + Example: + >>> await self._parse_command_with_llm("turn off study lights") + ParsedCommand( + action="turn_off", + target_type="area", + target="study", + entity_type="light", + parameters={} + ) + """ + try: + from advanced_omi_backend.llm_client import get_llm_client + from .command_parser import COMMAND_PARSER_SYSTEM_PROMPT, ParsedCommand + + llm_client = get_llm_client() + + logger.debug(f"Parsing command with LLM: '{command}'") + + # Use OpenAI chat format with system + user messages + response = llm_client.client.chat.completions.create( + model=llm_client.model, + messages=[ + {"role": "system", "content": COMMAND_PARSER_SYSTEM_PROMPT}, + {"role": "user", "content": f'Command: "{command}"\n\nReturn JSON only.'} + ], + temperature=0.1, + max_tokens=150 + ) + + result_text = response.choices[0].message.content.strip() + logger.debug(f"LLM response: {result_text}") + + # Remove markdown code blocks if present + if result_text.startswith('```'): + lines = result_text.split('\n') + result_text = '\n'.join(lines[1:-1]) if len(lines) > 2 else result_text + result_text = result_text.strip() + + # Parse JSON response + result_json = json.loads(result_text) + + # Validate required fields + required_fields = ['action', 'target_type', 'target'] + if not all(field in result_json for field in required_fields): + logger.warning(f"LLM response missing required fields: {result_json}") + return None + + parsed = ParsedCommand( + action=result_json['action'], + target_type=result_json['target_type'], + target=result_json['target'], + entity_type=result_json.get('entity_type'), + 
parameters=result_json.get('parameters', {}) + ) + + logger.info( + f"LLM parsed command: action={parsed.action}, " + f"target_type={parsed.target_type}, target={parsed.target}, " + f"entity_type={parsed.entity_type}" + ) + + return parsed + + except json.JSONDecodeError as e: + logger.error(f"Failed to parse LLM JSON response: {e}\nResponse: {result_text}") + return None + except Exception as e: + logger.error(f"LLM command parsing failed: {e}", exc_info=True) + return None + + async def _resolve_entities(self, parsed: 'ParsedCommand') -> List[str]: + """ + Resolve ParsedCommand to actual Home Assistant entity IDs. + + Args: + parsed: ParsedCommand from LLM parsing + + Returns: + List of entity IDs to target + + Raises: + ValueError: If target not found or ambiguous + + Example: + >>> await self._resolve_entities(ParsedCommand( + ... action="turn_off", + ... target_type="area", + ... target="study", + ... entity_type="light" + ... )) + ["light.tubelight_3"] + """ + from .command_parser import ParsedCommand + + # Ensure cache is ready + await self._ensure_cache_initialized() + + if not self.entity_cache: + raise ValueError("Entity cache not initialized") + + if parsed.target_type == 'area': + # Get entities in area, filtered by type + entities = self.entity_cache.get_entities_in_area( + area=parsed.target, + entity_type=parsed.entity_type + ) + + if not entities: + entity_desc = f"{parsed.entity_type}s" if parsed.entity_type else "entities" + raise ValueError( + f"No {entity_desc} found in area '{parsed.target}'. 
" + f"Available areas: {', '.join(self.entity_cache.areas)}" + ) + + logger.info( + f"Resolved area '{parsed.target}' to {len(entities)} " + f"{parsed.entity_type or 'entity'}(s)" + ) + return entities + + elif parsed.target_type == 'all_in_area': + # Get ALL entities in area (no filter) + entities = self.entity_cache.get_entities_in_area( + area=parsed.target, + entity_type=None + ) + + if not entities: + raise ValueError( + f"No entities found in area '{parsed.target}'. " + f"Available areas: {', '.join(self.entity_cache.areas)}" + ) + + logger.info(f"Resolved 'all in {parsed.target}' to {len(entities)} entities") + return entities + + elif parsed.target_type == 'entity': + # Fuzzy match entity by name + entity_id = self.entity_cache.find_entity_by_name(parsed.target) + + if not entity_id: + raise ValueError( + f"Entity '{parsed.target}' not found. " + f"Try being more specific or check the entity name." + ) + + logger.info(f"Resolved entity '{parsed.target}' to {entity_id}") + return [entity_id] + + else: + raise ValueError(f"Unknown target type: {parsed.target_type}") + + async def _parse_command_fallback(self, command: str) -> Optional[Dict[str, Any]]: + """ + Fallback keyword-based command parser (used when LLM fails). 
+ + Args: + command: Natural language command + + Returns: + Dict with 'tool', 'arguments', and optional metadata + None if parsing fails + + Example: + Input: "turn off the hall lights" + Output: { + "tool": "turn_off", + "arguments": {"entity_id": "light.hall_light"}, + "friendly_name": "Hall Light", + "action": "turn_off" + } + """ + logger.debug("Using fallback keyword-based parsing") + command_lower = command.lower().strip() + + # Determine action + tool = None + if any(word in command_lower for word in ['turn off', 'off', 'disable']): + tool = 'turn_off' + action_desc = 'turned off' + elif any(word in command_lower for word in ['turn on', 'on', 'enable']): + tool = 'turn_on' + action_desc = 'turned on' + elif 'toggle' in command_lower: + tool = 'toggle' + action_desc = 'toggled' + else: + logger.warning(f"Unknown action in command: {command}") + return None + + # Extract entity name from command + entity_query = command_lower + for action_word in ['turn off', 'turn on', 'toggle', 'off', 'on', 'the']: + entity_query = entity_query.replace(action_word, '').strip() + + logger.info(f"Searching for entity: '{entity_query}'") + + # Return placeholder (this will work if entity ID matches pattern) + return { + "tool": tool, + "arguments": { + "entity_id": f"light.{entity_query.replace(' ', '_')}" + }, + "friendly_name": entity_query.title(), + "action_desc": action_desc + } + + async def _parse_command_hybrid(self, command: str) -> Optional['ParsedCommand']: + """ + Hybrid command parser: Try LLM first, fallback to keywords. + + This provides the best of both worlds: + - LLM parsing for complex area-based and natural commands + - Keyword fallback for reliability when LLM fails or times out + + Args: + command: Natural language command + + Returns: + ParsedCommand if successful, None otherwise + + Example: + >>> await self._parse_command_hybrid("turn off study lights") + ParsedCommand(action="turn_off", target_type="area", target="study", ...) 
+ """ + import asyncio + from .command_parser import ParsedCommand + + # Try LLM parsing with timeout + try: + logger.debug("Attempting LLM-based command parsing...") + parsed = await asyncio.wait_for( + self._parse_command_with_llm(command), + timeout=5.0 + ) + + if parsed: + logger.info("LLM parsing succeeded") + return parsed + else: + logger.warning("LLM parsing returned None, falling back to keywords") + + except asyncio.TimeoutError: + logger.warning("LLM parsing timed out (>5s), falling back to keywords") + except Exception as e: + logger.warning(f"LLM parsing failed: {e}, falling back to keywords") + + # Fallback to keyword-based parsing + try: + logger.debug("Using fallback keyword parsing...") + fallback_result = await self._parse_command_fallback(command) + + if not fallback_result: + return None + + # Convert fallback format to ParsedCommand + # Extract entity_id from arguments + entity_id = fallback_result['arguments'].get('entity_id', '') + entity_name = entity_id.split('.', 1)[1] if '.' in entity_id else entity_id + + # Simple heuristic: assume it's targeting a single entity + parsed = ParsedCommand( + action=fallback_result['tool'], + target_type='entity', + target=entity_name.replace('_', ' '), + entity_type=None, + parameters={} + ) + + logger.info("Fallback parsing succeeded") + return parsed + + except Exception as e: + logger.error(f"Fallback parsing failed: {e}", exc_info=True) + return None + + @staticmethod + async def test_connection(config: Dict[str, Any]) -> Dict[str, Any]: + """ + Test Home Assistant API connection with provided configuration. + + This static method tests the HA API connection without fully initializing the plugin. + Used by the form-based configuration UI to validate settings before saving. 
+
+        Args:
+            config: Configuration dictionary with HA settings:
+                - ha_url: Home Assistant URL
+                - ha_token: Long-lived access token
+                - timeout: Request timeout (optional, default 30)
+
+        Returns:
+            Dict with success status, message, and optional details
+
+        Example:
+            >>> result = await HomeAssistantPlugin.test_connection({
+            ...     'ha_url': 'http://homeassistant.local:8123',
+            ...     'ha_token': 'your_long_lived_token'
+            ... })
+            >>> result['success']
+            True
+        """
+        import time
+
+        try:
+            # Validate required config fields
+            required_fields = ['ha_url', 'ha_token']
+            missing_fields = [field for field in required_fields if not config.get(field)]
+
+            if missing_fields:
+                return {
+                    "success": False,
+                    "message": f"Missing required fields: {', '.join(missing_fields)}",
+                    "status": "error"
+                }
+
+            ha_url = config.get('ha_url')
+            ha_token = config.get('ha_token')
+            timeout = config.get('timeout', 30)
+
+            # Create temporary MCP client
+            mcp_client = HAMCPClient(
+                base_url=ha_url,
+                token=ha_token,
+                timeout=timeout
+            )
+
+            # Test API connectivity with Template API
+            logger.info(f"Testing Home Assistant API connection to {ha_url}...")
+            start_time = time.time()
+
+            test_result = await mcp_client._render_template("{{ 1 + 1 }}")
+            connection_time_ms = int((time.time() - start_time) * 1000)
+
+            if str(test_result).strip() != "2":
+                return {
+                    "success": False,
+                    "message": f"Unexpected template result: {test_result}",
+                    "status": "error"
+                }
+
+            # Try to fetch entities count for additional info
+            try:
+                entities = await mcp_client.fetch_entity_states()
+                entity_count = len(entities)
+            except Exception:
+                entity_count = None
+
+            return {
+                "success": True,
+                "message": f"Successfully connected to Home Assistant at {ha_url}",
+                "status": "success",
+                "details": {
+                    "ha_url": ha_url,
+                    "connection_time_ms": connection_time_ms,
+                    "entity_count": entity_count,
+                    "api_test": "Template rendering successful"
+                }
+            }
+
+        except Exception as e:
+            logger.error(f"Home Assistant connection 
test failed: {e}", exc_info=True) + return { + "success": False, + "message": f"Connection test failed: {str(e)}", + "status": "error" + } diff --git a/backends/advanced/src/advanced_omi_backend/plugins/router.py b/backends/advanced/src/advanced_omi_backend/plugins/router.py new file mode 100644 index 00000000..523fe3ed --- /dev/null +++ b/backends/advanced/src/advanced_omi_backend/plugins/router.py @@ -0,0 +1,256 @@ +""" +Plugin routing system for multi-level plugin architecture. + +Routes pipeline events to appropriate plugins based on access level and triggers. +""" + +import logging +import re +import string +from typing import Dict, List, Optional + +from .base import BasePlugin, PluginContext, PluginResult + +logger = logging.getLogger(__name__) + + +def normalize_text_for_wake_word(text: str) -> str: + """ + Normalize text for wake word matching. + - Lowercase + - Replace punctuation with spaces + - Collapse multiple spaces to single space + - Strip leading/trailing whitespace + + Example: + "Hey, Vivi!" -> "hey vivi" + "HEY VIVI" -> "hey vivi" + "Hey-Vivi" -> "hey vivi" + """ + # Lowercase + text = text.lower() + # Replace punctuation with spaces (instead of removing, to preserve word boundaries) + text = text.translate(str.maketrans(string.punctuation, ' ' * len(string.punctuation))) + # Normalize whitespace (collapse multiple spaces to single space) + text = re.sub(r'\s+', ' ', text) + # Strip leading/trailing whitespace + return text.strip() + + +def extract_command_after_wake_word(transcript: str, wake_word: str) -> str: + """ + Intelligently extract command after wake word in original transcript. + + Handles punctuation and spacing variations by creating a flexible regex pattern. 
class PluginRouter:
    """Routes pipeline events to appropriate plugins based on event subscriptions.

    Plugins are registered once via register_plugin(); dispatch_event() then
    fans each event out to the subscribed plugins in registration order,
    honoring per-plugin enable flags and execution conditions (e.g. wake word).
    Plugin failures are isolated: one plugin raising does not stop the others.
    """

    def __init__(self):
        # plugin_id -> plugin instance
        self.plugins: Dict[str, BasePlugin] = {}
        # Index plugins by event for fast lookup (event name -> list of plugin ids)
        self._plugins_by_event: Dict[str, List[str]] = {}

    def register_plugin(self, plugin_id: str, plugin: BasePlugin):
        """Register a plugin with the router and index it by its subscribed events."""
        self.plugins[plugin_id] = plugin

        # Index by each event
        for event in plugin.events:
            if event not in self._plugins_by_event:
                self._plugins_by_event[event] = []
            self._plugins_by_event[event].append(plugin_id)

        logger.info(f"Registered plugin '{plugin_id}' for events: {plugin.events}")

    async def dispatch_event(
        self,
        event: str,
        user_id: str,
        data: Dict,
        metadata: Optional[Dict] = None
    ) -> List[PluginResult]:
        """
        Dispatch event to all subscribed plugins.

        Args:
            event: Event name (e.g., 'transcript.streaming', 'conversation.complete')
            user_id: User ID for context
            data: Event-specific data. NOTE: _should_execute may mutate this dict
                (adds 'command' and 'original_transcript' on wake-word match).
            metadata: Optional metadata

        Returns:
            List of plugin results (only plugins that returned a result; a plugin
            returning should_continue=False stops dispatch to later plugins).
        """
        # Announce dispatch up-front so event flow is traceable in the logs.
        logger.info(f"πŸ”Œ ROUTER: Dispatching '{event}' event (user={user_id})")

        results = []

        # Get plugins subscribed to this event
        plugin_ids = self._plugins_by_event.get(event, [])

        # No subscribers: warn, since this usually indicates a wiring gap.
        if not plugin_ids:
            logger.warning(f"πŸ”Œ ROUTER: No plugins subscribed to event '{event}'")
            return results

        logger.info(f"πŸ”Œ ROUTER: Found {len(plugin_ids)} subscribed plugin(s): {plugin_ids}")

        for plugin_id in plugin_ids:
            plugin = self.plugins[plugin_id]

            if not plugin.enabled:
                logger.info(f"  ⊘ Skipping '{plugin_id}': disabled")
                continue

            # Check execution condition (wake_word, etc.)
            logger.info(f"  β†’ Checking execution condition for '{plugin_id}'")
            if not await self._should_execute(plugin, data):
                logger.info(f"  ⊘ Skipping '{plugin_id}': condition not met")
                continue

            # Execute plugin
            try:
                logger.info(f"  β–Ά Executing '{plugin_id}' for event '{event}'")
                context = PluginContext(
                    user_id=user_id,
                    event=event,
                    data=data,
                    metadata=metadata or {}
                )

                result = await self._execute_plugin(plugin, event, context)

                if result:
                    status_icon = "βœ“" if result.success else "βœ—"
                    logger.info(
                        f"  {status_icon} Plugin '{plugin_id}' completed: "
                        f"success={result.success}, message={result.message}"
                    )
                    results.append(result)

                    # If plugin says stop processing, break
                    if not result.should_continue:
                        logger.info(f"  βŠ— Plugin '{plugin_id}' stopped further processing")
                        break

            except Exception as e:
                # Isolate plugin failures: log with traceback and continue
                # dispatching to the remaining plugins.
                logger.error(
                    f"  βœ— Plugin '{plugin_id}' FAILED with exception: {e}",
                    exc_info=True
                )

        # Summary. NOTE(review): len(results) counts plugins that returned a
        # result, including ones with success=False — "executed successfully"
        # in the message is slightly optimistic.
        logger.info(
            f"πŸ”Œ ROUTER: Dispatch complete for '{event}': "
            f"{len(results)} plugin(s) executed successfully"
        )

        return results

    async def _should_execute(self, plugin: BasePlugin, data: Dict) -> bool:
        """Check if plugin should be executed based on condition configuration.

        Side effect: on a wake-word match, writes 'command' and
        'original_transcript' into `data` for the plugin to consume.
        """
        condition_type = plugin.condition.get('type', 'always')

        if condition_type == 'always':
            return True

        elif condition_type == 'wake_word':
            # Normalize transcript for matching (handles punctuation and spacing)
            transcript = data.get('transcript', '')
            normalized_transcript = normalize_text_for_wake_word(transcript)

            # Support both singular 'wake_word' and plural 'wake_words' (list)
            wake_words = plugin.condition.get('wake_words', [])
            if not wake_words:
                # Fallback to singular wake_word for backward compatibility
                wake_word = plugin.condition.get('wake_word', '')
                if wake_word:
                    wake_words = [wake_word]

            # Check if transcript starts with any wake word (after normalization)
            for wake_word in wake_words:
                normalized_wake_word = normalize_text_for_wake_word(wake_word)
                if normalized_wake_word and normalized_transcript.startswith(normalized_wake_word):
                    # Smart extraction: find where wake word actually ends in original text
                    command = extract_command_after_wake_word(transcript, wake_word)
                    data['command'] = command
                    data['original_transcript'] = transcript
                    logger.debug(f"Wake word '{wake_word}' detected. Original: '{transcript}', Command: '{command}'")
                    return True

            return False

        elif condition_type == 'conditional':
            # Future: Custom condition checking
            return True

        return False

    async def _execute_plugin(
        self,
        plugin: BasePlugin,
        event: str,
        context: PluginContext
    ) -> Optional[PluginResult]:
        """Execute plugin method for specified event.

        Maps event-name prefixes to the plugin's callback methods; returns None
        for event families with no mapped callback.
        """
        # Map events to plugin callback methods
        if event.startswith('transcript.'):
            return await plugin.on_transcript(context)
        elif event.startswith('conversation.'):
            return await plugin.on_conversation_complete(context)
        elif event.startswith('memory.'):
            return await plugin.on_memory_processed(context)

        return None

    async def cleanup_all(self):
        """Clean up all registered plugins; failures are logged, not raised."""
        for plugin_id, plugin in self.plugins.items():
            try:
                await plugin.cleanup()
                logger.info(f"Cleaned up plugin '{plugin_id}'")
            except Exception as e:
                logger.error(f"Error cleaning up plugin '{plugin_id}': {e}")
b/backends/advanced/src/advanced_omi_backend/plugins/test_event/config.yml new file mode 100644 index 00000000..8b4f776b --- /dev/null +++ b/backends/advanced/src/advanced_omi_backend/plugins/test_event/config.yml @@ -0,0 +1,9 @@ +# Test Event Plugin Configuration +# +# This plugin is for development and integration testing only. +# It logs all plugin events to SQLite for verification. +# +# Enable in config/plugins.yml by setting enabled: true + +# Database path for event storage +db_path: ${TEST_PLUGIN_DB_PATH:-/app/debug/test_plugin_events.db} diff --git a/backends/advanced/src/advanced_omi_backend/plugins/test_event/event_storage.py b/backends/advanced/src/advanced_omi_backend/plugins/test_event/event_storage.py new file mode 100644 index 00000000..4fb618f9 --- /dev/null +++ b/backends/advanced/src/advanced_omi_backend/plugins/test_event/event_storage.py @@ -0,0 +1,323 @@ +""" +Event storage module for test plugin using SQLite. + +Provides async SQLite operations for logging and querying plugin events. 
+""" +import json +import logging +import os +from datetime import datetime +from pathlib import Path +from typing import Any, Dict, List, Optional + +import aiosqlite + +logger = logging.getLogger(__name__) + + +class EventStorage: + """SQLite-based event storage for test plugin""" + + def __init__(self, db_path: str = "/app/debug/test_plugin_events.db"): + self.db_path = db_path + self.db: Optional[aiosqlite.Connection] = None + + async def initialize(self): + """Initialize database and create tables""" + # Ensure directory exists + logger.info(f"πŸ” DEBUG: Initializing event storage with db_path={self.db_path}") + + db_dir = Path(self.db_path).parent + logger.info(f"πŸ” DEBUG: Database directory: {db_dir}") + logger.info(f"πŸ” DEBUG: Directory exists before mkdir: {db_dir.exists()}") + + try: + db_dir.mkdir(parents=True, exist_ok=True) + logger.info(f"πŸ” DEBUG: Directory created/verified: {db_dir}") + logger.info(f"πŸ” DEBUG: Directory permissions: {oct(db_dir.stat().st_mode)}") + except Exception as e: + logger.error(f"πŸ” DEBUG: Failed to create directory: {e}") + raise + + logger.info(f"πŸ” DEBUG: Attempting to connect to SQLite database...") + try: + self.db = await aiosqlite.connect(self.db_path) + logger.info(f"πŸ” DEBUG: Successfully connected to database") + + # Enable WAL mode for better concurrent access (allows concurrent reads/writes) + # This fixes the "readonly database" error when Robot tests access from host + await self.db.execute("PRAGMA journal_mode=WAL") + await self.db.execute("PRAGMA busy_timeout=5000") # Wait up to 5s for locks + logger.info(f"βœ“ Enabled WAL mode for concurrent access") + + # Set file permissions to 666 so host user can write (container runs as root) + # Robot tests run as host user and need write access to the database + try: + os.chmod(self.db_path, 0o666) + # Also set permissions on WAL and SHM files if they exist + wal_file = f"{self.db_path}-wal" + shm_file = f"{self.db_path}-shm" + if 
os.path.exists(wal_file): + os.chmod(wal_file, 0o666) + if os.path.exists(shm_file): + os.chmod(shm_file, 0o666) + logger.info(f"βœ“ Set database file permissions to 666 for host access") + except Exception as perm_error: + logger.warning(f"Could not set database permissions: {perm_error}") + + except Exception as e: + logger.error(f"πŸ” DEBUG: Failed to connect to database: {e}") + logger.error(f"πŸ” DEBUG: Database file exists: {Path(self.db_path).exists()}") + if Path(self.db_path).exists(): + logger.error(f"πŸ” DEBUG: Database file permissions: {oct(Path(self.db_path).stat().st_mode)}") + raise + + # Create events table + await self.db.execute(""" + CREATE TABLE IF NOT EXISTS plugin_events ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + timestamp DATETIME NOT NULL, + event TEXT NOT NULL, + user_id TEXT NOT NULL, + data TEXT NOT NULL, + metadata TEXT, + created_at DATETIME DEFAULT CURRENT_TIMESTAMP + ) + """) + + # Create index for faster queries + await self.db.execute(""" + CREATE INDEX IF NOT EXISTS idx_event_type + ON plugin_events(event) + """) + + await self.db.execute(""" + CREATE INDEX IF NOT EXISTS idx_user_id + ON plugin_events(user_id) + """) + + await self.db.commit() + logger.info(f"Event storage initialized at {self.db_path}") + + async def log_event( + self, + event: str, + user_id: str, + data: Dict[str, Any], + metadata: Optional[Dict[str, Any]] = None + ) -> int: + """ + Log an event to the database. 
+ + Args: + event: Event name (e.g., 'transcript.batch') + user_id: User ID from context + data: Event data dictionary + metadata: Optional metadata dictionary + + Returns: + Row ID of inserted event + """ + # Add at start + logger.debug(f"πŸ’Ύ STORAGE: Logging event '{event}' for user {user_id}") + + if not self.db: + logger.error("πŸ’Ύ STORAGE: Database connection not initialized!") + raise RuntimeError("Event storage not initialized") + + timestamp = datetime.utcnow().isoformat() + + # Add before serialization + logger.debug(f"πŸ’Ύ STORAGE: Serializing event data...") + try: + data_json = json.dumps(data) + metadata_json = json.dumps(metadata) if metadata else None + except Exception as e: + logger.error( + f"πŸ’Ύ STORAGE: JSON serialization failed for event '{event}': {e}", + exc_info=True + ) + raise + + # Add before database operation + logger.debug(f"πŸ’Ύ STORAGE: Inserting into plugin_events table...") + + try: + cursor = await self.db.execute( + """ + INSERT INTO plugin_events (timestamp, event, user_id, data, metadata) + VALUES (?, ?, ?, ?, ?) + """, + (timestamp, event, user_id, data_json, metadata_json) + ) + + await self.db.commit() + row_id = cursor.lastrowid + + # Add success log + logger.info( + f"πŸ’Ύ STORAGE: Event '{event}' inserted successfully (row_id={row_id})" + ) + + return row_id + + except Exception as e: + logger.error( + f"πŸ’Ύ STORAGE: Database operation failed for event '{event}': {e}", + exc_info=True + ) + raise + + async def get_events_by_type(self, event: str) -> List[Dict[str, Any]]: + """ + Query events by event type. + + Args: + event: Event name to filter by + + Returns: + List of event dictionaries + """ + if not self.db: + raise RuntimeError("Event storage not initialized") + + cursor = await self.db.execute( + """ + SELECT id, timestamp, event, user_id, data, metadata, created_at + FROM plugin_events + WHERE event = ? 
+ ORDER BY created_at DESC + """, + (event,) + ) + + rows = await cursor.fetchall() + return self._rows_to_dicts(rows) + + async def get_events_by_user(self, user_id: str) -> List[Dict[str, Any]]: + """ + Query events by user ID. + + Args: + user_id: User ID to filter by + + Returns: + List of event dictionaries + """ + if not self.db: + raise RuntimeError("Event storage not initialized") + + cursor = await self.db.execute( + """ + SELECT id, timestamp, event, user_id, data, metadata, created_at + FROM plugin_events + WHERE user_id = ? + ORDER BY created_at DESC + """, + (user_id,) + ) + + rows = await cursor.fetchall() + return self._rows_to_dicts(rows) + + async def get_all_events(self) -> List[Dict[str, Any]]: + """ + Get all logged events. + + Returns: + List of all event dictionaries + """ + if not self.db: + raise RuntimeError("Event storage not initialized") + + cursor = await self.db.execute( + """ + SELECT id, timestamp, event, user_id, data, metadata, created_at + FROM plugin_events + ORDER BY created_at DESC + """ + ) + + rows = await cursor.fetchall() + return self._rows_to_dicts(rows) + + async def clear_events(self) -> int: + """ + Clear all events from the database. + + Returns: + Number of rows deleted + """ + if not self.db: + raise RuntimeError("Event storage not initialized") + + cursor = await self.db.execute("DELETE FROM plugin_events") + await self.db.commit() + + deleted = cursor.rowcount + logger.info(f"Cleared {deleted} events from database") + + return deleted + + async def get_event_count(self, event: Optional[str] = None) -> int: + """ + Get count of events. 
+ + Args: + event: Optional event type to filter by + + Returns: + Count of matching events + """ + if not self.db: + raise RuntimeError("Event storage not initialized") + + if event: + cursor = await self.db.execute( + "SELECT COUNT(*) FROM plugin_events WHERE event = ?", + (event,) + ) + else: + cursor = await self.db.execute( + "SELECT COUNT(*) FROM plugin_events" + ) + + row = await cursor.fetchone() + return row[0] if row else 0 + + def _rows_to_dicts(self, rows: List[tuple]) -> List[Dict[str, Any]]: + """ + Convert database rows to dictionaries. + + Args: + rows: List of database row tuples + + Returns: + List of event dictionaries + """ + events = [] + + for row in rows: + event_dict = { + 'id': row[0], + 'timestamp': row[1], + 'event': row[2], + 'user_id': row[3], + 'data': json.loads(row[4]) if row[4] else {}, + 'metadata': json.loads(row[5]) if row[5] else {}, + 'created_at': row[6] + } + + # Flatten data fields to top level for easier access in tests + if isinstance(event_dict['data'], dict): + event_dict.update(event_dict['data']) + + events.append(event_dict) + + return events + + async def cleanup(self): + """Close database connection""" + if self.db: + await self.db.close() + logger.info("Event storage connection closed") diff --git a/backends/advanced/src/advanced_omi_backend/plugins/test_event/plugin.py b/backends/advanced/src/advanced_omi_backend/plugins/test_event/plugin.py new file mode 100644 index 00000000..59dd652e --- /dev/null +++ b/backends/advanced/src/advanced_omi_backend/plugins/test_event/plugin.py @@ -0,0 +1,232 @@ +""" +Test Event Plugin + +Logs all plugin events to SQLite database for integration testing. +Subscribes to all event types to verify event dispatch system works correctly. 
+""" +import logging +from typing import Any, Dict, List, Optional + +from advanced_omi_backend.plugins.base import BasePlugin, PluginContext, PluginResult +from .event_storage import EventStorage + +logger = logging.getLogger(__name__) + + +class TestEventPlugin(BasePlugin): + """ + Test plugin that logs all events for verification. + + Subscribes to: + - transcript.streaming: Real-time WebSocket transcription + - transcript.batch: File upload batch transcription + - conversation.complete: Conversation processing complete + - memory.processed: Memory extraction complete + + All events are logged to SQLite database with full context for test verification. + """ + + SUPPORTED_ACCESS_LEVELS: List[str] = ['transcript', 'conversation', 'memory'] + + def __init__(self, config: Dict[str, Any]): + super().__init__(config) + self.storage = EventStorage( + db_path=config.get('db_path', '/app/debug/test_plugin_events.db') + ) + self.event_count = 0 + + async def initialize(self): + """Initialize the test plugin and event storage""" + try: + await self.storage.initialize() + logger.info("βœ… Test Event Plugin initialized successfully") + except Exception as e: + logger.error(f"❌ Failed to initialize Test Event Plugin: {e}") + raise + + async def on_transcript(self, context: PluginContext) -> Optional[PluginResult]: + """ + Log transcript events (streaming or batch). 
+ + Context data contains: + - transcript: str - The transcript text + - conversation_id: str - Conversation ID + - For streaming: is_final, confidence, words, segments + - For batch: word_count, segments + + Args: + context: Plugin context with event data + + Returns: + PluginResult indicating success + """ + try: + # Determine which transcript event this is based on context.event + event_type = context.event # 'transcript.streaming' or 'transcript.batch' + + # Extract key data fields + transcript = context.data.get('transcript', '') + conversation_id = context.data.get('conversation_id', 'unknown') + + # Log to storage + row_id = await self.storage.log_event( + event=event_type, + user_id=context.user_id, + data=context.data, + metadata=context.metadata + ) + + self.event_count += 1 + + logger.info( + f"πŸ“ Logged {event_type} event (row_id={row_id}): " + f"user={context.user_id}, " + f"conversation={conversation_id}, " + f"transcript='{transcript[:50]}...'" + ) + + return PluginResult( + success=True, + message=f"Transcript event logged (row_id={row_id})", + should_continue=True # Don't block normal processing + ) + + except Exception as e: + logger.error(f"Error logging transcript event: {e}", exc_info=True) + return PluginResult( + success=False, + message=f"Failed to log transcript event: {e}", + should_continue=True + ) + + async def on_conversation_complete(self, context: PluginContext) -> Optional[PluginResult]: + """ + Log conversation completion events. 
+ + Context data contains: + - conversation: dict - Full conversation data + - transcript: str - Complete conversation transcript + - duration: float - Conversation duration + - conversation_id: str - Conversation identifier + + Args: + context: Plugin context with event data + + Returns: + PluginResult indicating success + """ + conversation_id = context.data.get('conversation_id', 'unknown') + duration = context.data.get('duration', 0) + + # Add at start + logger.info( + f"πŸ“ HANDLER: on_conversation_complete called for {conversation_id[:12]}" + ) + logger.debug(f" Event: {context.event}") + logger.debug(f" Metadata: {context.metadata}") + logger.debug(f" Duration: {duration}s") + + try: + # Add before storage + logger.info(f" πŸ’Ύ Storing event to SQLite database...") + + row_id = await self.storage.log_event( + event=context.event, # 'conversation.complete' + user_id=context.user_id, + data=context.data, + metadata=context.metadata + ) + + # Add after storage + logger.info(f" βœ“ Event stored successfully (row_id={row_id})") + + self.event_count += 1 + + return PluginResult( + success=True, + message=f"Conversation event logged (row_id={row_id})", + data={"row_id": row_id}, + should_continue=True, + ) + + except Exception as e: + # Enhance error logging + logger.error( + f" βœ— Storage FAILED for {conversation_id[:12]}: {e}", + exc_info=True + ) + return PluginResult( + success=False, + message=f"Failed to log conversation event: {e}", + should_continue=True, + ) + + async def on_memory_processed(self, context: PluginContext) -> Optional[PluginResult]: + """ + Log memory processing events. 
+ + Context data contains: + - memories: list - Extracted memories + - conversation: dict - Source conversation + - memory_count: int - Number of memories created + - conversation_id: str - Conversation identifier + + Metadata contains: + - processing_time: float - Time spent processing + - memory_provider: str - Provider name + + Args: + context: Plugin context with event data + + Returns: + PluginResult indicating success + """ + try: + conversation_id = context.data.get('conversation_id', 'unknown') + memory_count = context.data.get('memory_count', 0) + memory_provider = context.metadata.get('memory_provider', 'unknown') + processing_time = context.metadata.get('processing_time', 0) + + # Log to storage + row_id = await self.storage.log_event( + event=context.event, # 'memory.processed' + user_id=context.user_id, + data=context.data, + metadata=context.metadata + ) + + self.event_count += 1 + + logger.info( + f"πŸ“ Logged memory.processed event (row_id={row_id}): " + f"user={context.user_id}, " + f"conversation={conversation_id}, " + f"memory_count={memory_count}, " + f"provider={memory_provider}, " + f"processing_time={processing_time:.2f}s" + ) + + return PluginResult( + success=True, + message=f"Memory event logged (row_id={row_id})", + should_continue=True + ) + + except Exception as e: + logger.error(f"Error logging memory event: {e}", exc_info=True) + return PluginResult( + success=False, + message=f"Failed to log memory event: {e}", + should_continue=True + ) + + async def cleanup(self): + """Clean up plugin resources""" + try: + logger.info( + f"🧹 Test Event Plugin shutting down. 
" + f"Logged {self.event_count} total events" + ) + await self.storage.cleanup() + except Exception as e: + logger.error(f"Error during test plugin cleanup: {e}") diff --git a/backends/advanced/src/advanced_omi_backend/routers/api_router.py b/backends/advanced/src/advanced_omi_backend/routers/api_router.py index 9e761f8e..5a135c7e 100644 --- a/backends/advanced/src/advanced_omi_backend/routers/api_router.py +++ b/backends/advanced/src/advanced_omi_backend/routers/api_router.py @@ -6,14 +6,18 @@ """ import logging +import os from fastapi import APIRouter from .modules import ( + admin_router, + annotation_router, audio_router, chat_router, client_router, conversation_router, + finetuning_router, memory_router, obsidian_router, queue_router, @@ -29,16 +33,27 @@ router = APIRouter(prefix="/api", tags=["api"]) # Include all sub-routers +router.include_router(admin_router) +router.include_router(annotation_router) router.include_router(audio_router) router.include_router(user_router) router.include_router(chat_router) router.include_router(client_router) router.include_router(conversation_router) +router.include_router(finetuning_router) router.include_router(memory_router) router.include_router(obsidian_router) router.include_router(system_router) router.include_router(queue_router) router.include_router(health_router) # Also include under /api for frontend compatibility +# Conditionally include test routes (only in test environments) +if os.getenv("DEBUG_DIR"): + try: + from .modules.test_routes import router as test_router + router.include_router(test_router) + logger.info("βœ… Test routes loaded (test environment detected)") + except Exception as e: + logger.error(f"Error loading test routes: {e}", exc_info=True) logger.info("API router initialized with all sub-modules") diff --git a/backends/advanced/src/advanced_omi_backend/routers/modules/__init__.py b/backends/advanced/src/advanced_omi_backend/routers/modules/__init__.py index 21f89991..4025a6dc 100644 --- 
a/backends/advanced/src/advanced_omi_backend/routers/modules/__init__.py +++ b/backends/advanced/src/advanced_omi_backend/routers/modules/__init__.py @@ -7,17 +7,23 @@ - client_routes: Active client monitoring and management - conversation_routes: Conversation CRUD and audio processing - memory_routes: Memory management, search, and debug +- annotation_routes: Annotation CRUD for memories and transcripts +- finetuning_routes: Model fine-tuning and training management - system_routes: System utilities and metrics - queue_routes: Job queue management and monitoring - audio_routes: Audio file uploads and processing - health_routes: Health check endpoints - websocket_routes: WebSocket connection handling +- admin_routes: Admin-only system management endpoints """ +from .admin_routes import router as admin_router +from .annotation_routes import router as annotation_router from .audio_routes import router as audio_router from .chat_routes import router as chat_router from .client_routes import router as client_router from .conversation_routes import router as conversation_router +from .finetuning_routes import router as finetuning_router from .health_routes import router as health_router from .memory_routes import router as memory_router from .obsidian_routes import router as obsidian_router @@ -27,10 +33,13 @@ from .websocket_routes import router as websocket_router __all__ = [ + "admin_router", + "annotation_router", "audio_router", "chat_router", "client_router", "conversation_router", + "finetuning_router", "health_router", "memory_router", "obsidian_router", diff --git a/backends/advanced/src/advanced_omi_backend/routers/modules/admin_routes.py b/backends/advanced/src/advanced_omi_backend/routers/modules/admin_routes.py new file mode 100644 index 00000000..a2ef4398 --- /dev/null +++ b/backends/advanced/src/advanced_omi_backend/routers/modules/admin_routes.py @@ -0,0 +1,122 @@ +""" +Admin routes for Chronicle API. 
def require_admin(current_user: User = Depends(current_active_user)) -> User:
    """Dependency to require admin/superuser permissions.

    Raises:
        HTTPException: 403 when the authenticated user is not a superuser.
    """
    if not current_user.is_superuser:
        raise HTTPException(
            status_code=403,
            detail="Admin permissions required"
        )
    return current_user


@router.get("/cleanup/settings")
async def get_cleanup_settings_admin(
    admin: User = Depends(require_admin)
):
    """Get current cleanup settings (admin only)."""
    # Imported lazily here rather than at module top — presumably to avoid an
    # import cycle at application startup; confirm before hoisting.
    from advanced_omi_backend.config import get_cleanup_settings

    settings = get_cleanup_settings()
    return {
        **settings,
        "note": "Cleanup settings are stored in /app/data/cleanup_config.json"
    }


@router.post("/cleanup")
async def trigger_cleanup(
    dry_run: bool = Query(False, description="Preview what would be deleted"),
    retention_days: Optional[int] = Query(None, description="Override retention period"),
    admin: User = Depends(require_admin)
):
    """Manually trigger cleanup of soft-deleted conversations (admin only).

    Enqueues an RQ job rather than purging inline; the response contains the
    job id so callers can poll /api/queue/jobs/{job_id}.
    """
    try:
        # Lazy imports — presumably to avoid import cycles; confirm before hoisting.
        from advanced_omi_backend.workers.cleanup_jobs import purge_old_deleted_conversations
        from advanced_omi_backend.controllers.queue_controller import get_queue

        # Enqueue cleanup job
        queue = get_queue("default")
        job = queue.enqueue(
            purge_old_deleted_conversations,
            retention_days=retention_days,  # Will use config default if None
            dry_run=dry_run,
            job_timeout="30m",
        )

        logger.info(f"Admin {admin.email} triggered cleanup job {job.id} (dry_run={dry_run}, retention={retention_days or 'default'})")

        return JSONResponse(
            status_code=200,
            content={
                "message": f"Cleanup job {'(dry run) ' if dry_run else ''}queued successfully",
                "job_id": job.id,
                "retention_days": retention_days or "default (from config)",
                "dry_run": dry_run,
                "note": "Check job status at /api/queue/jobs/{job_id}"
            }
        )

    except Exception as e:
        logger.error(f"Failed to trigger cleanup: {e}")
        return JSONResponse(
            status_code=500,
            content={"error": f"Failed to trigger cleanup: {str(e)}"}
        )


@router.get("/cleanup/preview")
async def preview_cleanup(
    retention_days: Optional[int] = Query(None, description="Preview with specific retention period"),
    admin: User = Depends(require_admin)
):
    """Preview what would be deleted by cleanup (admin only).

    Counts soft-deleted conversations older than the retention cutoff without
    deleting anything.
    """
    try:
        # Lazy imports — presumably to avoid import cycles; confirm before hoisting.
        from advanced_omi_backend.config import get_cleanup_settings
        from advanced_omi_backend.models.conversation import Conversation
        from datetime import datetime, timedelta

        # Use provided retention or default from config
        if retention_days is None:
            settings_dict = get_cleanup_settings()
            retention_days = settings_dict['retention_days']

        # NOTE(review): datetime.utcnow() is naive and deprecated in 3.12+;
        # this assumes Conversation.deleted_at is also stored naive-UTC —
        # confirm against the model before changing either side.
        cutoff_date = datetime.utcnow() - timedelta(days=retention_days)

        # Count conversations that would be deleted (ODM query expression,
        # hence the `== True` comparison instead of a bare truthiness check).
        count = await Conversation.find(
            Conversation.deleted == True,
            Conversation.deleted_at < cutoff_date
        ).count()

        return {
            "retention_days": retention_days,
            "cutoff_date": cutoff_date.isoformat(),
            "conversations_to_delete": count,
            "note": f"Conversations deleted before {cutoff_date.date()} would be purged"
        }

    except Exception as e:
        logger.error(f"Failed to preview cleanup: {e}")
        return JSONResponse(
            status_code=500,
            content={"error": f"Failed to preview cleanup: {str(e)}"}
        )
@router.post("/memory", response_model=AnnotationResponse)
async def create_memory_annotation(
    annotation_data: MemoryAnnotationCreate,
    current_user: User = Depends(current_active_user),
):
    """
    Create annotation for memory edit.

    - Validates user owns memory
    - Creates annotation record
    - Updates memory content in vector store when status is ACCEPTED

    Raises:
        HTTPException 404: memory missing or not owned by the caller.
        HTTPException 500: unexpected failure while creating the annotation.
    """
    try:
        memory_service = get_memory_service()

        # Verify memory ownership. The "memory is None" check is kept OUTSIDE
        # the try/except: previously the HTTPException(404) raised for a
        # missing memory was caught by the broad `except Exception` below it
        # and misleadingly logged as a fetch error before being re-raised.
        try:
            memory = await memory_service.get_memory(
                annotation_data.memory_id, current_user.user_id
            )
        except Exception as e:
            # Service-level lookup failure is surfaced the same as not-found,
            # but logged so operators can distinguish the two cases.
            logger.error(f"Error fetching memory: {e}")
            raise HTTPException(status_code=404, detail="Memory not found")
        if not memory:
            raise HTTPException(status_code=404, detail="Memory not found")

        # Create annotation
        annotation = Annotation(
            annotation_type=AnnotationType.MEMORY,
            user_id=current_user.user_id,
            memory_id=annotation_data.memory_id,
            original_text=annotation_data.original_text,
            corrected_text=annotation_data.corrected_text,
            status=annotation_data.status,
        )
        await annotation.save()
        logger.info(
            f"Created memory annotation {annotation.id} for memory {annotation_data.memory_id}"
        )

        # Update memory content if accepted
        if annotation.status == AnnotationStatus.ACCEPTED:
            try:
                await memory_service.update_memory(
                    memory_id=annotation_data.memory_id,
                    content=annotation_data.corrected_text,
                    user_id=current_user.user_id,
                )
                logger.info(
                    f"Updated memory {annotation_data.memory_id} with corrected text"
                )
            except Exception as e:
                logger.error(f"Error updating memory: {e}")
                # Annotation is saved, but memory update failed - log but don't fail the request
                logger.warning(
                    f"Memory annotation {annotation.id} saved but memory update failed"
                )

        return AnnotationResponse.model_validate(annotation)

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Error creating memory annotation: {e}", exc_info=True)
        raise HTTPException(
            status_code=500,
            detail=f"Failed to create memory annotation: {str(e)}",
        )
@router.post("/transcript", response_model=AnnotationResponse)
async def create_transcript_annotation(
    annotation_data: TranscriptAnnotationCreate,
    current_user: User = Depends(current_active_user),
):
    """
    Create annotation for transcript segment edit.

    The annotation is recorded but NOT applied to the transcript: it is saved
    as PENDING and unprocessed (processed=False) so the UI can show a pending
    badge. The unified apply endpoint later applies all annotations together,
    and memory reprocessing is deferred until then.
    """
    try:
        # Ownership check: the conversation must belong to the caller.
        conversation = await Conversation.find_one(
            Conversation.conversation_id == annotation_data.conversation_id,
            Conversation.user_id == current_user.user_id,
        )
        if conversation is None:
            raise HTTPException(status_code=404, detail="Conversation not found")

        # The segment index must point into the active transcript version.
        transcript = conversation.active_transcript
        segment_count = len(transcript.segments) if transcript else 0
        if annotation_data.segment_index >= segment_count:
            raise HTTPException(status_code=400, detail="Invalid segment index")

        target_segment = transcript.segments[annotation_data.segment_index]

        # Record the pending edit; original_text snapshots the current segment.
        pending_annotation = Annotation(
            annotation_type=AnnotationType.TRANSCRIPT,
            user_id=current_user.user_id,
            conversation_id=annotation_data.conversation_id,
            segment_index=annotation_data.segment_index,
            original_text=target_segment.text,
            corrected_text=annotation_data.corrected_text,
            status=AnnotationStatus.PENDING,
            processed=False,
        )
        await pending_annotation.save()
        logger.info(
            f"Created transcript annotation {pending_annotation.id} for conversation {annotation_data.conversation_id} segment {annotation_data.segment_index}"
        )

        # Intentionally no transcript mutation and no memory reprocessing here:
        # the user must click "Apply Changes" to apply annotations together.
        return AnnotationResponse.model_validate(pending_annotation)

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Error creating transcript annotation: {e}", exc_info=True)
        raise HTTPException(
            status_code=500,
            detail=f"Failed to create transcript annotation: {str(e)}",
        )
@router.get("/memory/{memory_id}", response_model=List[AnnotationResponse])
async def get_memory_annotations(
    memory_id: str,
    current_user: User = Depends(current_active_user),
):
    """Return every annotation the current user has made on a memory."""
    try:
        query = Annotation.find(
            Annotation.annotation_type == AnnotationType.MEMORY,
            Annotation.memory_id == memory_id,
            Annotation.user_id == current_user.user_id,
        )
        records = await query.to_list()
        return [AnnotationResponse.model_validate(record) for record in records]

    except Exception as e:
        logger.error(f"Error fetching memory annotations: {e}", exc_info=True)
        raise HTTPException(
            status_code=500,
            detail=f"Failed to fetch memory annotations: {str(e)}",
        )


@router.get("/transcript/{conversation_id}", response_model=List[AnnotationResponse])
async def get_transcript_annotations(
    conversation_id: str,
    current_user: User = Depends(current_active_user),
):
    """Return every transcript annotation the current user has made on a conversation."""
    try:
        query = Annotation.find(
            Annotation.annotation_type == AnnotationType.TRANSCRIPT,
            Annotation.conversation_id == conversation_id,
            Annotation.user_id == current_user.user_id,
        )
        records = await query.to_list()
        return [AnnotationResponse.model_validate(record) for record in records]

    except Exception as e:
        logger.error(f"Error fetching transcript annotations: {e}", exc_info=True)
        raise HTTPException(
            status_code=500,
            detail=f"Failed to fetch transcript annotations: {str(e)}",
        )
+ """ + try: + annotation = await Annotation.find_one( + Annotation.id == annotation_id, + Annotation.user_id == current_user.user_id, + ) + if not annotation: + raise HTTPException(status_code=404, detail="Annotation not found") + + old_status = annotation.status + annotation.status = status + annotation.updated_at = datetime.now(timezone.utc) + + # If accepting a pending suggestion, apply the correction + if ( + status == AnnotationStatus.ACCEPTED + and old_status == AnnotationStatus.PENDING + ): + if annotation.is_memory_annotation(): + # Update memory + try: + memory_service = get_memory_service() + await memory_service.update_memory( + memory_id=annotation.memory_id, + content=annotation.corrected_text, + user_id=current_user.user_id, + ) + logger.info( + f"Applied suggestion to memory {annotation.memory_id}" + ) + except Exception as e: + logger.error(f"Error applying memory suggestion: {e}") + # Don't fail the status update if memory update fails + elif annotation.is_transcript_annotation(): + # Update transcript segment + try: + conversation = await Conversation.find_one( + Conversation.conversation_id == annotation.conversation_id, + Conversation.user_id == annotation.user_id + ) + if conversation: + transcript = conversation.active_transcript + if ( + transcript + and annotation.segment_index < len(transcript.segments) + ): + transcript.segments[ + annotation.segment_index + ].text = annotation.corrected_text + await conversation.save() + logger.info( + f"Applied suggestion to transcript segment {annotation.segment_index}" + ) + except Exception as e: + logger.error(f"Error applying transcript suggestion: {e}") + # Don't fail the status update if segment update fails + + await annotation.save() + logger.info(f"Updated annotation {annotation_id} status to {status}") + + return {"status": "updated", "annotation_id": annotation_id, "new_status": status} + + except HTTPException: + raise + except Exception as e: + logger.error(f"Error updating annotation 
status: {e}", exc_info=True) + raise HTTPException( + status_code=500, + detail=f"Failed to update annotation status: {str(e)}", + ) + + +# === Diarization Annotation Routes === + + +@router.post("/diarization", response_model=AnnotationResponse) +async def create_diarization_annotation( + annotation_data: DiarizationAnnotationCreate, + current_user: User = Depends(current_active_user), +): + """ + Create annotation for speaker identification correction. + + - Validates user owns conversation + - Creates annotation record (NOT applied to transcript yet) + - Annotation is marked as unprocessed (processed=False) + - Visual indication in UI (strikethrough + corrected name) + """ + try: + # Verify conversation ownership + conversation = await Conversation.find_one( + Conversation.conversation_id == annotation_data.conversation_id, + Conversation.user_id == current_user.user_id, + ) + if not conversation: + raise HTTPException(status_code=404, detail="Conversation not found") + + # Validate segment index + active_transcript = conversation.active_transcript + if ( + not active_transcript + or annotation_data.segment_index >= len(active_transcript.segments) + ): + raise HTTPException(status_code=400, detail="Invalid segment index") + + # Create annotation (NOT applied yet) + annotation = Annotation( + annotation_type=AnnotationType.DIARIZATION, + user_id=current_user.user_id, + conversation_id=annotation_data.conversation_id, + segment_index=annotation_data.segment_index, + original_speaker=annotation_data.original_speaker, + corrected_speaker=annotation_data.corrected_speaker, + segment_start_time=annotation_data.segment_start_time, + original_text="", # Not used for diarization + corrected_text="", # Not used for diarization + status=annotation_data.status, + processed=False, # Not applied or sent to training yet + ) + await annotation.save() + logger.info( + f"Created diarization annotation {annotation.id} for conversation {annotation_data.conversation_id} segment 
@router.get("/diarization/{conversation_id}", response_model=List[AnnotationResponse])
async def get_diarization_annotations(
    conversation_id: str,
    current_user: User = Depends(current_active_user),
):
    """Return every diarization annotation the current user has made on a conversation."""
    try:
        query = Annotation.find(
            Annotation.annotation_type == AnnotationType.DIARIZATION,
            Annotation.conversation_id == conversation_id,
            Annotation.user_id == current_user.user_id,
        )
        records = await query.to_list()
        return [AnnotationResponse.model_validate(record) for record in records]

    except Exception as e:
        logger.error(f"Error fetching diarization annotations: {e}", exc_info=True)
        raise HTTPException(
            status_code=500,
            detail=f"Failed to fetch diarization annotations: {str(e)}",
        )
@router.post("/diarization/{conversation_id}/apply")
async def apply_diarization_annotations(
    conversation_id: str,
    current_user: User = Depends(current_active_user),
):
    """
    Apply pending diarization annotations to create new transcript version.

    - Finds all unprocessed diarization annotations for conversation
    - Creates NEW transcript version with corrected speaker labels
    - Marks annotations as processed (processed=True, processed_by="apply")
    - Chains memory reprocessing since speaker changes affect meaning
    - Returns job status with new version_id

    Raises:
        HTTPException 404: conversation not found / no active transcript.
        HTTPException 500: unexpected failure.
    """
    try:
        # Verify conversation ownership
        conversation = await Conversation.find_one(
            Conversation.conversation_id == conversation_id,
            Conversation.user_id == current_user.user_id,
        )
        if not conversation:
            raise HTTPException(status_code=404, detail="Conversation not found")

        # Get unprocessed diarization annotations
        annotations = await Annotation.find(
            Annotation.annotation_type == AnnotationType.DIARIZATION,
            Annotation.conversation_id == conversation_id,
            Annotation.user_id == current_user.user_id,
            Annotation.processed == False,  # Only unprocessed
        ).to_list()

        if not annotations:
            return JSONResponse(
                content={"message": "No pending annotations to apply", "applied_count": 0}
            )

        # Get active transcript version
        active_transcript = conversation.active_transcript
        if not active_transcript:
            raise HTTPException(status_code=404, detail="No active transcript found")

        # Create NEW transcript version with corrected speakers
        import uuid
        new_version_id = str(uuid.uuid4())

        # Index corrections by segment for O(1) lookup instead of scanning the
        # annotation list once per segment (O(segments * annotations) before).
        # setdefault keeps the FIRST annotation per segment, matching the
        # previous next()-based first-match semantics.
        corrections_by_segment = {}
        for a in annotations:
            corrections_by_segment.setdefault(a.segment_index, a)

        # Copy segments and apply corrections
        corrected_segments = []
        for segment_idx, segment in enumerate(active_transcript.segments):
            corrected_segment = segment.model_copy()
            annotation_for_segment = corrections_by_segment.get(segment_idx)
            if annotation_for_segment:
                corrected_segment.speaker = annotation_for_segment.corrected_speaker
            corrected_segments.append(corrected_segment)

        # Add new version
        conversation.add_transcript_version(
            version_id=new_version_id,
            transcript=active_transcript.transcript,  # Same transcript text
            words=active_transcript.words,  # Same word timings
            segments=corrected_segments,  # Corrected speaker labels
            provider=active_transcript.provider,
            model=active_transcript.model,
            processing_time_seconds=None,
            metadata={
                "reprocessing_type": "diarization_annotations",
                "source_version_id": active_transcript.version_id,
                "trigger": "manual_annotation_apply",
                "applied_annotation_count": len(annotations),
            },
            set_as_active=True,
        )

        await conversation.save()
        logger.info(
            f"Created new transcript version {new_version_id} with {len(annotations)} diarization corrections"
        )

        # Mark annotations as processed.
        # NOTE(review): unlike the unified apply endpoint, status is NOT set
        # to ACCEPTED here — confirm whether that asymmetry is intentional.
        for annotation in annotations:
            annotation.processed = True
            annotation.processed_at = datetime.now(timezone.utc)
            annotation.processed_by = "apply"
            await annotation.save()

        # Chain memory reprocessing
        from advanced_omi_backend.models.job import JobPriority
        from advanced_omi_backend.workers.memory_jobs import enqueue_memory_processing

        enqueue_memory_processing(
            client_id=conversation.client_id,
            user_id=current_user.user_id,
            user_email=current_user.email,
            conversation_id=conversation_id,
            priority=JobPriority.NORMAL,
        )

        return JSONResponse(content={
            "message": "Diarization annotations applied",
            "version_id": new_version_id,
            "applied_count": len(annotations),
            "status": "success"
        })

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Error applying diarization annotations: {e}", exc_info=True)
        raise HTTPException(
            status_code=500,
            detail=f"Failed to apply diarization annotations: {str(e)}",
        )
@router.post("/{conversation_id}/apply")
async def apply_all_annotations(
    conversation_id: str,
    current_user: User = Depends(current_active_user),
):
    """
    Apply all pending annotations (diarization + transcript) to create new version.

    - Finds all unprocessed annotations (both DIARIZATION and TRANSCRIPT types)
    - Creates ONE new transcript version with all changes applied
    - Marks all annotations as processed and ACCEPTED
    - Triggers memory reprocessing once

    Raises:
        HTTPException 404: conversation not found / no active transcript.
        HTTPException 500: unexpected failure.
    """
    try:
        # Verify conversation ownership
        conversation = await Conversation.find_one(
            Conversation.conversation_id == conversation_id,
            Conversation.user_id == current_user.user_id,
        )
        if not conversation:
            raise HTTPException(status_code=404, detail="Conversation not found")

        # Get ALL unprocessed annotations (both types)
        annotations = await Annotation.find(
            Annotation.conversation_id == conversation_id,
            Annotation.user_id == current_user.user_id,
            Annotation.processed == False,
        ).to_list()

        if not annotations:
            return JSONResponse(content={
                "message": "No pending annotations to apply",
                "diarization_count": 0,
                "transcript_count": 0,
            })

        # Separate by type
        diarization_annotations = [a for a in annotations if a.annotation_type == AnnotationType.DIARIZATION]
        transcript_annotations = [a for a in annotations if a.annotation_type == AnnotationType.TRANSCRIPT]

        # Get active transcript
        active_transcript = conversation.active_transcript
        if not active_transcript:
            raise HTTPException(status_code=404, detail="No active transcript found")

        # Create new version with ALL corrections applied
        import uuid
        new_version_id = str(uuid.uuid4())

        # Index corrections by segment for O(1) lookup instead of a next()
        # scan per segment (O(segments * annotations) before). setdefault
        # keeps the FIRST annotation per segment, matching the previous
        # next()-based first-match semantics.
        diar_by_segment = {}
        for a in diarization_annotations:
            diar_by_segment.setdefault(a.segment_index, a)
        text_by_segment = {}
        for a in transcript_annotations:
            text_by_segment.setdefault(a.segment_index, a)

        corrected_segments = []
        for segment_idx, segment in enumerate(active_transcript.segments):
            corrected_segment = segment.model_copy()

            # Apply diarization correction (if exists)
            diar_annotation = diar_by_segment.get(segment_idx)
            if diar_annotation:
                corrected_segment.speaker = diar_annotation.corrected_speaker

            # Apply transcript correction (if exists)
            transcript_annotation = text_by_segment.get(segment_idx)
            if transcript_annotation:
                corrected_segment.text = transcript_annotation.corrected_text

            corrected_segments.append(corrected_segment)

        # Add new version
        conversation.add_transcript_version(
            version_id=new_version_id,
            transcript=active_transcript.transcript,
            words=active_transcript.words,  # Preserved (may be misaligned for text edits)
            segments=corrected_segments,
            provider=active_transcript.provider,
            model=active_transcript.model,
            metadata={
                "reprocessing_type": "unified_annotations",
                "source_version_id": active_transcript.version_id,
                "trigger": "manual_annotation_apply",
                "diarization_count": len(diarization_annotations),
                "transcript_count": len(transcript_annotations),
            },
            set_as_active=True,
        )

        await conversation.save()
        logger.info(
            f"Applied {len(annotations)} annotations (diarization: {len(diarization_annotations)}, transcript: {len(transcript_annotations)})"
        )

        # Mark all annotations as processed
        for annotation in annotations:
            annotation.processed = True
            annotation.processed_at = datetime.now(timezone.utc)
            annotation.processed_by = "apply"
            annotation.status = AnnotationStatus.ACCEPTED
            await annotation.save()

        # Trigger memory reprocessing (once for all changes)
        from advanced_omi_backend.models.job import JobPriority
        from advanced_omi_backend.workers.memory_jobs import enqueue_memory_processing

        enqueue_memory_processing(
            client_id=conversation.client_id,
            user_id=current_user.user_id,
            user_email=current_user.email,
            conversation_id=conversation_id,
            priority=JobPriority.NORMAL,
        )

        return JSONResponse(content={
            "message": f"Applied {len(diarization_annotations)} diarization and {len(transcript_annotations)} transcript annotations",
            "version_id": new_version_id,
            "diarization_count": len(diarization_annotations),
            "transcript_count": len(transcript_annotations),
            "status": "success",
        })

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Error applying annotations: {e}", exc_info=True)
        raise HTTPException(
            status_code=500,
            detail=f"Failed to apply annotations: {str(e)}",
        )
async def upload_audio_from_drive_folder(
    gdrive_folder_id: str = Query(..., description="Google Drive Folder ID containing audio files (e.g., the string after /folders/ in the URL)"),
    current_user: User = Depends(current_superuser),
    device_name: str = Query(default="upload"),
):
    """Download audio files from a Google Drive folder and queue them for processing.

    Superuser-only. Validation failures from the Drive download are surfaced
    as HTTP 400; successful downloads are handed to the audio controller with
    source="gdrive".
    """
    # Pull the files from Drive first; a validation error means the folder or
    # its contents were unusable, which is a client error (400).
    try:
        files = await download_audio_files_from_drive(gdrive_folder_id, current_user.id)
    except AudioValidationError as e:
        raise HTTPException(status_code=400, detail=str(e))

    # Delegate persistence + job enqueueing to the controller.
    return await audio_controller.upload_and_process_audio_files(
        current_user, files, device_name, source="gdrive"
    )