Skip to content

Commit e662f46

Browse files
committed
Refactor audio processing to utilize MongoDB chunks and enhance job handling
- Removed audio file path parameters from various functions, transitioning to audio data retrieval from MongoDB chunks.
- Updated the `start_post_conversation_jobs` function to reflect changes in audio handling, ensuring jobs reconstruct audio from database chunks.
- Enhanced the `transcribe_full_audio_job` and `recognise_speakers_job` to process audio directly from memory, eliminating the need for temporary files.
- Improved error handling and logging for audio data retrieval, ensuring better feedback during processing.
- Added a new utility function for converting PCM data to WAV format in memory, streamlining audio format handling.
1 parent 32e2e47 commit e662f46

File tree

9 files changed

+121
-146
lines changed

9 files changed

+121
-146
lines changed

backends/advanced/src/advanced_omi_backend/controllers/audio_controller.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -156,7 +156,6 @@ async def upload_and_process_audio_files(
156156
job_ids = start_post_conversation_jobs(
157157
conversation_id=conversation_id,
158158
audio_uuid=audio_uuid,
159-
audio_file_path=None, # No file path - using MongoDB chunks
160159
user_id=user.user_id,
161160
post_transcription=True, # Run batch transcription for uploads
162161
client_id=client_id # Pass client_id for UI tracking

backends/advanced/src/advanced_omi_backend/controllers/conversation_controller.py

Lines changed: 12 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
import logging
66
import time
7+
from datetime import datetime
78
from pathlib import Path
89

910
from fastapi.responses import JSONResponse
@@ -382,34 +383,20 @@ async def reprocess_transcript(conversation_id: str, user: User):
382383
if not user.is_superuser and conversation_model.user_id != str(user.user_id):
383384
return JSONResponse(status_code=403, content={"error": "Access forbidden. You can only reprocess your own conversations."})
384385

385-
# Get audio_uuid and file path from conversation
386+
# Get audio_uuid from conversation
386387
audio_uuid = conversation_model.audio_uuid
387-
audio_path = conversation_model.audio_path
388388

389-
if not audio_path:
390-
return JSONResponse(
391-
status_code=400, content={"error": "No audio file found for this conversation"}
392-
)
393-
394-
# Check if file exists - try multiple possible locations
395-
possible_paths = [
396-
Path("/app/audio_chunks") / audio_path,
397-
Path(audio_path), # fallback to relative path
398-
]
389+
# Validate audio chunks exist in MongoDB
390+
chunks = await AudioChunkDocument.find(
391+
AudioChunkDocument.conversation_id == conversation_id
392+
).to_list()
399393

400-
full_audio_path = None
401-
for path in possible_paths:
402-
if path.exists():
403-
full_audio_path = path
404-
break
405-
406-
if not full_audio_path:
394+
if not chunks:
407395
return JSONResponse(
408-
status_code=422,
396+
status_code=404,
409397
content={
410-
"error": "Audio file not found on disk",
411-
"details": f"Conversation exists but audio file '{audio_path}' is missing from expected locations",
412-
"searched_paths": [str(p) for p in possible_paths]
398+
"error": "No audio data found for this conversation",
399+
"details": f"Conversation '{conversation_id}' exists but has no audio chunks in MongoDB"
413400
}
414401
)
415402

@@ -430,12 +417,11 @@ async def reprocess_transcript(conversation_id: str, user: User):
430417
transcribe_full_audio_job,
431418
)
432419

433-
# Job 1: Transcribe audio to text
420+
# Job 1: Transcribe audio to text (reconstructs from MongoDB chunks)
434421
transcript_job = transcription_queue.enqueue(
435422
transcribe_full_audio_job,
436423
conversation_id,
437424
audio_uuid,
438-
str(full_audio_path),
439425
version_id,
440426
"reprocess",
441427
job_timeout=600,
@@ -446,14 +432,11 @@ async def reprocess_transcript(conversation_id: str, user: User):
446432
)
447433
logger.info(f"📥 RQ: Enqueued transcription job {transcript_job.id}")
448434

449-
# Job 2: Recognize speakers (depends on transcription)
435+
# Job 2: Recognize speakers (depends on transcription, reads data from DB)
450436
speaker_job = transcription_queue.enqueue(
451437
recognise_speakers_job,
452438
conversation_id,
453439
version_id,
454-
str(full_audio_path),
455-
"", # transcript_text - will be read from DB
456-
[], # words - will be read from DB
457440
depends_on=transcript_job,
458441
job_timeout=600,
459442
result_ttl=JOB_RESULT_TTL,

backends/advanced/src/advanced_omi_backend/controllers/queue_controller.py

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -366,7 +366,6 @@ def start_streaming_jobs(
366366
def start_post_conversation_jobs(
367367
conversation_id: str,
368368
audio_uuid: str,
369-
audio_file_path: str,
370369
user_id: str,
371370
post_transcription: bool = True,
372371
transcript_version_id: Optional[str] = None,
@@ -382,15 +381,17 @@ def start_post_conversation_jobs(
382381
3. Memory extraction job - Extracts memories from conversation (parallel)
383382
4. Title/summary generation job - Generates title and summary (parallel)
384383
384+
Note: Audio is reconstructed from MongoDB chunks, not files.
385+
385386
Args:
386387
conversation_id: Conversation identifier
387388
audio_uuid: Audio UUID for job tracking
388-
audio_file_path: Path to audio file
389389
user_id: User identifier
390390
post_transcription: If True, run batch transcription step (for uploads)
391391
If False, skip transcription (streaming already has it)
392392
transcript_version_id: Transcript version ID (auto-generated if None)
393393
depends_on_job: Optional job dependency for first job
394+
client_id: Client ID for UI tracking
394395
395396
Returns:
396397
Dict with job IDs (transcription will be None if post_transcription=False)
@@ -416,7 +417,6 @@ def start_post_conversation_jobs(
416417
transcribe_full_audio_job,
417418
conversation_id,
418419
audio_uuid,
419-
audio_file_path,
420420
version_id,
421421
"batch", # trigger
422422
job_timeout=1800, # 30 minutes
@@ -439,9 +439,6 @@ def start_post_conversation_jobs(
439439
recognise_speakers_job,
440440
conversation_id,
441441
version_id,
442-
audio_file_path,
443-
"", # transcript_text - will be read from DB
444-
[], # words - will be read from DB
445442
job_timeout=1200, # 20 minutes
446443
result_ttl=JOB_RESULT_TTL,
447444
depends_on=speaker_depends_on,

backends/advanced/src/advanced_omi_backend/controllers/websocket_controller.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -896,7 +896,6 @@ async def _process_batch_audio_complete(
896896
job_ids = start_post_conversation_jobs(
897897
conversation_id=conversation_id,
898898
audio_uuid=audio_uuid,
899-
audio_file_path=None, # No file path - using MongoDB chunks
900899
user_id=None, # Will be read from conversation in DB by jobs
901900
post_transcription=True, # Run batch transcription for uploads
902901
client_id=client_id # Pass client_id for UI tracking

backends/advanced/src/advanced_omi_backend/utils/audio_utils.py

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -258,6 +258,49 @@ async def process_audio_chunk(
258258
client_state.update_audio_received(chunk)
259259

260260

261+
def pcm_to_wav_bytes(
262+
pcm_data: bytes,
263+
sample_rate: int = 16000,
264+
channels: int = 1,
265+
sample_width: int = 2
266+
) -> bytes:
267+
"""
268+
Convert raw PCM audio data to WAV format in memory.
269+
270+
Args:
271+
pcm_data: Raw PCM audio bytes
272+
sample_rate: Sample rate in Hz (default: 16000)
273+
channels: Number of audio channels (default: 1 for mono)
274+
sample_width: Sample width in bytes (default: 2 for 16-bit)
275+
276+
Returns:
277+
WAV file data as bytes
278+
"""
279+
import wave
280+
import io
281+
282+
logger.debug(
283+
f"Converting PCM to WAV in memory: {len(pcm_data)} bytes "
284+
f"(rate={sample_rate}, channels={channels}, width={sample_width})"
285+
)
286+
287+
# Use BytesIO to create WAV in memory
288+
wav_buffer = io.BytesIO()
289+
290+
with wave.open(wav_buffer, 'wb') as wav_file:
291+
wav_file.setnchannels(channels)
292+
wav_file.setsampwidth(sample_width)
293+
wav_file.setframerate(sample_rate)
294+
wav_file.writeframes(pcm_data)
295+
296+
# Get the WAV bytes
297+
wav_bytes = wav_buffer.getvalue()
298+
299+
logger.debug(f"Created WAV in memory: {len(wav_bytes)} bytes")
300+
301+
return wav_bytes
302+
303+
261304
def write_pcm_to_wav(
262305
pcm_data: bytes,
263306
output_path: str,

backends/advanced/src/advanced_omi_backend/workers/conversation_jobs.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -527,7 +527,6 @@ async def open_conversation_job(
527527
job_ids = start_post_conversation_jobs(
528528
conversation_id=conversation_id,
529529
audio_uuid=session_id,
530-
audio_file_path=file_path,
531530
user_id=user_id,
532531
post_transcription=True, # Run batch transcription for streaming audio
533532
client_id=client_id # Pass client_id for UI tracking

backends/advanced/src/advanced_omi_backend/workers/speaker_jobs.py

Lines changed: 38 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -121,26 +121,25 @@ async def check_enrolled_speakers_job(
121121
async def recognise_speakers_job(
122122
conversation_id: str,
123123
version_id: str,
124-
audio_path: str,
125-
transcript_text: str,
126-
words: list,
124+
transcript_text: str = "",
125+
words: list = None,
127126
*,
128127
redis_client=None
129128
) -> Dict[str, Any]:
130129
"""
131130
RQ job function for identifying speakers in a transcribed conversation.
132131
133132
This job runs after transcription and:
134-
1. Calls speaker recognition service to identify speakers
135-
2. Updates the transcript version with identified speaker labels
136-
3. Returns results for downstream jobs (memory)
133+
1. Reconstructs audio from MongoDB chunks
134+
2. Calls speaker recognition service to identify speakers
135+
3. Updates the transcript version with identified speaker labels
136+
4. Returns results for downstream jobs (memory)
137137
138138
Args:
139139
conversation_id: Conversation ID
140140
version_id: Transcript version ID to update
141-
audio_path: Path to audio file
142-
transcript_text: Transcript text from transcription job
143-
words: Word-level timing data from transcription job
141+
transcript_text: Transcript text from transcription job (optional, reads from DB if empty)
142+
words: Word-level timing data from transcription job (optional, reads from DB if empty)
144143
redis_client: Redis client (injected by decorator)
145144
146145
Returns:
@@ -186,77 +185,52 @@ async def recognise_speakers_job(
186185
}
187186

188187
# Reconstruct audio from MongoDB chunks
189-
import tempfile
190-
from pathlib import Path
191188
from advanced_omi_backend.utils.audio_chunk_utils import reconstruct_wav_from_conversation
192189

193190
logger.info(f"📦 Reconstructing audio from MongoDB chunks for conversation {conversation_id}")
194191

195192
# Call speaker recognition service
196193
try:
197-
# Reconstruct WAV from MongoDB chunks
194+
# Reconstruct WAV from MongoDB chunks (already in memory as bytes)
198195
wav_data = await reconstruct_wav_from_conversation(conversation_id)
199196

200-
# Write to temporary file for speaker recognition service
201-
temp_wav_file = tempfile.NamedTemporaryFile(
202-
suffix=".wav",
203-
delete=False,
204-
prefix=f"speaker_recog_{conversation_id[:8]}_"
197+
logger.info(
198+
f"📦 Reconstructed audio from MongoDB chunks: "
199+
f"{len(wav_data) / 1024 / 1024:.2f} MB"
205200
)
206201

207-
try:
208-
temp_wav_file.write(wav_data)
209-
temp_wav_file.flush()
210-
temp_wav_path = temp_wav_file.name
211-
temp_wav_file.close()
202+
# Read transcript text and words from the transcript version
203+
# (Parameters may be empty if called via job dependency)
204+
actual_transcript_text = transcript_text or transcript_version.transcript or ""
205+
actual_words = words if words else []
212206

213-
logger.info(
214-
f"📁 Created temporary WAV file for speaker recognition: {temp_wav_path} "
215-
f"({len(wav_data) / 1024 / 1024:.2f} MB)"
216-
)
207+
# If words not provided, we need to get them from metadata
208+
if not actual_words and transcript_version.metadata:
209+
actual_words = transcript_version.metadata.get("words", [])
217210

218-
# Read transcript text and words from the transcript version
219-
# (Parameters may be empty if called via job dependency)
220-
actual_transcript_text = transcript_text or transcript_version.transcript or ""
221-
actual_words = words if words else []
222-
223-
# If words not provided, we need to get them from metadata
224-
if not actual_words and transcript_version.metadata:
225-
actual_words = transcript_version.metadata.get("words", [])
226-
227-
if not actual_transcript_text:
228-
logger.warning(f"🎤 No transcript text found in version {version_id}")
229-
# Clean up temp file before returning
230-
Path(temp_wav_path).unlink(missing_ok=True)
231-
return {
232-
"success": False,
233-
"conversation_id": conversation_id,
234-
"version_id": version_id,
235-
"error": "No transcript text available",
236-
"processing_time_seconds": 0
237-
}
238-
239-
transcript_data = {
240-
"text": actual_transcript_text,
241-
"words": actual_words
211+
if not actual_transcript_text:
212+
logger.warning(f"🎤 No transcript text found in version {version_id}")
213+
return {
214+
"success": False,
215+
"conversation_id": conversation_id,
216+
"version_id": version_id,
217+
"error": "No transcript text available",
218+
"processing_time_seconds": 0
242219
}
243220

244-
logger.info(f"🎤 Calling speaker recognition service...")
221+
transcript_data = {
222+
"text": actual_transcript_text,
223+
"words": actual_words
224+
}
245225

246-
# Call speaker service with temporary file path
247-
speaker_result = await speaker_client.diarize_identify_match(
248-
audio_path=temp_wav_path,
249-
transcript_data=transcript_data,
250-
user_id=user_id
251-
)
226+
logger.info(f"🎤 Calling speaker recognition service...")
252227

253-
finally:
254-
# Clean up temporary file
255-
try:
256-
Path(temp_wav_path).unlink(missing_ok=True)
257-
logger.debug(f"🧹 Deleted temporary WAV file: {temp_wav_path}")
258-
except Exception as cleanup_error:
259-
logger.warning(f"Failed to delete temporary file {temp_wav_path}: {cleanup_error}")
228+
# Call speaker service with in-memory audio data (no temp file needed!)
229+
speaker_result = await speaker_client.diarize_identify_match(
230+
audio_data=wav_data, # Pass bytes directly, no disk I/O
231+
transcript_data=transcript_data,
232+
user_id=user_id
233+
)
260234

261235
except ValueError as e:
262236
# No chunks found for conversation

0 commit comments

Comments (0)