Skip to content

Commit 25527a6

Browse files
committed
Enhance conversation retrieval with pagination and orphan handling

- Updated `get_conversations` function to support pagination through `limit` and `offset` parameters, improving performance for large datasets.
- Consolidated query logic to fetch both normal and orphan conversations in a single database call, reducing round-trips and enhancing efficiency.
- Modified the response structure to include total count, limit, and offset in the returned data for better client-side handling.
- Adjusted database indexing to optimize queries for paginated results, ensuring faster access to conversation data.
1 parent 35a9961 commit 25527a6

File tree

4 files changed

+71
-36
lines changed

4 files changed

+71
-36
lines changed

backends/advanced/src/advanced_omi_backend/controllers/conversation_controller.py

Lines changed: 64 additions & 33 deletions
Original file line number | Diff line number | Diff line change
@@ -157,59 +157,85 @@ async def get_conversation(conversation_id: str, user: User):
157157
return JSONResponse(status_code=500, content={"error": "Error fetching conversation"})
158158

159159

160-
async def get_conversations(user: User, include_deleted: bool = False, include_unprocessed: bool = False):
161-
"""Get conversations with speech only (speech-driven architecture)."""
160+
async def get_conversations(
161+
user: User,
162+
include_deleted: bool = False,
163+
include_unprocessed: bool = False,
164+
limit: int = 200,
165+
offset: int = 0,
166+
):
167+
"""Get conversations with speech only (speech-driven architecture).
168+
169+
Uses a single consolidated query with ``$or`` when ``include_unprocessed``
170+
is True, eliminating multiple round-trips and Python-side merge/sort.
171+
Results are paginated with ``limit``/``offset``.
172+
"""
162173
try:
163-
# Build base query conditions
164174
user_filter = {} if user.is_superuser else {"user_id": str(user.user_id)}
165175

176+
# Build query conditions — single $or when orphans are requested
177+
conditions = []
178+
179+
# Condition 1: normal (non-deleted or all) conversations
166180
if include_deleted:
167-
# No deleted filter - show everything
168-
base_query = user_filter
181+
conditions.append({}) # no filter on deleted
169182
else:
170-
base_query = {**user_filter, "deleted": False}
171-
172-
user_conversations = (
173-
await Conversation.find(base_query)
174-
.sort(-Conversation.created_at)
175-
.to_list()
176-
)
183+
conditions.append({"deleted": False})
177184

178-
# If include_unprocessed, also fetch orphan conversations
179-
orphan_ids = set()
180185
if include_unprocessed:
181-
# Orphan type 1: always_persist conversations stuck in pending/failed (not deleted)
182-
orphan_query_1 = {
183-
**user_filter,
186+
# Orphan type 1: always_persist stuck in pending/failed (not deleted)
187+
conditions.append({
184188
"always_persist": True,
185189
"processing_status": {"$in": ["pending_transcription", "transcription_failed"]},
186190
"deleted": False,
187-
}
191+
})
188192
# Orphan type 2: soft-deleted due to no speech but have audio data
189-
orphan_query_2 = {
190-
**user_filter,
193+
conditions.append({
191194
"deleted": True,
192195
"deletion_reason": {"$in": [
193196
"no_meaningful_speech",
194197
"audio_file_not_ready",
195198
"no_meaningful_speech_batch_transcription",
196199
]},
197200
"audio_chunks_count": {"$gt": 0},
198-
}
201+
})
202+
203+
# Assemble final query
204+
if len(conditions) == 1:
205+
query = {**user_filter, **conditions[0]}
206+
else:
207+
query = {**user_filter, "$or": conditions}
199208

200-
orphan_convs_1 = await Conversation.find(orphan_query_1).sort(-Conversation.created_at).to_list()
201-
orphan_convs_2 = await Conversation.find(orphan_query_2).sort(-Conversation.created_at).to_list()
209+
total = await Conversation.find(query).count()
202210

203-
# Merge orphans that aren't already in the main list
204-
existing_ids = {c.conversation_id for c in user_conversations}
205-
for conv in orphan_convs_1 + orphan_convs_2:
206-
orphan_ids.add(conv.conversation_id)
207-
if conv.conversation_id not in existing_ids:
208-
user_conversations.append(conv)
209-
existing_ids.add(conv.conversation_id)
211+
user_conversations = (
212+
await Conversation.find(query)
213+
.sort(-Conversation.created_at)
214+
.skip(offset)
215+
.limit(limit)
216+
.to_list()
217+
)
210218

211-
# Re-sort after merge
212-
user_conversations.sort(key=lambda c: c.created_at or datetime.min, reverse=True)
219+
# Mark orphans in results (lightweight in-memory check on the page)
220+
orphan_ids: set = set()
221+
if include_unprocessed:
222+
for conv in user_conversations:
223+
is_orphan_type1 = (
224+
conv.always_persist
225+
and conv.processing_status in ("pending_transcription", "transcription_failed")
226+
and not conv.deleted
227+
)
228+
is_orphan_type2 = (
229+
conv.deleted
230+
and conv.deletion_reason in (
231+
"no_meaningful_speech",
232+
"audio_file_not_ready",
233+
"no_meaningful_speech_batch_transcription",
234+
)
235+
and (conv.audio_chunks_count or 0) > 0
236+
)
237+
if is_orphan_type1 or is_orphan_type2:
238+
orphan_ids.add(conv.conversation_id)
213239

214240
# Build response with explicit curated fields - minimal for list view
215241
conversations = []
@@ -245,7 +271,12 @@ async def get_conversations(user: User, include_deleted: bool = False, include_u
245271
}
246272
)
247273

248-
return {"conversations": conversations}
274+
return {
275+
"conversations": conversations,
276+
"total": total,
277+
"limit": limit,
278+
"offset": offset,
279+
}
249280

250281
except Exception as e:
251282
logger.exception(f"Error fetching conversations: {e}")

backends/advanced/src/advanced_omi_backend/models/conversation.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -384,7 +384,7 @@ class Settings:
384384
"conversation_id",
385385
"user_id",
386386
"created_at",
387-
[("user_id", 1), ("created_at", -1)], # Compound index for user queries
387+
[("user_id", 1), ("deleted", 1), ("created_at", -1)], # Compound index for paginated list queries
388388
IndexModel([("external_source_id", 1)], sparse=True) # Sparse index for deduplication
389389
]
390390

backends/advanced/src/advanced_omi_backend/routers/modules/conversation_routes.py

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -33,10 +33,12 @@ async def close_current_conversation(
3333
async def get_conversations(
3434
include_deleted: bool = Query(False, description="Include soft-deleted conversations"),
3535
include_unprocessed: bool = Query(False, description="Include orphan audio sessions (always_persist with failed/pending transcription)"),
36+
limit: int = Query(200, ge=1, le=500, description="Max conversations to return"),
37+
offset: int = Query(0, ge=0, description="Number of conversations to skip"),
3638
current_user: User = Depends(current_active_user)
3739
):
3840
"""Get conversations. Admins see all conversations, users see only their own."""
39-
return await conversation_controller.get_conversations(current_user, include_deleted, include_unprocessed)
41+
return await conversation_controller.get_conversations(current_user, include_deleted, include_unprocessed, limit, offset)
4042

4143

4244
@router.get("/{conversation_id}")

backends/advanced/webui/src/services/api.ts

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -107,10 +107,12 @@ export const authApi = {
107107
}
108108

109109
export const conversationsApi = {
110-
getAll: (includeDeleted?: boolean, includeUnprocessed?: boolean) => api.get('/api/conversations', {
110+
getAll: (includeDeleted?: boolean, includeUnprocessed?: boolean, limit?: number, offset?: number) => api.get('/api/conversations', {
111111
params: {
112112
...(includeDeleted !== undefined && { include_deleted: includeDeleted }),
113113
...(includeUnprocessed !== undefined && { include_unprocessed: includeUnprocessed }),
114+
...(limit !== undefined && { limit }),
115+
...(offset !== undefined && { offset }),
114116
}
115117
}),
116118
getById: (id: string) => api.get(`/api/conversations/${id}`),

0 commit comments

Comments (0)