55import time
66from pathlib import Path
77from typing import List , Dict , Any , Optional , Callable
8+ from dataclasses import dataclass
89
910from ..config import Config
1011from ..services import QdrantClient
1415from .progressive_metadata import ProgressiveMetadata
1516
1617
@dataclass
class ThroughputStats:
    """Statistics for tracking indexing throughput and throttling.

    A plain value object produced periodically during indexing so progress
    reporting can show current processing rates and surface any detected
    slowdown (e.g. embedding-API rate limiting).
    """

    # Rolling-window processing rates, expressed per minute.
    files_per_minute: float = 0.0
    chunks_per_minute: float = 0.0
    # Estimated embedding API request rate; the producer assumes one
    # request per chunk — TODO confirm against the embedding provider.
    embedding_requests_per_minute: float = 0.0
    # True when the producer detected throttling (API rate limiting or
    # unusually slow per-file processing); reason is human-readable.
    is_throttling: bool = False
    throttle_reason: str = ""
    # Mean wall-clock seconds spent per processed file in the window.
    average_processing_time_per_file: float = 0.0
29+
1730class SmartIndexer (GitAwareDocumentProcessor ):
1831 """Smart indexer with progressive metadata and resumability."""
1932
@@ -197,13 +210,20 @@ def _do_incremental_index(
197210 def _process_files_with_metadata (
198211 self , files : List [Path ], batch_size : int , progress_callback : Optional [Callable ]
199212 ) -> ProcessingStats :
200- """Process files with progressive metadata updates."""
213+ """Process files with progressive metadata updates and throughput monitoring ."""
201214
202215 stats = ProcessingStats ()
203216 stats .start_time = time .time ()
204217
205218 batch_points = []
206219
220+ # Throughput tracking
221+ throughput_window_start = time .time ()
222+ throughput_window_files = 0
223+ throughput_window_chunks = 0
224+ throughput_window_size = 60.0 # 1 minute window
225+ last_throttle_check = time .time ()
226+
207227 def update_metadata (chunks_count = 0 , failed = False ):
208228 """Update metadata after each file."""
209229 self .progressive_metadata .update_progress (
@@ -212,6 +232,55 @@ def update_metadata(chunks_count=0, failed=False):
212232 failed_files = 1 if failed else 0 ,
213233 )
214234
235+ def calculate_throughput () -> ThroughputStats :
236+ """Calculate current throughput and detect throttling."""
237+ current_time = time .time ()
238+ elapsed = current_time - throughput_window_start
239+
240+ if elapsed <= 0 :
241+ return ThroughputStats ()
242+
243+ # Calculate rates per minute
244+ files_per_min = (throughput_window_files / elapsed ) * 60
245+ chunks_per_min = (throughput_window_chunks / elapsed ) * 60
246+ avg_time_per_file = elapsed / max (throughput_window_files , 1 )
247+
248+ # Detect throttling by checking embedding provider
249+ is_throttling = False
250+ throttle_reason = ""
251+
252+ # Check if we're using VoyageAI and detect rate limiting
253+ provider_name = self .embedding_provider .get_provider_name ()
254+ if provider_name == "voyage-ai" :
255+ # Check if rate limiter indicates throttling
256+ if hasattr (self .embedding_provider , "rate_limiter" ):
257+ rate_limiter = self .embedding_provider .rate_limiter
258+ wait_time = rate_limiter .wait_time (100 ) # Estimate for 100 tokens
259+ if wait_time > 0.5 : # If we need to wait more than 0.5 seconds
260+ is_throttling = True
261+ throttle_reason = f"API rate limiting (wait: { wait_time :.1f} s)"
262+ elif rate_limiter .request_tokens < 10 : # Low on request tokens
263+ is_throttling = True
264+ throttle_reason = "API request quota running low"
265+
266+ # Detect slow processing (could indicate network issues or service slowdown)
267+ if (
268+ avg_time_per_file > 5.0 and not is_throttling
269+ ): # More than 5 seconds per file
270+ is_throttling = True
271+ throttle_reason = (
272+ f"Slow processing detected ({ avg_time_per_file :.1f} s/file)"
273+ )
274+
275+ return ThroughputStats (
276+ files_per_minute = files_per_min ,
277+ chunks_per_minute = chunks_per_min ,
278+ embedding_requests_per_minute = chunks_per_min , # Assuming 1 request per chunk
279+ is_throttling = is_throttling ,
280+ throttle_reason = throttle_reason ,
281+ average_processing_time_per_file = avg_time_per_file ,
282+ )
283+
215284 for i , file_path in enumerate (files ):
216285 points = []
217286
@@ -222,9 +291,11 @@ def update_metadata(chunks_count=0, failed=False):
222291 if points :
223292 batch_points .extend (points )
224293 stats .chunks_created += len (points )
294+ throughput_window_chunks += len (points )
225295
226296 stats .files_processed += 1
227297 stats .total_size += file_path .stat ().st_size
298+ throughput_window_files += 1
228299
229300 # Process batch if full
230301 if len (batch_points ) >= batch_size :
@@ -235,9 +306,39 @@ def update_metadata(chunks_count=0, failed=False):
235306 # Update metadata after successful processing
236307 update_metadata (chunks_count = len (points ), failed = False )
237308
238- # Call progress callback
309+ # Calculate throughput every 30 seconds or every 50 files
310+ current_time = time .time ()
311+ if (current_time - last_throttle_check > 30 ) or (i % 50 == 0 and i > 0 ):
312+ throughput_stats = calculate_throughput ()
313+ last_throttle_check = current_time
314+
315+ # Reset throughput window if it's been more than window size
316+ if current_time - throughput_window_start > throughput_window_size :
317+ throughput_window_start = current_time
318+ throughput_window_files = 0
319+ throughput_window_chunks = 0
320+
321+ # Call progress callback with throughput info
239322 if progress_callback :
240- progress_callback (i + 1 , len (files ), file_path )
323+ throughput_stats = calculate_throughput ()
324+
325+ # Create enhanced info string
326+ info_parts = []
327+ if throughput_stats .files_per_minute > 0 :
328+ info_parts .append (
329+ f"{ throughput_stats .files_per_minute :.1f} files/min"
330+ )
331+ if throughput_stats .chunks_per_minute > 0 :
332+ info_parts .append (
333+ f"{ throughput_stats .chunks_per_minute :.1f} chunks/min"
334+ )
335+ if throughput_stats .is_throttling :
336+ info_parts .append (f"🐌 { throughput_stats .throttle_reason } " )
337+ elif throughput_stats .files_per_minute > 60 : # Fast processing
338+ info_parts .append ("🚀 Full speed" )
339+
340+ info = " | " .join (info_parts ) if info_parts else None
341+ progress_callback (i + 1 , len (files ), file_path , info = info )
241342
242343 except Exception as e :
243344 stats .failed_files += 1
0 commit comments