Skip to content

Commit 7e849e9

Browse files
committed
added support to show performance during indexing
1 parent 48682d4 commit 7e849e9

File tree

4 files changed

+144
-13
lines changed

4 files changed

+144
-13
lines changed

README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ Choose an installation method:
3131
sudo apt update && sudo apt install pipx
3232

3333
# Install code-indexer using pipx (from latest release)
34-
pipx install https://github.com/jsbattig/code-indexer/releases/download/v0.0.19.0/code_indexer-0.0.19.0-py3-none-any.whl
34+
pipx install https://github.com/jsbattig/code-indexer/releases/download/v0.0.20.0/code_indexer-0.0.20.0-py3-none-any.whl
3535

3636
# Or install directly from git (latest development)
3737
pipx install git+https://github.com/jsbattig/code-indexer.git
@@ -47,7 +47,7 @@ python3 -m venv ~/code-indexer-env
4747
source ~/code-indexer-env/bin/activate
4848

4949
# Install from GitHub releases
50-
pip install https://github.com/jsbattig/code-indexer/releases/download/v0.0.19.0/code_indexer-0.0.19.0-py3-none-any.whl
50+
pip install https://github.com/jsbattig/code-indexer/releases/download/v0.0.20.0/code_indexer-0.0.20.0-py3-none-any.whl
5151

5252
# Or install directly from git (latest development)
5353
pip install git+https://github.com/jsbattig/code-indexer.git

src/code_indexer/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,5 +5,5 @@
55
to provide code search capabilities.
66
"""
77

8-
__version__ = "0.0.19.0"
8+
__version__ = "0.0.20.0"
99
__author__ = "Code Indexer Team"

src/code_indexer/cli.py

Lines changed: 37 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -609,6 +609,11 @@ def progress_callback(current, total, file_path, error=None, info=None):
609609
console.print(f"ℹ️ {info}", style="cyan")
610610
return
611611

612+
# Handle info-only updates (for status messages during processing)
613+
if file_path == Path("") and info and progress_bar:
614+
progress_bar.update(task_id, description=f"ℹ️ {info}")
615+
return
616+
612617
# Initialize progress bar on first call
613618
if progress_bar is None:
614619
progress_bar = Progress(
@@ -643,11 +648,18 @@ def progress_callback(current, total, file_path, error=None, info=None):
643648
except ValueError:
644649
relative_path = file_path.name
645650

646-
# Truncate long paths to fit display
647-
if len(relative_path) > 47:
648-
relative_path = "..." + relative_path[-44:]
651+
# Truncate long paths to fit display (leave room for throughput info)
652+
max_path_length = 35 if info else 47
653+
if len(relative_path) > max_path_length:
654+
relative_path = "..." + relative_path[-(max_path_length - 3) :]
649655

650-
progress_bar.update(task, advance=1, description=relative_path)
656+
# Create description with throughput info
657+
if info:
658+
description = f"{relative_path} | {info}"
659+
else:
660+
description = relative_path
661+
662+
progress_bar.update(task, advance=1, description=description)
651663

652664
# Show errors
653665
if error:
@@ -675,12 +687,20 @@ def progress_callback(current, total, file_path, error=None, info=None):
675687
console.print(f"❌ Indexing failed: {e}", style="red")
676688
sys.exit(1)
677689

678-
# Show completion summary
690+
# Show completion summary with throughput
679691
console.print("✅ Indexing complete!", style="green")
680692
console.print(f"📄 Files processed: {stats.files_processed}")
681693
console.print(f"📦 Chunks indexed: {stats.chunks_created}")
682694
console.print(f"⏱️ Duration: {stats.duration:.2f}s")
683695

696+
# Calculate final throughput
697+
if stats.duration > 0:
698+
files_per_min = (stats.files_processed / stats.duration) * 60
699+
chunks_per_min = (stats.chunks_created / stats.duration) * 60
700+
console.print(
701+
f"🚀 Throughput: {files_per_min:.1f} files/min, {chunks_per_min:.1f} chunks/min"
702+
)
703+
684704
if stats.failed_files > 0:
685705
console.print(f"⚠️ Failed files: {stats.failed_files}", style="yellow")
686706

@@ -1367,7 +1387,12 @@ def status(ctx, force_docker: bool):
13671387
qdrant_details = ""
13681388
if qdrant_ok:
13691389
try:
1370-
count = qdrant_client.count_points()
1390+
# Get the correct collection name using the current embedding provider
1391+
embedding_provider = EmbeddingProviderFactory.create(config, console)
1392+
collection_name = qdrant_client.resolve_collection_name(
1393+
config, embedding_provider
1394+
)
1395+
count = qdrant_client.count_points(collection_name)
13711396
qdrant_details = f"Documents: {count}"
13721397
except Exception:
13731398
qdrant_details = "Collection ready"
@@ -1432,7 +1457,12 @@ def status(ctx, force_docker: bool):
14321457
# Storage information
14331458
if qdrant_ok:
14341459
try:
1435-
size_info = qdrant_client.get_collection_size()
1460+
# Use the correct collection name for storage info too
1461+
embedding_provider = EmbeddingProviderFactory.create(config, console)
1462+
collection_name = qdrant_client.resolve_collection_name(
1463+
config, embedding_provider
1464+
)
1465+
size_info = qdrant_client.get_collection_size(collection_name)
14361466
if "error" not in size_info:
14371467
storage_details = f"Size: ~{size_info['estimated_vector_size_mb']}MB | Points: {size_info['points_count']:,}"
14381468
table.add_row("Storage", "📊", storage_details)

src/code_indexer/services/smart_indexer.py

Lines changed: 104 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
import time
66
from pathlib import Path
77
from typing import List, Dict, Any, Optional, Callable
8+
from dataclasses import dataclass
89

910
from ..config import Config
1011
from ..services import QdrantClient
@@ -14,6 +15,18 @@
1415
from .progressive_metadata import ProgressiveMetadata
1516

1617

18+
@dataclass
19+
class ThroughputStats:
20+
"""Statistics for tracking indexing throughput and throttling."""
21+
22+
files_per_minute: float = 0.0
23+
chunks_per_minute: float = 0.0
24+
embedding_requests_per_minute: float = 0.0
25+
is_throttling: bool = False
26+
throttle_reason: str = ""
27+
average_processing_time_per_file: float = 0.0
28+
29+
1730
class SmartIndexer(GitAwareDocumentProcessor):
1831
"""Smart indexer with progressive metadata and resumability."""
1932

@@ -197,13 +210,20 @@ def _do_incremental_index(
197210
def _process_files_with_metadata(
198211
self, files: List[Path], batch_size: int, progress_callback: Optional[Callable]
199212
) -> ProcessingStats:
200-
"""Process files with progressive metadata updates."""
213+
"""Process files with progressive metadata updates and throughput monitoring."""
201214

202215
stats = ProcessingStats()
203216
stats.start_time = time.time()
204217

205218
batch_points = []
206219

220+
# Throughput tracking
221+
throughput_window_start = time.time()
222+
throughput_window_files = 0
223+
throughput_window_chunks = 0
224+
throughput_window_size = 60.0 # 1 minute window
225+
last_throttle_check = time.time()
226+
207227
def update_metadata(chunks_count=0, failed=False):
208228
"""Update metadata after each file."""
209229
self.progressive_metadata.update_progress(
@@ -212,6 +232,55 @@ def update_metadata(chunks_count=0, failed=False):
212232
failed_files=1 if failed else 0,
213233
)
214234

235+
def calculate_throughput() -> ThroughputStats:
236+
"""Calculate current throughput and detect throttling."""
237+
current_time = time.time()
238+
elapsed = current_time - throughput_window_start
239+
240+
if elapsed <= 0:
241+
return ThroughputStats()
242+
243+
# Calculate rates per minute
244+
files_per_min = (throughput_window_files / elapsed) * 60
245+
chunks_per_min = (throughput_window_chunks / elapsed) * 60
246+
avg_time_per_file = elapsed / max(throughput_window_files, 1)
247+
248+
# Detect throttling by checking embedding provider
249+
is_throttling = False
250+
throttle_reason = ""
251+
252+
# Check if we're using VoyageAI and detect rate limiting
253+
provider_name = self.embedding_provider.get_provider_name()
254+
if provider_name == "voyage-ai":
255+
# Check if rate limiter indicates throttling
256+
if hasattr(self.embedding_provider, "rate_limiter"):
257+
rate_limiter = self.embedding_provider.rate_limiter
258+
wait_time = rate_limiter.wait_time(100) # Estimate for 100 tokens
259+
if wait_time > 0.5: # If we need to wait more than 0.5 seconds
260+
is_throttling = True
261+
throttle_reason = f"API rate limiting (wait: {wait_time:.1f}s)"
262+
elif rate_limiter.request_tokens < 10: # Low on request tokens
263+
is_throttling = True
264+
throttle_reason = "API request quota running low"
265+
266+
# Detect slow processing (could indicate network issues or service slowdown)
267+
if (
268+
avg_time_per_file > 5.0 and not is_throttling
269+
): # More than 5 seconds per file
270+
is_throttling = True
271+
throttle_reason = (
272+
f"Slow processing detected ({avg_time_per_file:.1f}s/file)"
273+
)
274+
275+
return ThroughputStats(
276+
files_per_minute=files_per_min,
277+
chunks_per_minute=chunks_per_min,
278+
embedding_requests_per_minute=chunks_per_min, # Assuming 1 request per chunk
279+
is_throttling=is_throttling,
280+
throttle_reason=throttle_reason,
281+
average_processing_time_per_file=avg_time_per_file,
282+
)
283+
215284
for i, file_path in enumerate(files):
216285
points = []
217286

@@ -222,9 +291,11 @@ def update_metadata(chunks_count=0, failed=False):
222291
if points:
223292
batch_points.extend(points)
224293
stats.chunks_created += len(points)
294+
throughput_window_chunks += len(points)
225295

226296
stats.files_processed += 1
227297
stats.total_size += file_path.stat().st_size
298+
throughput_window_files += 1
228299

229300
# Process batch if full
230301
if len(batch_points) >= batch_size:
@@ -235,9 +306,39 @@ def update_metadata(chunks_count=0, failed=False):
235306
# Update metadata after successful processing
236307
update_metadata(chunks_count=len(points), failed=False)
237308

238-
# Call progress callback
309+
# Calculate throughput every 30 seconds or every 50 files
310+
current_time = time.time()
311+
if (current_time - last_throttle_check > 30) or (i % 50 == 0 and i > 0):
312+
throughput_stats = calculate_throughput()
313+
last_throttle_check = current_time
314+
315+
# Reset throughput window if it's been more than window size
316+
if current_time - throughput_window_start > throughput_window_size:
317+
throughput_window_start = current_time
318+
throughput_window_files = 0
319+
throughput_window_chunks = 0
320+
321+
# Call progress callback with throughput info
239322
if progress_callback:
240-
progress_callback(i + 1, len(files), file_path)
323+
throughput_stats = calculate_throughput()
324+
325+
# Create enhanced info string
326+
info_parts = []
327+
if throughput_stats.files_per_minute > 0:
328+
info_parts.append(
329+
f"{throughput_stats.files_per_minute:.1f} files/min"
330+
)
331+
if throughput_stats.chunks_per_minute > 0:
332+
info_parts.append(
333+
f"{throughput_stats.chunks_per_minute:.1f} chunks/min"
334+
)
335+
if throughput_stats.is_throttling:
336+
info_parts.append(f"🐌 {throughput_stats.throttle_reason}")
337+
elif throughput_stats.files_per_minute > 60: # Fast processing
338+
info_parts.append("🚀 Full speed")
339+
340+
info = " | ".join(info_parts) if info_parts else None
341+
progress_callback(i + 1, len(files), file_path, info=info)
241342

242343
except Exception as e:
243344
stats.failed_files += 1

0 commit comments

Comments
 (0)