From 54b9be517a324cf76b51ff84efc0a9406b9f60f4 Mon Sep 17 00:00:00 2001 From: Anush008 Date: Mon, 26 Jan 2026 15:05:22 +0530 Subject: [PATCH] feat: Qdrant Vector Search Signed-off-by: Anush008 --- package.json | 1 + pnpm-lock.yaml | 27 ++ src/swe/vector/core/autoDetect.ts | 49 ++- src/swe/vector/core/config.ts | 30 ++ src/swe/vector/qdrant/index.ts | 3 + src/swe/vector/qdrant/qdrantAdapter.ts | 166 ++++++++ src/swe/vector/qdrant/qdrantConfig.ts | 45 ++ src/swe/vector/qdrant/qdrantOrchestrator.ts | 428 ++++++++++++++++++++ 8 files changed, 743 insertions(+), 6 deletions(-) create mode 100644 src/swe/vector/qdrant/index.ts create mode 100644 src/swe/vector/qdrant/qdrantAdapter.ts create mode 100644 src/swe/vector/qdrant/qdrantConfig.ts create mode 100644 src/swe/vector/qdrant/qdrantOrchestrator.ts diff --git a/package.json b/package.json index ab3a8b14..1aca16a3 100644 --- a/package.json +++ b/package.json @@ -149,6 +149,7 @@ "@opentelemetry/sdk-trace-base": "^1.30.1", "@opentelemetry/semantic-conventions": "^1.29.0", "@perplexity-ai/perplexity_ai": "^0.10.0", + "@qdrant/js-client-rest": "^1.16.2", "@sinclair/typebox": "^0.34.41", "@slack/bolt": "^4.4.0", "@slack/web-api": "^7.9.3", diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 53729b6f..78f7dc7d 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -170,6 +170,9 @@ importers: '@perplexity-ai/perplexity_ai': specifier: ^0.10.0 version: 0.10.0 + '@qdrant/js-client-rest': + specifier: ^1.16.2 + version: 1.16.2(typescript@5.9.3) '@sinclair/typebox': specifier: ^0.34.41 version: 0.34.41 @@ -2497,6 +2500,16 @@ packages: engines: {node: '>=18'} hasBin: true + '@qdrant/js-client-rest@1.16.2': + resolution: {integrity: sha512-Zm4wEZURrZ24a+Hmm4l1QQYjiz975Ep3vF0yzWR7ICGcxittNz47YK2iBOk8kb8qseCu8pg7WmO1HOIsO8alvw==} + engines: {node: '>=18.17.0', pnpm: '>=8'} + peerDependencies: + typescript: '>=4.7' + + '@qdrant/openapi-typescript-fetch@1.2.6': + resolution: {integrity: sha512-oQG/FejNpItrxRHoyctYvT3rwGZOnK4jr3JdppO/c78ktDvkWiPXPHNsrDf33K9sZdRb6PR7gi4noIapu5q4HA==} + engines: {node: '>=18.0.0', pnpm: '>=8'} + '@readme/better-ajv-errors@2.3.2': resolution: {integrity: sha512-T4GGnRAlY3C339NhoUpgJJFsMYko9vIgFAlhgV+/vEGFw66qEY4a4TRJIAZBcX/qT1pq5DvXSme+SQODHOoBrw==} engines: {node: '>=18'} @@ -8247,6 +8260,10 @@ packages: undici-types@6.21.0: resolution: {integrity: sha512-iwDZqg0QAGrg9Rav5H4n0M64c3mkR59cJ6wQp+7C4nI0gsmExaedaYLNO44eT4AtBBwjbTiGPMlt2Md0T9H9JQ==} + undici@6.23.0: + resolution: {integrity: sha512-VfQPToRA5FZs/qJxLIinmU59u0r7LXqoJkCzinq3ckNJp3vKEh7jTWN589YQ5+aoAC/TGRLyJLCPKcLQbM8r9g==} + engines: {node: '>=18.17'} + unicode-emoji-modifier-base@1.0.0: resolution: {integrity: sha512-yLSH4py7oFH3oG/9K+XWrz1pSi3dfUrWEnInbxMfArOfc1+33BlGPQtLsOYwvdMy11AwUBetYuaRxSPqgkq+8g==} engines: {node: '>=4'} @@ -11051,6 +11068,14 @@ snapshots: - bare-buffer - supports-color + '@qdrant/js-client-rest@1.16.2(typescript@5.9.3)': + dependencies: + '@qdrant/openapi-typescript-fetch': 1.2.6 + typescript: 5.9.3 + undici: 6.23.0 + + '@qdrant/openapi-typescript-fetch@1.2.6': {} + '@readme/better-ajv-errors@2.3.2(ajv@8.17.1)': dependencies: '@babel/code-frame': 7.27.1 @@ -17847,6 +17872,8 @@ snapshots: undici-types@6.21.0: {} + undici@6.23.0: {} + unicode-emoji-modifier-base@1.0.0: {} unicorn-magic@0.1.0: {} diff --git a/src/swe/vector/core/autoDetect.ts b/src/swe/vector/core/autoDetect.ts index 2cc64914..3e053636 100644 --- a/src/swe/vector/core/autoDetect.ts +++ b/src/swe/vector/core/autoDetect.ts @@ -1,6 +1,6 @@ -import type { AlloyDBNestedConfig, ChromaNestedConfig, DiscoveryEngineConfig, GoogleCloudConfig, VectorStoreConfig } from './config'; +import type { AlloyDBNestedConfig, ChromaNestedConfig, DiscoveryEngineConfig, GoogleCloudConfig, QdrantNestedConfig, VectorStoreConfig } from './config'; -export type VectorBackend = 'alloydb' | 'discovery-engine' | 'chroma'; +export type VectorBackend = 'alloydb' | 'discovery-engine' | 'chroma' | 'qdrant'; export interface BackendDetection { backend: VectorBackend | null; @@ -18,7 +18,28 @@ export interface BackendDetection { * 4. null - No backend detected */ export function detectBackend(): BackendDetection { - // 1. Check for ChromaDB (local-first) + // 1. Check for Qdrant + const qdrantUrl = process.env.QDRANT_URL; + if (qdrantUrl) { + const qdrant: QdrantNestedConfig = { + url: qdrantUrl, + apiKey: process.env.QDRANT_API_KEY, + }; + + return { + backend: 'qdrant', + reason: 'Qdrant detected via QDRANT_URL', + config: { + qdrant, + embedding: { + provider: 'ollama', + model: process.env.OLLAMA_EMBEDDING_MODEL || 'manutic/nomic-embed-code', + }, + }, + }; + } + + // 2. Check for ChromaDB (local-first) const chromaUrl = process.env.CHROMA_URL; if (chromaUrl) { const chroma: ChromaNestedConfig = { @@ -41,7 +62,7 @@ export function detectBackend(): BackendDetection { }; } - // 2. Check for AlloyDB/Postgres + // 3. Check for AlloyDB/Postgres const pgHost = process.env.ALLOYDB_HOST || process.env.PGHOST; if (pgHost) { const alloydb: AlloyDBNestedConfig = { @@ -59,7 +80,7 @@ export function detectBackend(): BackendDetection { }; } - // 3. Check for Discovery Engine + // 4. Check for Discovery Engine const gcpProject = process.env.GCLOUD_PROJECT; if (gcpProject) { const googleCloud: GoogleCloudConfig = { @@ -80,7 +101,7 @@ export function detectBackend(): BackendDetection { }; } - // 4. No backend detected + // 5. No backend detected return { backend: null, reason: 'No backend detected', @@ -96,6 +117,7 @@ export function requireBackend(): BackendDetection { if (!detection.backend) { throw new Error( 'No vector backend detected. Set one of:\n' + + ' - QDRANT_URL (for Qdrant + Ollama, local development)\n' + ' - CHROMA_URL (for ChromaDB + Ollama, local development)\n' + ' - ALLOYDB_HOST or PGHOST (for AlloyDB/Postgres)\n' + ' - GCLOUD_PROJECT (for Discovery Engine)', @@ -109,6 +131,19 @@ export function requireBackend(): BackendDetection { */ export function buildBackendConfig(backend: VectorBackend): Partial { switch (backend) { + case 'qdrant': { + const qdrant: QdrantNestedConfig = { + url: process.env.QDRANT_URL || 'http://localhost:6333', + apiKey: process.env.QDRANT_API_KEY, + }; + return { + qdrant, + embedding: { + provider: 'ollama', + model: process.env.OLLAMA_EMBEDDING_MODEL || 'manutic/nomic-embed-code', + }, + }; + } case 'chroma': { const chroma: ChromaNestedConfig = { url: process.env.CHROMA_URL || 'http://localhost:8000', @@ -146,5 +181,7 @@ export function buildBackendConfig(backend: VectorBackend): Partial { + this.config = config; + logger.info({ collectionName: this.collectionName }, 'Initializing Qdrant adapter'); + + try { + const { collections } = await this.client.getCollections(); + if (!collections.some((c) => c.name === this.collectionName)) { + await this.client.createCollection(this.collectionName, { + vectors: { + size: this.qdrantConfig.embeddingDimension, + distance: this.qdrantConfig.distanceFunction || 'Cosine', + }, + }); + + for (const field of ['config_name', 'filename', 'language']) { + await this.client.createPayloadIndex(this.collectionName, { field_name: field, field_schema: 'keyword' }); + } + logger.info({ collectionName: this.collectionName }, 'Created collection'); + } + + this.initialized = true; + } catch (error) { + logger.error({ error, collectionName: this.collectionName }, 'Failed to initialize'); + throw error; + } + } + + private generatePointId(chunk: EmbeddedChunk): string { + const key = `${chunk.filePath}:${chunk.chunk.sourceLocation.startLine}:${chunk.chunk.sourceLocation.endLine}`; + return uuidv5(key, uuidv5.DNS); + } + + private configFilter(configName: string) { + return { must: [{ key: 'config_name', match: { value: configName } }] } as any; + } + + async indexChunks(chunks: EmbeddedChunk[]): Promise { + if (!chunks.length) return; + if (!this.initialized) throw new Error('Not initialized'); + + const configName = this.config.name || 'default'; + logger.info({ chunkCount: chunks.length }, 'Indexing'); + + for (let i = 0; i < chunks.length; i += 100) { + const points = chunks.slice(i, i + 100).map((chunk) => ({ + id: this.generatePointId(chunk), + vector: chunk.embedding, + payload: { + config_name: configName, + filename: chunk.filePath, + line_from: chunk.chunk.sourceLocation.startLine, + line_to: chunk.chunk.sourceLocation.endLine, + original_text: chunk.chunk.content, + contextualized_text: 'contextualizedContent' in chunk.chunk ? chunk.chunk.contextualizedContent : chunk.chunk.content, + language: chunk.language, + chunk_type: chunk.chunk.chunkType, + function_name: chunk.chunk.metadata?.functionName || '', + class_name: chunk.chunk.metadata?.className || '', + natural_language_description: chunk.naturalLanguageDescription || '', + }, + })); + + await this.client.upsert(this.collectionName, { wait: true, points }); + } + } + + async deleteByFilePath(filePath: string): Promise { + if (!this.initialized) throw new Error('Not initialized'); + + const configName = this.config.name || 'default'; + const filter = { + must: [ + { key: 'filename', match: { value: filePath } }, + { key: 'config_name', match: { value: configName } }, + ], + } as any; + + const { count } = await this.client.count(this.collectionName, { filter, exact: true }); + if (count > 0) { + await this.client.delete(this.collectionName, { wait: true, filter }); + logger.info({ filePath, deletedCount: count }, 'Deleted'); + } + return count; + } + + async search(query: string, queryEmbedding: number[], maxResults: number, config: VectorStoreConfig): Promise { + if (!this.initialized) throw new Error('Not initialized'); + + const results = await this.client.query(this.collectionName, { + query: queryEmbedding, + limit: maxResults, + filter: this.configFilter(config.name || 'default'), + with_payload: true, + with_vector: false, + }); + + return results.points.map((r) => ({ + id: String(r.id), + score: r.score, + document: { + filePath: String(r.payload?.filename || ''), + functionName: r.payload?.function_name ? String(r.payload.function_name) : undefined, + className: r.payload?.class_name ? String(r.payload.class_name) : undefined, + startLine: Number(r.payload?.line_from) || 0, + endLine: Number(r.payload?.line_to) || 0, + language: String(r.payload?.language || 'unknown'), + originalCode: String(r.payload?.original_text || ''), + naturalLanguageDescription: r.payload?.natural_language_description ? String(r.payload.natural_language_description) : undefined, + }, + metadata: { chunkType: r.payload?.chunk_type }, + })); + } + + async purge(): Promise { + if (!this.initialized) throw new Error('Not initialized'); + await this.client.delete(this.collectionName, { wait: true, filter: this.configFilter(this.config.name || 'default') }); + } + + async getStats(): Promise<{ totalDocuments: number; totalChunks: number; storageSize?: number }> { + if (!this.initialized) throw new Error('Not initialized'); + + const [info, { count }] = await Promise.all([ + this.client.getCollection(this.collectionName), + this.client.count(this.collectionName, { filter: this.configFilter(this.config.name || 'default'), exact: true }), + ]); + + return { totalDocuments: count, totalChunks: count, storageSize: info.points_count ?? undefined }; + } + + async isAvailable(): Promise { + try { + await this.client.getCollections(); + return true; + } catch { + return false; + } + } + + async deleteCollection(): Promise { + await this.client.deleteCollection(this.collectionName); + this.initialized = false; + logger.info({ collectionName: this.collectionName }, 'Collection deleted'); + } +} diff --git a/src/swe/vector/qdrant/qdrantConfig.ts b/src/swe/vector/qdrant/qdrantConfig.ts new file mode 100644 index 00000000..dc0d6751 --- /dev/null +++ b/src/swe/vector/qdrant/qdrantConfig.ts @@ -0,0 +1,45 @@ +import type { VectorStoreConfig } from '../core/config'; + +export interface QdrantConfig { + url: string; + apiKey?: string; + collectionPrefix?: string; + embeddingDimension: number; + distanceFunction?: 'Cosine' | 'Euclid' | 'Dot' | 'Manhattan'; +} + +const DEFAULT_URL = 'http://localhost:6333'; +const DEFAULT_PREFIX = 'code_chunks'; + +export function buildQdrantConfig(config: VectorStoreConfig, embeddingDimension: number): QdrantConfig { + return { + url: config.qdrant?.url || process.env.QDRANT_URL || DEFAULT_URL, + apiKey: config.qdrant?.apiKey || process.env.QDRANT_API_KEY, + collectionPrefix: config.qdrant?.collectionPrefix || DEFAULT_PREFIX, + embeddingDimension, + distanceFunction: config.qdrant?.distanceFunction || 'Cosine', + }; +} + +export function sanitizeRepoName(repoIdentifier: string): string { + let name = repoIdentifier + .replace(/^https?:\/\//, '') + .replace(/^git@/, '') + .replace(/\.git$/, '') + .replace(/github\.com[:/]/, '') + .replace(/gitlab\.com[:/]/, '') + .replace(/bitbucket\.org[:/]/, '') + .replace(/[^a-zA-Z0-9_-]/g, '_') + .replace(/[^a-zA-Z0-9]+$/, '') + .toLowerCase(); + + if (/^[^a-zA-Z]/.test(name)) name = `repo_${name}`; + if (name.length > 45) name = name.substring(0, 45); + if (name.length < 3) name = `${name}_repo`; + + return name; +} + +export function getCollectionNameForRepo(repoIdentifier: string, prefix?: string): string { + return `${prefix || DEFAULT_PREFIX}_${sanitizeRepoName(repoIdentifier)}`; +} diff --git a/src/swe/vector/qdrant/qdrantOrchestrator.ts b/src/swe/vector/qdrant/qdrantOrchestrator.ts new file mode 100644 index 00000000..640a24be --- /dev/null +++ b/src/swe/vector/qdrant/qdrantOrchestrator.ts @@ -0,0 +1,428 @@ +import * as fs from 'node:fs/promises'; +import * as path from 'node:path'; +import pLimit from 'p-limit'; +import { logger } from '#o11y/logger'; +import { span } from '#o11y/trace'; +import { readFilesToIndex } from '../codeLoader'; +import { LLMCodeTranslator } from '../core/codeTranslator'; +import type { RerankingConfig, VectorStoreConfig } from '../core/config'; +import { addOrUpdateVectorConfig, loadVectorConfig, printConfigSummary } from '../core/config'; +import { LLMContextualizer } from '../core/contextualizer'; +import type { IChunker, IEmbedder, IReranker } from '../core/interfaces'; +import type { + ContextualizedChunk, + EmbeddedChunk, + FileInfo, + IVectorSearchOrchestrator, + ProgressCallback, + RawChunk, + SearchResult, + VectorSearchOptions, +} from '../core/interfaces'; +import { OLLAMA_EMBEDDING_MODELS, OllamaEmbedderAdapter } from '../ollama/ollamaEmbedder'; +import { createReranker } from '../reranking'; +import { MerkleSynchronizer } from '../sync/merkleSynchronizer'; +import { QdrantAdapter } from './qdrantAdapter'; +import type { QdrantConfig } from './qdrantConfig'; +import { buildQdrantConfig } from './qdrantConfig'; + +const FILE_PROCESSING_PARALLEL_BATCH_SIZE = 5; + +interface IndexingStats { + fileCount: number; + filesProcessed: number; + failedFiles: string[]; + totalChunks: number; + failedChunks: number; +} + +export class QdrantOrchestrator implements IVectorSearchOrchestrator { + private config: VectorStoreConfig; + private qdrantConfig: QdrantConfig; + private repoIdentifier: string; + private _chunker: IChunker | null = null; + private contextualizer: LLMContextualizer; + private translator: LLMCodeTranslator; + private embedder: IEmbedder; + private vectorStore: QdrantAdapter; + private synchronizer: MerkleSynchronizer; + private _reranker: IReranker | null = null; + private _rerankerConfig: RerankingConfig | null = null; + + constructor(repoIdentifier: string, config?: VectorStoreConfig) { + this.repoIdentifier = repoIdentifier; + this.config = config || { + chunking: { dualEmbedding: false, contextualChunking: false }, + embedding: { provider: 'ollama', model: OLLAMA_EMBEDDING_MODELS.NOMIC_EMBED_CODE.model }, + }; + + this.embedder = this.createEmbedder(); + this.qdrantConfig = buildQdrantConfig(this.config, this.embedder.getDimension()); + this.contextualizer = new LLMContextualizer(); + this.translator = new LLMCodeTranslator(); + this.vectorStore = new QdrantAdapter(repoIdentifier, this.qdrantConfig); + this.synchronizer = new MerkleSynchronizer(this.config.includePatterns); + } + + private async getChunker(): Promise { + if (!this._chunker) { + const { ASTChunker } = await import('../chunking/astChunker.js'); + this._chunker = new ASTChunker(); + } + return this._chunker!; + } + + private createEmbedder(): IEmbedder { + const provider = this.config.embedding?.provider || 'ollama'; + const model = this.config.embedding?.model; + + if (provider === 'ollama') { + let modelName: string = OLLAMA_EMBEDDING_MODELS.NOMIC_EMBED_CODE.model; + let dimension: number = OLLAMA_EMBEDDING_MODELS.NOMIC_EMBED_CODE.dimension; + + if (model) { + const foundConfig = Object.values(OLLAMA_EMBEDDING_MODELS).find((m) => m.model === model); + if (foundConfig) { + modelName = foundConfig.model; + dimension = foundConfig.dimension; + } else { + modelName = model; + dimension = 768; + } + } + + return new OllamaEmbedderAdapter({ + apiUrl: this.config.ollama?.apiUrl, + model: modelName, + dimension: dimension, + }); + } + + return new OllamaEmbedderAdapter({ + apiUrl: this.config.ollama?.apiUrl, + model: OLLAMA_EMBEDDING_MODELS.NOMIC_EMBED_CODE.model, + dimension: OLLAMA_EMBEDDING_MODELS.NOMIC_EMBED_CODE.dimension, + }); + } + + private getReranker(): IReranker | null { + const config = this.config.search?.reranking; + if (!config) return null; + + if (!this._reranker || !this.configsEqual(this._rerankerConfig, config)) { + this._reranker = createReranker(config, undefined, this.config.ollama); + this._rerankerConfig = config; + } + return this._reranker; + } + + private configsEqual(a: RerankingConfig | null, b: RerankingConfig | null): boolean { + if (a === b) return true; + if (!a || !b) return false; + return a.provider === b.provider && a.model === b.model && a.topK === b.topK; + } + + @span() + async indexRepository( + repoRoot: string, + options?: { + subFolder?: string; + incremental?: boolean; + config?: VectorStoreConfig; + onProgress?: ProgressCallback; + }, + ): Promise { + const startTime = Date.now(); + + if (!options?.config) { + try { + this.config = loadVectorConfig(repoRoot); + } catch { + logger.info('No .typedai.json found, using default config'); + } + } else { + this.config = { ...this.config, ...options.config }; + } + + printConfigSummary(this.config); + + this.embedder = this.createEmbedder(); + this.qdrantConfig = buildQdrantConfig(this.config, this.embedder.getDimension()); + this.vectorStore = new QdrantAdapter(this.repoIdentifier, this.qdrantConfig); + + if (this.embedder instanceof OllamaEmbedderAdapter) { + const available = await this.embedder.isAvailable(); + if (!available) { + throw new Error( + `Ollama is not available or model "${this.embedder.getModel()}" is not loaded.\nStart Ollama with: ollama serve\nPull the model with: ollama pull ${this.embedder.getModel()}`, + ); + } + } + + const qdrantAvailable = await this.vectorStore.isAvailable(); + if (!qdrantAvailable) { + throw new Error( + `Qdrant is not available at ${this.qdrantConfig.url}.\nStart Qdrant with: docker run -p 6333:6333 qdrant/qdrant\nOr download from: https://qdrant.tech/documentation/quick-start/`, + ); + } + + await this.vectorStore.initialize(this.config); + + logger.info({ repoRoot, incremental: options?.incremental }, 'Starting repository indexing'); + + let filesToIndex: string[]; + + if (options?.incremental) { + logger.info('Performing incremental update using Merkle sync'); + const changes = await this.synchronizer.detectChanges(repoRoot); + + filesToIndex = [...changes.added, ...changes.modified]; + + for (const deletedFile of changes.deleted) { + await this.vectorStore.deleteByFilePath(deletedFile); + } + + logger.info( + { + added: changes.added.length, + modified: changes.modified.length, + deleted: changes.deleted.length, + }, + 'Incremental changes detected', + ); + + if (filesToIndex.length === 0) { + return; + } + } else { + logger.info('Performing full repository indexing'); + const codeFiles = await readFilesToIndex(repoRoot, options?.subFolder || './', this.config.includePatterns); + filesToIndex = codeFiles.map((f) => f.filePath); + logger.info({ fileCount: codeFiles.length }, 'Loaded code files'); + } + + if (filesToIndex.length === 0) return; + + await this.indexFiles(repoRoot, filesToIndex, options?.onProgress); + await this.synchronizer.saveSnapshot(repoRoot, filesToIndex); + + addOrUpdateVectorConfig(repoRoot, { ...this.config, indexed: true }); + + const duration = Date.now() - startTime; + logger.info({ duration, fileCount: filesToIndex.length }, 'Repository indexing completed, indexed=true set'); + } + + async search(query: string, options?: VectorSearchOptions): Promise { + const maxResults = options?.maxResults || 10; + const rerankConfig = this.config.search?.reranking; + const useReranking = options?.reranking ?? !!rerankConfig; + const rerankingTopK = rerankConfig?.topK ?? 50; + const useHybridSearch = options?.hybridSearch ?? this.config.search?.hybridSearch ?? true; + + logger.info({ query, maxResults, reranking: useReranking, rerankingProvider: rerankConfig?.provider, hybridSearch: useHybridSearch }, 'Performing search'); + + const queryEmbedding = await this.embedder.embed(query, 'RETRIEVAL_QUERY'); + const searchLimit = useReranking ? Math.max(maxResults * 2, rerankingTopK) : maxResults; + const searchConfig = { ...this.config, search: { ...this.config.search, hybridSearch: useHybridSearch } }; + const results = await this.vectorStore.search(query, queryEmbedding, searchLimit, searchConfig); + + let filteredResults = results; + + if (options?.fileFilter && options.fileFilter.length > 0) { + filteredResults = filteredResults.filter((r) => options.fileFilter!.some((filter) => r.document.filePath.includes(filter))); + } + + if (options?.languageFilter && options.languageFilter.length > 0) { + filteredResults = filteredResults.filter((r) => options.languageFilter!.includes(r.document.language)); + } + + let finalResults = filteredResults; + + if (useReranking && filteredResults.length > 0) { + const reranker = this.getReranker(); + if (reranker) { + logger.info({ inputCount: filteredResults.length, maxResults, rerankingTopK }, 'Applying reranking'); + finalResults = await reranker.rerank(query, filteredResults, maxResults); + } else { + finalResults = filteredResults.slice(0, maxResults); + } + } else { + finalResults = filteredResults.slice(0, maxResults); + } + + logger.info({ resultCount: finalResults.length, reranked: useReranking }, 'Search completed'); + + return finalResults; + } + + getConfig(): VectorStoreConfig { + return this.config; + } + + updateConfig(config: Partial): void { + this.config = { ...this.config, ...config }; + logger.info({ config: this.config }, 'Configuration updated'); + } + + private async indexFiles(repoRoot: string, filePaths: string[], onProgress?: ProgressCallback): Promise { + const stats: IndexingStats = { + fileCount: filePaths.length, + filesProcessed: 0, + failedFiles: [], + totalChunks: 0, + failedChunks: 0, + }; + + const limit = pLimit(FILE_PROCESSING_PARALLEL_BATCH_SIZE); + logger.info({ fileCount: filePaths.length, concurrency: FILE_PROCESSING_PARALLEL_BATCH_SIZE }, 'Starting parallel file indexing'); + + const processingPromises = filePaths.map((filePath) => + limit(async () => { + try { + onProgress?.({ + phase: 'loading', + currentFile: filePath, + filesProcessed: stats.filesProcessed, + totalFiles: stats.fileCount, + }); + + const fileInfo = await this.loadFile(repoRoot, filePath); + const chunks = await this.processFile(fileInfo, stats, onProgress); + + if (chunks.length > 0) { + onProgress?.({ + phase: 'indexing', + currentFile: filePath, + filesProcessed: stats.filesProcessed, + totalFiles: stats.fileCount, + chunksProcessed: chunks.length, + }); + + await this.vectorStore.indexChunks(chunks); + stats.totalChunks += chunks.length; + } + + stats.filesProcessed++; + logger.debug({ filePath, chunkCount: chunks.length }, 'File indexed successfully'); + } catch (error) { + stats.failedFiles.push(filePath); + logger.error({ error, filePath }, 'Failed to process file'); + } + }), + ); + + await Promise.all(processingPromises); + + logger.info( + { + filesProcessed: stats.filesProcessed, + failedFiles: stats.failedFiles.length, + totalChunks: stats.totalChunks, + failedChunks: stats.failedChunks, + }, + 'File indexing completed', + ); + } + + private async processFile(fileInfo: FileInfo, stats: IndexingStats, onProgress?: ProgressCallback): Promise { + try { + let chunks: Array; + + if (this.config.chunking?.contextualChunking) { + onProgress?.({ + phase: 'contextualizing', + currentFile: fileInfo.filePath, + filesProcessed: stats.filesProcessed, + totalFiles: stats.fileCount, + }); + + chunks = await this.contextualizer.contextualize([], fileInfo, this.config); + } else { + onProgress?.({ + phase: 'chunking', + currentFile: fileInfo.filePath, + filesProcessed: stats.filesProcessed, + totalFiles: stats.fileCount, + }); + + const chunker = await this.getChunker(); + chunks = await chunker.chunk(fileInfo, this.config); + } + + if (chunks.length === 0) { + logger.debug({ filePath: fileInfo.filePath }, 'No chunks generated'); + return []; + } + + onProgress?.({ + phase: 'embedding', + currentFile: fileInfo.filePath, + filesProcessed: stats.filesProcessed, + totalFiles: stats.fileCount, + chunksProcessed: 0, + totalChunks: chunks.length, + }); + + const contextualizedTexts = chunks.map((chunk) => ('contextualizedContent' in chunk ? chunk.contextualizedContent : chunk.content)); + const primaryEmbeddings = await this.embedder.embedBatch(contextualizedTexts, 'RETRIEVAL_DOCUMENT'); + + logger.debug({ chunkCount: chunks.length }, 'Generated embeddings'); + + return chunks.map((chunk, index) => ({ + filePath: fileInfo.filePath, + language: fileInfo.language, + chunk, + embedding: primaryEmbeddings[index], + })); + } catch (error) { + logger.error({ error, filePath: fileInfo.filePath }, 'Failed to process file'); + throw error; + } + } + + private async loadFile(repoRoot: string, filePath: string): Promise { + const fullPath = path.join(repoRoot, filePath); + const content = await fs.readFile(fullPath, 'utf-8'); + const stats = await fs.stat(fullPath); + + return { + filePath, + relativePath: filePath, + language: this.detectLanguage(path.extname(filePath)), + content, + size: stats.size, + lastModified: stats.mtime, + }; + } + + private detectLanguage(extension: string): string { + const languageMap: Record = { + '.ts': 'typescript', + '.tsx': 'typescript', + '.js': 'javascript', + '.jsx': 'javascript', + '.py': 'python', + '.java': 'java', + '.cpp': 'cpp', + '.c': 'c', + '.h': 'c', + '.go': 'go', + '.rs': 'rust', + '.rb': 'ruby', + '.php': 'php', + '.cs': 'csharp', + '.swift': 'swift', + '.kt': 'kotlin', + }; + + return languageMap[extension.toLowerCase()] || 'unknown'; + } + + async getStats(): Promise<{ totalDocuments: number; totalChunks: number; storageSize?: number }> { + return this.vectorStore.getStats(); + } + + async purge(): Promise { + await this.vectorStore.purge(); + } +}