@@ -507,9 +507,12 @@ def insert(
507507 ) -> None :
508508 """
509509 Insert a single vector.
510-
511- Uses native Rust FFI for persistence and HNSW indexing.
512-
510+
511+ Writes to BOTH the in-memory HNSW index (for fast vector_search /
512+ vector_search_exact) AND the KV store (for keyword_search / hybrid_search
513+ BM25 fallback). Previously only the HNSW path was populated, making
514+ documents inserted with insert() invisible to keyword/hybrid search.
515+
513516 Args:
514517 id: Unique document ID
515518 vector: Vector embedding
@@ -519,22 +522,35 @@ def insert(
519522 # Auto-dimension from first vector
520523 if self ._config .dimension is None :
521524 object .__setattr__ (self ._config , 'dimension' , len (vector ))
522-
525+
523526 # Validate dimension
524527 if len (vector ) != self ._config .dimension :
525528 raise DimensionMismatchError (self ._config .dimension , len (vector ))
526-
529+
527530 # Build metadata dict
528531 meta = metadata .copy () if metadata else {}
529532 if content :
530533 meta ["_content" ] = content
531-
532- # 1. Insert into in-memory HNSW (fast, used for same-session search)
534+
535+ # 1. Insert into in-memory HNSW (fast, used for same-session vector search)
533536 self ._ensure_index (self ._config .dimension )
534537 internal_id = self ._get_internal_id (id )
535538 self ._vector_index .insert (internal_id , vector )
536539 self ._metadata_store [id ] = meta
537540 self ._raw_vectors [id ] = vector # keep for snapshot
541+
542+ # 2. Persist to KV store so keyword_search / hybrid_search find this doc.
543+ # Uses the same JSON schema as insert_multi() so FFI BM25 and the
544+ # Python scan fallback can both read it.
545+ with self ._db .transaction () as txn :
546+ doc_data = {
547+ "id" : str (id ),
548+ "vector" : vector ,
549+ "metadata" : meta ,
550+ "content" : content or meta .get ("_content" , "" ),
551+ "is_multi_vector" : False ,
552+ }
553+ txn .put (self ._vector_key (id ), json .dumps (doc_data ).encode ())
538554
539555 def insert_batch (
540556 self ,
@@ -546,63 +562,76 @@ def insert_batch(
546562 ) -> int :
547563 """
548564 Insert multiple vectors in a batch.
549-
550- Uses native Rust FFI for durable persistence and HNSW indexing.
551-
565+
566+ Writes to BOTH in-memory HNSW (for fast vector search) AND KV store
567+ (for keyword/hybrid search BM25). Previously only the HNSW path was
568+ populated, leaving batch-inserted docs invisible to keyword search.
569+
552570 Supports two calling conventions:
553571 1. Tuple format: insert_batch([(id, vector, metadata, content), ...])
554572 2. Keyword format: insert_batch(ids=[...], vectors=[...], metadatas=[...])
555-
573+
556574 Args:
557575 documents: List of (id, vector, metadata, content) tuples
558576 ids: List of document IDs (keyword format)
559577 vectors: List of vector embeddings (keyword format)
560578 metadatas: List of metadata dicts (keyword format)
561-
579+
562580 Returns:
563581 Number of documents inserted
564582 """
565583 import numpy as np
566-
584+
567585 # Handle keyword argument format
568586 if ids is not None and vectors is not None :
569587 if metadatas is None :
570588 metadatas = [None ] * len (ids )
571589 documents = [(id , vec , meta , None ) for id , vec , meta in zip (ids , vectors , metadatas )]
572-
590+
573591 if not documents :
574592 return 0
575-
593+
576594 # Auto-dimension inference from first vector
577595 first_vec = documents [0 ][1 ]
578596 if self ._config .dimension is None :
579597 object .__setattr__ (self ._config , 'dimension' , len (first_vec ))
580-
598+
581599 # Validate dimensions
582600 for doc_id , vector , metadata , content in documents :
583601 if len (vector ) != self ._config .dimension :
584602 raise DimensionMismatchError (self ._config .dimension , len (vector ))
585-
586- # Build ID and metadata lists for FFI batch insert
587- batch_ids = [str (doc [0 ]) for doc in documents ]
588- batch_vectors = [doc [1 ] for doc in documents ]
603+
604+ # Build per-document metadata
589605 batch_metadatas = []
590606 for doc_id , vector , metadata , content in documents :
591607 meta = metadata .copy () if metadata else {}
592608 if content :
593609 meta ["_content" ] = content
594- batch_metadatas .append (meta if meta else None )
595-
596- # 1. Fast in-memory HNSW insert (used for same-session search)
610+ batch_metadatas .append (meta )
611+
612+ # 1. Fast in-memory HNSW insert (used for same-session vector search)
597613 self ._ensure_index (self ._config .dimension )
598614 internal_ids = np .array ([self ._get_internal_id (doc [0 ]) for doc in documents ], dtype = np .uint64 )
599615 vectors_array = np .array ([doc [1 ] for doc in documents ], dtype = np .float32 )
600616 count = self ._vector_index .insert_batch (internal_ids , vectors_array )
601617 for i , (doc_id , vector , metadata , content ) in enumerate (documents ):
602- meta = batch_metadatas [i ] if batch_metadatas [i ] else {}
603- self ._metadata_store [doc_id ] = meta
604- self ._raw_vectors [doc_id ] = batch_vectors [i ] # keep for snapshot
605-
618+ self ._metadata_store [doc_id ] = batch_metadatas [i ]
619+ self ._raw_vectors [doc_id ] = vector # keep for snapshot
620+
621+ # 2. Persist to KV store so keyword_search / hybrid_search find all docs.
622+ # Written in one transaction for atomicity.
623+ with self ._db .transaction () as txn :
624+ for i , (doc_id , vector , metadata , content ) in enumerate (documents ):
625+ meta = batch_metadatas [i ]
626+ doc_data = {
627+ "id" : str (doc_id ),
628+ "vector" : vector ,
629+ "metadata" : meta ,
630+ "content" : content or meta .get ("_content" , "" ),
631+ "is_multi_vector" : False ,
632+ }
633+ txn .put (self ._vector_key (doc_id ), json .dumps (doc_data ).encode ())
634+
606635 return count
607636
608637 def add (
@@ -1115,51 +1144,90 @@ def _keyword_search(self, request: SearchRequest) -> SearchResults:
11151144 vector_results = 0 ,
11161145 )
11171146
1118- # Fallback: Python scan
1147+ # Fallback: Python BM25 scan over KV store
1148+ # Uses proper BM25 formula (k1=1.2, b=0.75) instead of raw TF counting.
1149+ # Previously this used simple count(term in doc) with no IDF weighting,
1150+ # which caused popular terms to dominate and domain-specific terms to rank low.
11191151 all_docs = []
11201152 prefix = self ._vectors_prefix ()
11211153 with self ._db .transaction () as txn :
11221154 for key , value in txn .scan_prefix (prefix ):
11231155 doc = json .loads (value .decode ())
11241156 all_docs .append (doc )
1125-
1126- # Simple keyword matching on content and metadata
1127- query_lower = request .text_query .lower ()
1128- query_terms = query_lower .split ()
1129-
1130- scored_docs = []
1157+
1158+ if not all_docs :
1159+ query_time_ms = (time .time () - start_time ) * 1000
1160+ return SearchResults (results = [], total_count = 0 , query_time_ms = query_time_ms )
1161+
1162+ # Tokenise cleaned_query (computed above; presumably already stopword-filtered - confirm); fall back to the raw lowercased query if no terms remain
1163+ query_terms = [w for w in cleaned_query .split () if w ]
1164+ if not query_terms :
1165+ query_terms = request .text_query .lower ().split ()
1166+
1167+ # --- BM25 parameters (Lucene / Elasticsearch defaults) ---
1168+ K1 = 1.2 # term-frequency saturation
1169+ B = 0.75 # length normalisation
1170+
1171+ # Build the corpus: one lowercased text per document (content plus any string metadata values)
1172+ import math
1173+ corpus_texts = []
11311174 for doc in all_docs :
1132- # Search in content field
1133- content = doc .get ("content" , "" ) or ""
1175+ content = doc .get ("content" , "" ) or ""
11341176 metadata = doc .get ("metadata" , {})
1135-
1136- # Also search in metadata text fields
11371177 text_fields = [content ]
11381178 for v in metadata .values ():
11391179 if isinstance (v , str ):
11401180 text_fields .append (v )
1141-
1142- combined_text = " " .join (text_fields ).lower ()
1143-
1144- # Simple term frequency scoring
1181+ corpus_texts .append (" " .join (text_fields ).lower ())
1182+
1183+ N = len (all_docs )
1184+ avgdl = sum (len (t .split ()) for t in corpus_texts ) / N if N else 1.0
1185+
1186+ # Document-frequency per query term
1187+ df : dict = {}
1188+ for term in query_terms :
1189+ for text in corpus_texts :
1190+ if term in text .split ():
1191+ df [term ] = df .get (term , 0 ) + 1
1192+
1193+ # IDF (Robertson-Spärck Jones form; the +1 is added inside the log so the value is never negative)
1194+ idf : dict = {
1195+ term : math .log ((N - df .get (term , 0 ) + 0.5 ) / (df .get (term , 0 ) + 0.5 ) + 1 )
1196+ for term in query_terms
1197+ }
1198+
1199+ scored_docs = []
1200+ for doc , text in zip (all_docs , corpus_texts ):
1201+ tokens = text .split ()
1202+ dl = len (tokens )
1203+ metadata = doc .get ("metadata" , {})
1204+
1205+ # Build TF map
1206+ tf_map : dict = {}
1207+ for tok in tokens :
1208+ tf_map [tok ] = tf_map .get (tok , 0 ) + 1
1209+
1210+ # BM25 score
11451211 score = 0.0
11461212 for term in query_terms :
1147- if term in combined_text :
1148- score += combined_text .count (term )
1149-
1213+ tf = tf_map .get (term , 0 )
1214+ if tf == 0 :
1215+ continue
1216+ numerator = tf * (K1 + 1 )
1217+ denominator = tf + K1 * (1 - B + B * dl / avgdl )
1218+ score += idf [term ] * numerator / denominator
1219+
11501220 if score > 0 :
1151- # Apply metadata filter
1152- if request .filter :
1153- if not self ._matches_filter (metadata , request .filter ):
1154- continue
1221+ if request .filter and not self ._matches_filter (metadata , request .filter ):
1222+ continue
11551223 scored_docs .append ((score , doc ))
1156-
1157- # Sort by score descending
1224+
1225+ # Sort by BM25 score descending
11581226 scored_docs .sort (key = lambda x : x [0 ], reverse = True )
1159-
1227+
11601228 # Take top k
11611229 top_k = scored_docs [:request .k ]
1162-
1230+
11631231 # Build results
11641232 results = []
11651233 for score , doc in top_k :
@@ -1170,9 +1238,9 @@ def _keyword_search(self, request: SearchRequest) -> SearchResults:
11701238 vector = doc .get ("vector" ) if request .include_vectors else None ,
11711239 )
11721240 results .append (result )
1173-
1241+
11741242 query_time_ms = (time .time () - start_time ) * 1000
1175-
1243+
11761244 return SearchResults (
11771245 results = results ,
11781246 total_count = len (scored_docs ),
0 commit comments