Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add BM25Encoder.update() which updates encoder values with new documents #75

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 35 additions & 0 deletions pinecone_text/sparse/bm25_encoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,41 @@ def fit(self, corpus: List[str]) -> "BM25Encoder":
self.avgdl = sum_doc_len / n_docs
return self

def update(self, new_corpus: List[str]) -> "BM25Encoder":
    """
    Update BM25 by incorporating new documents into the existing model

    Documents for which ``self._tf`` returns no indices are skipped: they
    do not count towards ``n_docs`` and do not contribute to ``avgdl`` or
    ``doc_freq``.

    The encoder state is only modified after the whole batch has been
    processed, so a failure part-way through (e.g. a non-string element)
    leaves ``n_docs``, ``avgdl`` and ``doc_freq`` untouched.

    Args:
        new_corpus: list of new texts to update BM25 with

    Returns:
        The encoder itself, to allow call chaining.

    Raises:
        ValueError: if the encoder has not been fit yet, or if any element
            of ``new_corpus`` is not a string.
    """
    if self.doc_freq is None or self.n_docs is None or self.avgdl is None:
        raise ValueError("BM25 must be fit before updating")

    # Accumulate into locals first; self.* is committed only once the whole
    # batch has been validated.
    n_new_docs = 0
    sum_doc_len = 0
    doc_freq_counter: Counter = Counter()

    for doc in tqdm(new_corpus):
        if not isinstance(doc, str):
            raise ValueError("new_corpus must be a list of strings")

        indices, tf = self._tf(doc)
        if len(indices) == 0:
            # Nothing usable in this document: skip it entirely.
            continue
        n_new_docs += 1
        sum_doc_len += sum(tf)

        # Count the number of documents that contain each token
        # (presumably `indices` holds each token index once per document
        # -- verify against `_tf`).
        doc_freq_counter.update(indices)

    if n_new_docs == 0:
        # No document contributed anything; leave the model unchanged.
        return self

    # Merge the new document frequencies with the existing ones
    for idx, freq in doc_freq_counter.items():
        self.doc_freq[idx] = self.doc_freq.get(idx, 0) + freq

    # Update the average document length. The previous total length is
    # avgdl * n_docs using the count *before* this batch; the number of
    # added documents is n_new_docs, which can be smaller than
    # len(new_corpus) because empty documents are skipped.
    total_docs = self.n_docs + n_new_docs
    self.avgdl = (self.avgdl * self.n_docs + sum_doc_len) / total_docs
    self.n_docs = total_docs

    return self

def encode_documents(
self, texts: Union[str, List[str]]
) -> Union[SparseVector, List[SparseVector]]:
Expand Down
Loading