Skip to content

Commit

Permalink
reduce-bm25-memory-usage
Browse files Browse the repository at this point in the history
  • Loading branch information
raphaelsty committed Jun 2, 2024
1 parent 6564878 commit e99e0ac
Showing 1 changed file with 7 additions and 4 deletions.
11 changes: 7 additions & 4 deletions python/lenlp/sparse/bm25_vectorizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,10 @@ def update(self, matrix: csr_matrix) -> csr_matrix:
self.count = matrix.shape[0]

self.idf = np.squeeze(
- a=np.asarray(a=np.log((self.count - self.tf + 0.5) / (self.tf + 0.5) + 1))
+ a=np.asarray(
+ a=np.log((self.count - self.tf + 0.5) / (self.tf + 0.5) + 1),
+ dtype=np.float32,
+ )
)

def _transform(self, matrix: csr_matrix) -> csr_matrix:
Expand All @@ -106,11 +109,11 @@ def _transform(self, matrix: csr_matrix) -> csr_matrix:
)
)

- numerator = matrix.copy()
denominator = matrix.copy().tocsc()
- numerator.data = numerator.data * (self.k1 + 1)
denominator.data += np.take(a=regularization, indices=denominator.indices)
- matrix.data = (numerator.data / denominator.tocsr().data) + self.epsilon
+ matrix.data = (
+ (matrix.data * (self.k1 + 1)) / denominator.tocsr().data
+ ) + self.epsilon

matrix = matrix.multiply(other=self.idf).tocsr()
inplace_csr_row_normalize_l2(matrix)
Expand Down

0 comments on commit e99e0ac

Please sign in to comment.