Skip to content

Commit

Permalink
Sparse dot fix (#22)
Browse files Browse the repository at this point in the history
  • Loading branch information
MaartenGr authored Jun 8, 2021
1 parent a60dfc6 commit d2ef5b2
Show file tree
Hide file tree
Showing 4 changed files with 33 additions and 10 deletions.
6 changes: 4 additions & 2 deletions docs/releases.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
v0.3.1
- Fix exploding memory usage when using `top_n`

v0.3.0
- Use `top_n` in `polyfuzz.models.TFIDF` and `polyfuzz.models.Embeddings`

Expand All @@ -16,8 +19,7 @@ v0.1.0
- More thorough documentation
- Prepare for public release

v0.0.1

v0.0.1
- First release of `PolyFuzz`
- Matching through:
- Edit Distance
Expand Down
2 changes: 1 addition & 1 deletion polyfuzz/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
from .polyfuzz import PolyFuzz
__version__ = "0.3.0"
__version__ = "0.3.1"
33 changes: 27 additions & 6 deletions polyfuzz/models/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,9 +85,9 @@ def cosine_similarity(from_vector: np.ndarray,
similarity_matrix.setdiag(0.)
similarity_matrix = similarity_matrix.tocsr()

indices = np.flip(np.argsort(similarity_matrix.toarray(), axis=-1), axis=1)[:, :top_n]
similarities = np.flip(np.sort(similarity_matrix.toarray(), axis=-1), axis=1)[:, :top_n]
similarities = [np.round(similarities[:, i], 3) for i in range(similarities.shape[1])]
indices = _top_n_idx_sparse(similarity_matrix, top_n)
similarities = _top_n_similarities_sparse(similarity_matrix, top_n, indices)
indices = np.array(np.nan_to_num(np.array(indices, dtype=np.float), nan=0), dtype=np.int)

# Faster than knn and slower than sparse but uses more memory
else:
Expand All @@ -101,9 +101,9 @@ def cosine_similarity(from_vector: np.ndarray,
similarities = [np.round(similarities[:, i], 3) for i in range(similarities.shape[1])]

# Convert results to df
columns = (["From"] +
["To" if i == 0 else f"To_{i+1}" for i in range(top_n)] +
["Similarity" if i ==0 else f"Similarity_{i+1}" for i in range(top_n)]); columns
columns = (["From"] +
["To" if i == 0 else f"To_{i+1}" for i in range(top_n)] +
["Similarity" if i ==0 else f"Similarity_{i+1}" for i in range(top_n)])
matches = [[to_list[idx] for idx in indices[:, i]] for i in range(indices.shape[1])]
matches = pd.DataFrame(np.vstack(([from_list], matches, similarities)).T, columns = columns)

Expand All @@ -119,3 +119,24 @@ def cosine_similarity(from_vector: np.ndarray,
matches.loc[matches[column] < 0.001, column.replace("Similarity", "To")] = None

return matches


def _top_n_idx_sparse(matrix, n):
""" Return index of top n values in each row of a sparse matrix """
top_n_idx = []
for le, ri in zip(matrix.indptr[:-1], matrix.indptr[1:]):
n_row_pick = min(n, ri - le)
values = list(matrix.indices[le + np.argpartition(matrix.data[le:ri], -n_row_pick)[-n_row_pick:]])[::-1]
values = [values[index] if len(values) >= index + 1 else None for index in range(n)]
top_n_idx.append(values)
return np.array(top_n_idx)


def _top_n_similarities_sparse(matrix, n, indices):
""" Return similarity scores of top n values in each row of a sparse matrix """
similarity_scores = []
for row, values in enumerate(indices):
scores = [round(matrix[row, value], n) if value is not None else 0 for value in values]
similarity_scores.append(scores)
similarity_scores = np.array(similarity_scores).T
return similarity_scores
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@
setup(
name="polyfuzz",
packages=find_packages(exclude=["notebooks", "docs"]),
version="0.3.0",
version="0.3.1",
author="Maarten Grootendorst",
author_email="maartengrootendorst@gmail.com",
description="PolyFuzz performs fuzzy string matching, grouping, and evaluation.",
Expand Down

0 comments on commit d2ef5b2

Please sign in to comment.