Add implementation for built-in jaccard similarity #70

Open · wants to merge 6 commits into main
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
@@ -1,7 +1,7 @@
default_language_version:
  python: python3.11

default_stages: [commit, push]
default_stages: [pre-commit, pre-push]

repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
1 change: 1 addition & 0 deletions mismo/text/__init__.py
@@ -10,6 +10,7 @@
    damerau_levenshtein_ratio as damerau_levenshtein_ratio,
)
from mismo.text._similarity import double_metaphone as double_metaphone
from mismo.text._similarity import jaccard as jaccard
from mismo.text._similarity import jaro_similarity as jaro_similarity
from mismo.text._similarity import jaro_winkler_similarity as jaro_winkler_similarity
from mismo.text._similarity import levenshtein_ratio as levenshtein_ratio
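With the re-export above, the new function is importable from the public `mismo.text` namespace. Below is a minimal column-wise usage sketch; it is my own illustration rather than part of the PR, the table and column names are hypothetical, and it assumes a backend such as DuckDB that supports the required string and array operations.

import ibis

from mismo.text import jaccard  # exposed by the re-export added above

# Hypothetical example table, purely for illustration.
t = ibis.memtable({"left": ["foo", "foo bar"], "right": ["food", "bar foo"]})

# Jaccard over character tokens: roughly 0.667 for the first row, 1.0 for the second.
t = t.mutate(sim=jaccard(t.left, t.right, tokenize="by_character"))
print(t.execute())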
74 changes: 72 additions & 2 deletions mismo/text/_similarity.py
@@ -1,9 +1,12 @@
from __future__ import annotations

from typing import Literal

import ibis
from ibis.expr import types as ir

from mismo import _util
from mismo.sets import jaccard as _jaccard


def double_metaphone(s: ir.StringValue) -> ir.ArrayValue[ir.StringValue]:
@@ -109,8 +112,68 @@ def _dist_ratio(s1, s2, dist):
    return (lenmax - dist(s1, s2)) / lenmax


def jaccard(
    s1: ir.StringValue,
    s2: ir.StringValue,
    *,
    tokenize: Literal["by_character", "on_whitespace"],
) -> ir.FloatingValue:
    """The Jaccard similarity between two strings.

    The strings are tokenized as controlled by `tokenize`, duplicates are
    dropped, and the result is the size of the intersection of the two token
    sets divided by the size of their union. Character-level tokenization is
    best suited to cases where character-level variation matters, such as
    typos, short text, or languages without clear word boundaries
    (e.g. Japanese and Chinese). Word-level tokenization is preferable when
    the semantic content of the text matters more than minor variations in
    spelling or syntax.

    Examples
    --------
    >>> import ibis
    >>> from mismo.text import jaccard

    `tokenize='by_character'` replicates the implementation built into DuckDB.

    >>> jaccard(ibis.literal("foo"),
    ...     ibis.literal("foo"), tokenize='by_character').execute()
    np.float64(1.0)
    >>> jaccard(ibis.literal("foo"),
    ...     ibis.literal("food"), tokenize='by_character').execute()
    np.float64(0.6666666666666666)
    >>> jaccard(ibis.null(str),
    ...     ibis.literal("food"), tokenize='by_character').execute()
    np.float64(nan)

    Word-level similarity can be achieved using `tokenize='on_whitespace'`.

    >>> jaccard(ibis.literal("Paris is the capital of France"),
    ...     ibis.literal("The largest city in France is Paris"),
    ...     tokenize='on_whitespace').execute()
    np.float64(0.3)

    In both cases, comparing to an empty string returns a similarity of 0.

    >>> jaccard(ibis.literal("foo"), ibis.literal(""),
    ...     tokenize='on_whitespace').execute()
    np.float64(0.0)
    >>> jaccard(ibis.literal("foo"), ibis.literal(""),
    ...     tokenize='by_character').execute()
    np.float64(0.0)
    """
    if tokenize == "by_character":
        reg = ""
    elif tokenize == "on_whitespace":
        reg = r"\s+"
    else:
        raise ValueError(f"Unexpected value for tokenize: {tokenize!r}")
    s1 = _util.ensure_ibis(s1, "string")
    s2 = _util.ensure_ibis(s2, "string")
    t1 = s1.re_split(reg).unique()
    t2 = s2.re_split(reg).unique()
    return _jaccard(t1, t2)


@ibis.udf.scalar.builtin(name="jaro_similarity")
def _jaro_similarity(s1: str, s2: str) -> float: ...
def _jaro_similarity(s1: str, s2: str) -> float:
    ...


def jaro_similarity(s1: ir.StringValue, s2: ir.StringValue) -> ir.FloatingValue:
@@ -145,7 +208,8 @@ def jaro_similarity(s1: ir.StringValue, s2: ir.StringValue) -> ir.FloatingValue:

# TODO: This isn't portable between backends
@ibis.udf.scalar.builtin(name="jaro_winkler_similarity")
def _jaro_winkler_similarity(s1: str, s2: str) -> float: ...
def _jaro_winkler_similarity(s1: str, s2: str) -> float:
    ...


def jaro_winkler_similarity(s1: ir.StringValue, s2: ir.StringValue) -> ir.FloatingValue:
@@ -179,3 +243,9 @@ def jaro_winkler_similarity(s1: ir.StringValue, s2: ir.StringValue) -> ir.FloatingValue:
    np.float64(0.0)
    """
    return _jaro_winkler_similarity(s1, s2)


if __name__ == "__main__":
    import doctest

    doctest.testmod()
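For reference, the computation above (split into tokens, deduplicate, then take the intersection size over the union size) can be written in plain Python. The sketch below is my own illustration of the semantics, not mismo's implementation; the real code delegates the set step to `mismo.sets.jaccard`, and the empty-vs-empty behaviour shown here is an assumption on my part.

import re

def jaccard_reference(s1: str, s2: str, *, tokenize: str) -> float:
    # Character tokens: every character (including spaces) is a token.
    # Whitespace tokens: split on runs of whitespace, mirroring re_split(r"\s+").
    if tokenize == "by_character":
        a, b = set(s1), set(s2)
    else:
        a, b = set(re.split(r"\s+", s1)), set(re.split(r"\s+", s2))
    if not (a | b):
        return 0.0  # assumed behaviour when both strings are empty
    return len(a & b) / len(a | b)

print(jaccard_reference("foo", "food", tokenize="by_character"))   # 0.666...
print(jaccard_reference("Paris is the capital of France",
                        "The largest city in France is Paris",
                        tokenize="on_whitespace"))                  # 0.3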
38 changes: 38 additions & 0 deletions mismo/text/tests/test_similarity.py
@@ -40,6 +40,44 @@ def test_levenshtein_ratio(string1, string2, expected):
    assert expected == result


@pytest.mark.parametrize(
    "string1,string2,expected",
    [
        ("foo", "foo", 1),
        ("foo bar", "foo", 0.3333),
        ("foo bar", "bar foo", 1),
Owner (review comment):

Let's figure out the semantics first in the main comment thread, but eventually I will want to see

  • empty case
  • NULL case
  • case with repeated elements in one set, e.g. jaccard("foo foo bar", "foo baz") -> 1/3

("foo foo bar", "foo baz", 0.7143),
("foo", "", 0),
(None, "foo", np.nan),
],
)
def test_jaccard_string_similarity_character(string1, string2, expected):
result = text.jaccard(string1, string2, tokenize="by_character").execute()
if np.isnan(expected):
assert np.isnan(result)
else:
assert result == pytest.approx(expected, 0.001)


@pytest.mark.parametrize(
    "string1,string2,expected",
    [
        ("foo", "foo", 1),
        ("foo bar", "foo", 0.5),
        ("foo bar", "bar foo", 1),
        ("foo foo bar", "foo baz", 0.3333),
        ("foo", "", 0),
        (None, "foo", np.nan),
    ],
)
def test_jaccard_string_similarity_word(string1, string2, expected):
    result = text.jaccard(string1, string2, tokenize="on_whitespace").execute()
    if np.isnan(expected):
        assert np.isnan(result)
    else:
        assert result == pytest.approx(expected, 0.001)


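The 0.7143 and 0.3333 expectations above can be checked by hand with ordinary Python sets; the word-token case matches the jaccard("foo foo bar", "foo baz") -> 1/3 semantics requested in the review comment. This is my own arithmetic, added purely for illustration.

# Character tokens: duplicates collapse, and the space counts as a token.
a, b = set("foo foo bar"), set("foo baz")       # 6 and 6 unique characters
print(len(a & b) / len(a | b))                  # 5 / 7 = 0.7142857...

# Whitespace tokens: the repeated "foo" collapses the same way.
a, b = {"foo", "bar"}, {"foo", "baz"}
print(len(a & b) / len(a | b))                  # 1 / 3 = 0.3333...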
@pytest.mark.parametrize(
    "string1,string2,expected",
    [