Skip to content

Commit

Permalink
decomplexify Levenshtein computation
Browse files Browse the repository at this point in the history
  • Loading branch information
sebastianMindee committed Sep 20, 2024
1 parent 7f591e0 commit 2a8769c
Showing 1 changed file with 2 additions and 46 deletions.
48 changes: 2 additions & 46 deletions tests/utils.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from difflib import SequenceMatcher
from pathlib import Path

from mindee.mindee_http.mindee_api import (
Expand Down Expand Up @@ -27,56 +28,11 @@ def dummy_envvars(monkeypatch) -> None:
# Relative path (resolved against the current working directory) to the
# "extras" folder under tests/data.
EXTRAS_DIR = Path("./tests/data/extras/")


def levenshtein_distance(reference_str: str, target_str: str) -> int:
    """
    Calculate the Levenshtein distance between two strings.

    The Levenshtein distance is a measure of the difference between two sequences.
    Informally, the Levenshtein distance between two words is the minimum number
    of single-character edits (insertions, deletions or substitutions) required
    to change one word into the other.

    :param reference_str: The reference string.
    :param target_str: The target string.
    :return: The distance between the two strings.
    """
    # Trivial cases: identical strings need no edits, and turning an empty
    # string into another one costs exactly one insertion per character.
    if reference_str == target_str:
        return 0
    if not reference_str:
        return len(target_str)
    if not target_str:
        return len(reference_str)

    # previous_row[j] is the distance between the already-processed prefix of
    # reference_str and target_str[:j]. Only two rows of the full matrix are
    # ever needed at once.
    previous_row = list(range(len(target_str) + 1))

    for row_index, reference_char in enumerate(reference_str, start=1):
        # Column 0: deleting every character of the reference prefix.
        current_row = [row_index]

        for col_index, target_char in enumerate(target_str):
            deletion_cost = previous_row[col_index + 1] + 1
            insertion_cost = current_row[col_index] + 1
            # Substitution is free when the two characters already match.
            substitution_cost = previous_row[col_index] + (
                0 if reference_char == target_char else 1
            )
            current_row.append(
                min(deletion_cost, insertion_cost, substitution_cost)
            )

        previous_row = current_row

    return previous_row[-1]


def levenshtein_ratio(ref_str: str, target_str: str) -> float:
    """
    Calculate the Levenshtein ratio between two strings.

    The ratio is ``1 - distance / max(len(ref_str), len(target_str))``, so it is
    1.0 for identical strings and 0.0 when every position has to be edited.

    :param ref_str: Reference string.
    :param target_str: Target string.
    :return: Ratio between the two strings, in the range [0.0, 1.0].
    """
    max_len = max(len(ref_str), len(target_str))

    # Two empty strings are identical; this also avoids dividing by zero below.
    if max_len == 0:
        return 1.0

    # NOTE: a second, unreachable ``return SequenceMatcher(...).ratio()`` used to
    # follow this statement. It was dropped rather than promoted because
    # difflib's ratio (2 * matches / total length) is a different metric from
    # the Levenshtein ratio this function is named and documented for.
    return 1.0 - (levenshtein_distance(ref_str, target_str) / max_len)

0 comments on commit 2a8769c

Please sign in to comment.