Skip to content

Commit

Permalink
update test to include levenshtein difference
Browse files Browse the repository at this point in the history
  • Loading branch information
sebastianMindee committed Sep 20, 2024
1 parent de86c78 commit b3cd932
Show file tree
Hide file tree
Showing 6 changed files with 53 additions and 1 deletion.
Empty file added tests/api/__init__.py
Empty file.
3 changes: 2 additions & 1 deletion tests/extraction/test_invoice_splitter_auto_extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from mindee.product import InvoiceSplitterV1, InvoiceV4
from tests.product import get_id, get_version
from tests.test_inputs import PRODUCT_DATA_DIR
from tests.utils import levenshtein_ratio


@pytest.fixture
Expand Down Expand Up @@ -52,4 +53,4 @@ def test_pdf_should_extract_invoices_strict():
PRODUCT_DATA_DIR / "invoices" / "response_v4" / "summary_full_invoice_p1.rst",
invoice_0.document,
)
assert test_string_rst_invoice_0 == str(invoice_0.document)
assert levenshtein_ratio(test_string_rst_invoice_0, str(invoice_0.document)) >= 0.97
Empty file added tests/input/__init__.py
Empty file.
File renamed without changes.
Empty file added tests/mindee_http/__init__.py
Empty file.
51 changes: 51 additions & 0 deletions tests/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,3 +25,54 @@ def dummy_envvars(monkeypatch) -> None:


# Location of the "extras" test data. The path is relative, so it resolves
# against the current working directory — presumably the repository root when
# the test suite runs; TODO confirm.
EXTRAS_DIR = Path("./tests/data/extras/")


def levenshtein_distance(reference_str: str, target_str: str) -> int:
    """
    Calculate the Levenshtein distance between two strings.

    The Levenshtein distance is the minimum number of single-character edits
    (insertions, deletions or substitutions) required to change one string
    into the other.

    :param reference_str: The reference string.
    :param target_str: The target string.
    :return: The distance between the two strings (0 when they are equal).
    """
    # Identical strings (including two empty ones) need no edits, so skip the
    # quadratic table walk entirely.
    if reference_str == target_str:
        return 0

    # Only two rows of the edit-distance table are needed at any time.
    # previous_row[j] holds the distance between the reference prefix handled
    # so far and target_str[:j]; initially that prefix is empty, so the cost
    # is simply j insertions.
    previous_row = list(range(len(target_str) + 1))

    for row_index, reference_char in enumerate(reference_str, start=1):
        # Column 0: turning row_index reference characters into an empty
        # target takes row_index deletions.
        current_row = [row_index]

        for col_index, target_char in enumerate(target_str):
            deletion_cost = previous_row[col_index + 1] + 1
            insertion_cost = current_row[col_index] + 1
            # A matching character costs nothing; a mismatch costs one edit.
            substitution_cost = previous_row[col_index] + (
                reference_char != target_char
            )
            current_row.append(
                min(deletion_cost, insertion_cost, substitution_cost)
            )

        previous_row = current_row

    return previous_row[-1]


def levenshtein_ratio(ref_str: str, target_str: str) -> float:
    """
    Compute a normalized similarity score from the Levenshtein distance.

    The score is ``1 - distance / length_of_longer_string``, so it lies
    between 0.0 and 1.0, with 1.0 meaning the strings are identical. Two
    empty strings are treated as identical.

    :param ref_str: Reference string.
    :param target_str: Target string.
    :return: Similarity ratio between the two strings.
    """
    longest_len = max(len(ref_str), len(target_str))

    # Both strings are empty: avoid dividing by zero and report a full match.
    if not longest_len:
        return 1.0

    edit_count = levenshtein_distance(ref_str, target_str)
    return 1.0 - (edit_count / longest_len)

0 comments on commit b3cd932

Please sign in to comment.