From 9d20008cde6022f8678f7e2a736c76b6c332e751 Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Fri, 8 Dec 2023 08:49:43 -0800
Subject: [PATCH] Add: Levenshtein distance benchmarks

---
 README.md                                |  4 +-
 scripts/bench.ipynb                      |  9 +++
 scripts/bench_levenshtein.py             | 93 ++++++++++++++++++++++++
 scripts/{bench.py => bench_substring.py} |  0
 4 files changed, 104 insertions(+), 2 deletions(-)
 create mode 100644 scripts/bench_levenshtein.py
 rename scripts/{bench.py => bench_substring.py} (100%)

diff --git a/README.md b/README.md
index 9d3467f2..3f391821 100644
--- a/README.md
+++ b/README.md
@@ -179,13 +179,13 @@ npm install && npm test
 To benchmark on some custom file and pattern combinations:
 
 ```sh
-python scripts/bench.py --haystack_path "your file" --needle "your pattern"
+python scripts/bench_substring.py --haystack_path "your file" --needle "your pattern"
 ```
 
 To benchmark on synthetic data:
 
 ```sh
-python scripts/bench.py --haystack_pattern "abcd" --haystack_length 1e9 --needle "abce"
+python scripts/bench_substring.py --haystack_pattern "abcd" --haystack_length 1e9 --needle "abce"
 ```
 
 ### Packaging
diff --git a/scripts/bench.ipynb b/scripts/bench.ipynb
index 52ae5e56..95edd753 100644
--- a/scripts/bench.ipynb
+++ b/scripts/bench.ipynb
@@ -1,5 +1,14 @@
 {
  "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!wget -O leipzig1M.txt https://introcs.cs.princeton.edu/python/42sort/leipzig1m.txt"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
diff --git a/scripts/bench_levenshtein.py b/scripts/bench_levenshtein.py
new file mode 100644
index 00000000..69aa5cb9
--- /dev/null
+++ b/scripts/bench_levenshtein.py
@@ -0,0 +1,93 @@
+# Benchmark for Levenshtein distance computation for most popular Python libraries.
+# Prior to benchmarking, downloads a file with tokens and runs a small fuzzy test,
+# comparing the outputs of different libraries.
+#
+# Downloading commonly used datasets:
+# !wget --no-clobber -O ./leipzig1M.txt https://introcs.cs.princeton.edu/python/42sort/leipzig1m.txt
+#
+# Install the libraries:
+# !pip install python-levenshtein  # 4.8 M/mo: https://github.com/maxbachmann/python-Levenshtein
+# !pip install levenshtein # 4.2 M/mo: https://github.com/maxbachmann/Levenshtein
+# !pip install jellyfish # 2.3 M/mo: https://github.com/jamesturk/jellyfish/
+# !pip install editdistance # 700 k/mo: https://github.com/roy-ht/editdistance
+# !pip install distance # 160 k/mo: https://github.com/doukremt/distance
+# !pip install polyleven # 34 k/mo: https://github.com/fujimotos/polyleven
+
+import time
+import random
+import multiprocessing as mp
+
+import fire
+
+import stringzilla as sz
+import polyleven as pl
+import editdistance as ed
+import jellyfish as jf
+import Levenshtein as le
+
+
+def log(name: str, bytes_length: int, operator: callable):
+    a = time.time_ns()
+    checksum = operator()
+    b = time.time_ns()
+    secs = (b - a) / 1e9
+    gb_per_sec = bytes_length / (1e9 * secs)
+    print(
+        f"{name}: took {secs:.2f} seconds ~ {gb_per_sec:.3f} GB/s - checksum is {checksum:,}"
+    )
+
+
+def compute_distances(func, words, sample_words) -> int:
+    result = 0
+    for word in sample_words:
+        for other in words:
+            result += func(word, other)
+    return result
+
+
+def log_distances(name, func, words, sample_words) -> int:
+    total_bytes = sum(len(w) for w in words) * len(sample_words)
+    log(name, total_bytes, lambda: compute_distances(func, words, sample_words))
+
+
+def bench(text_path: str = None, threads: int = 0):
+    text: str = open(text_path, "r").read()
+    words: list = text.split(" ")
+
+    targets = (
+        ("levenshtein", le.distance),
+        ("stringzilla", sz.levenshtein),
+        ("polyleven", pl.levenshtein),
+        ("editdistance", ed.eval),
+        ("jellyfish", jf.levenshtein_distance),
+    )
+
+    # Fuzzy Test
+    for _ in range(100):  # Test 100 random pairs
+        word1, word2 = random.sample(words, 2)
+        results = [func(word1, word2) for _, func in targets]
+        assert all(
+            r == results[0] for r in results
+        ), f"Inconsistent results for pair {word1}, {word2}"
+
+    print("Fuzzy test passed. All libraries returned consistent results.")
+
+    # Run the Benchmark
+    sample_words = random.sample(words, 100)  # Sample 100 words for benchmarking
+
+    if threads == 1:
+        for name, func in targets:
+            log_distances(name, func, words, sample_words)
+    else:
+        processes = []
+        for name, func in targets:
+            p = mp.Process(target=log_distances, args=(name, func, words, sample_words))
+            processes.append(p)
+            p.start()
+
+        for p in processes:
+            p.join()
+
+
+if __name__ == "__main__":
+    fire.Fire(bench)
diff --git a/scripts/bench.py b/scripts/bench_substring.py
similarity index 100%
rename from scripts/bench.py
rename to scripts/bench_substring.py