Skip to content

Commit

Permalink
Improved distance calculation speed when a maximum cost is set.
Browse files Browse the repository at this point in the history
- Reduced complexity from O(max(|s1|,|s2|)*maxCost) to O(min(|s1|,|s2|)*maxCost).
- Bypass the calculation when distance is guaranteed to be greater than maxCost.
- Added tests for more edge cases.
  • Loading branch information
alex-alrux committed Apr 13, 2016
1 parent 3e941b9 commit 58b6ed8
Show file tree
Hide file tree
Showing 3 changed files with 63 additions and 28 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ This package implements distance and similarity metrics for strings, based on th

[![Build Status](https://travis-ci.org/agext/levenshtein.svg?branch=master)](https://travis-ci.org/agext/levenshtein)

v1.1 Stable: Guaranteed no breaking changes to the API in future v1.x releases. No known bugs or performance issues. Probably safe to use in production, though provided on an "AS IS" basis.
v1.2 Stable: Guaranteed no breaking changes to the API in future v1.x releases. No known bugs or performance issues. Probably safe to use in production, though provided on an "AS IS" basis.

## Overview

Expand Down
83 changes: 58 additions & 25 deletions levenshtein.go
Original file line number Diff line number Diff line change
Expand Up @@ -66,44 +66,60 @@ func Calculate(str1, str2 []rune, maxCost, insCost, subCost, delCost int) (dist,
return
}

// prefer the shorter string first, to minimize space;
// a swap also transposes the meanings of insertion and deletion.
if l1 > l2 {
str1, str2, l1, l2, insCost, delCost = str2, str1, l2, l1, delCost, insCost
}
d := make([]int, l1+1)

// variables used in inner "for" loops
var y, dy, c, l int

// if maxCost is higher than the maximum possible distance, it's equivalent to 'unlimited'
// if maxCost is greater than or equal to the maximum possible distance, it's equivalent to 'unlimited'
if maxCost > 0 {
if subCost < delCost+insCost {
if maxCost > l1*subCost+(l2-l1)*insCost {
if maxCost >= l1*subCost+(l2-l1)*insCost {
maxCost = 0
}
} else {
if maxCost > l1*delCost+l2*insCost {
if maxCost >= l1*delCost+l2*insCost {
maxCost = 0
}
}
}

if maxCost > 0 {
// prefer the longer string first, to minimize time;
// a swap also transposes the meanings of insertion and deletion.
if l1 < l2 {
str1, str2, l1, l2, insCost, delCost = str2, str1, l2, l1, delCost, insCost
}

// the length differential times cost of deletion is a lower bound for the cost;
// if it is higher than the maxCost, there is no point going into the main calculation.
if dist = (l1 - l2) * delCost; dist > maxCost {
return
}

d := make([]int, l1+1)

// offset and length of d in the current row
do, dl := 0, 1
for y, dy = 1, delCost; y <= l1 && dy <= maxCost; dl++ {
doff, dlen := 0, 1
for y, dy = 1, delCost; y <= l1 && dy <= maxCost; dlen++ {
d[y] = dy
y++
dy = y * delCost
}
// fmt.Printf("%q -> %q: init doff=%d dlen=%d d[%d:%d]=%v\n", str1, str2, doff, dlen, doff, doff+dlen, d[doff:doff+dlen])

for x := 0; x < l2; x++ {
dy, d[do] = d[do], d[do]+insCost
if l = do + dl; l > l1 {
l = l1
dy, d[doff] = d[doff], d[doff]+insCost
for d[doff] > maxCost && dlen > 0 {
if str1[doff] != str2[x] {
dy += subCost
}
doff++
dlen--
if c = d[doff] + insCost; c < dy {
dy = c
}
dy, d[doff] = d[doff], dy
}
for y = do; y < l; dy, d[y] = d[y], dy {
for y, l = doff, doff+dlen-1; y < l; dy, d[y] = d[y], dy {
if str1[y] != str2[x] {
dy += subCost
}
Expand All @@ -114,25 +130,42 @@ func Calculate(str1, str2 []rune, maxCost, insCost, subCost, delCost int) (dist,
if c = d[y] + insCost; c < dy {
dy = c
}
if dy > maxCost {
dl = y - do
break
}
}
for d[do] > maxCost {
do++
dl--
if y < l1 {
if str1[y] != str2[x] {
dy += subCost
}
if c = d[y] + delCost; c < dy {
dy = c
}
for ; dy <= maxCost && y < l1; dy, d[y] = dy+delCost, dy {
y++
dlen++
}
}
if dl == 0 {
// fmt.Printf("%q -> %q: x=%d doff=%d dlen=%d d[%d:%d]=%v\n", str1, str2, x, doff, dlen, doff, doff+dlen, d[doff:doff+dlen])
if dlen == 0 {
dist = maxCost + 1
return
}
}
if doff+dlen-1 < l1 {
dist = maxCost + 1
return
}
dist = d[l1]
} else {
// ToDo: This is O(l1*l2) time and O(min(l1,l2)) space; investigate whether it is
// worth implementing the diagonal approach - O(l1*(1+dist)) time, up to O(l1*l2) space
// http://www.csse.monash.edu.au/~lloyd/tildeStrings/Alignment/92.IPL.html

// prefer the shorter string first, to minimize space; time is O(l1*l2) anyway;
// a swap also transposes the meanings of insertion and deletion.
if l1 > l2 {
str1, str2, l1, l2, insCost, delCost = str2, str1, l2, l1, delCost, insCost
}
d := make([]int, l1+1)

for y = 1; y <= l1; y++ {
d[y] = y * delCost
}
Expand All @@ -151,9 +184,9 @@ func Calculate(str1, str2 []rune, maxCost, insCost, subCost, delCost int) (dist,
}
}
}
dist = d[l1]
}

dist = d[l1]
return
}

Expand Down
6 changes: 4 additions & 2 deletions levenshtein_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -125,14 +125,16 @@ func Test_Metrics(t *testing.T) {
{"passwor", "password", " (D=2)", NewParams().DelCost(2), e{1, 7, 0, 7.0 / 8, 7.4 / 8}},

// When setting a maxCost (should not affect Similarity() and Match())...
{"password", "pass1", "(maxCost=1)", NewParams().MaxCost(1), e{2, 4, 0, 4. / 8, 4. / 8}},
{"password", "1password2", "(maxCost=6)", NewParams().MaxCost(6), e{2, 0, 0, 8. / 10, 8. / 10}},
{"password", "pass1234", "(maxCost=1)", NewParams().MaxCost(1), e{2, 4, 0, 4. / 8, 4. / 8}},
{"pass1word", "passwords1", "(maxCost=2)", NewParams().MaxCost(2), e{3, 4, 0, 7. / 10, 8.2 / 10}},
{"password", "1234", " (D=2,maxCost=1)", NewParams().DelCost(2).MaxCost(1), e{2, 0, 0, 0, 0}},
{"password", "1passwo", " (D=2,maxCost=1)", NewParams().DelCost(2).MaxCost(1), e{2, 0, 0, 4. / 9, 4. / 9}},
{"pwd", "password", " (I=0,maxCost=0)", NewParams().InsCost(0).MaxCost(0), e{0, 1, 1, 1, 1}},
{"passXword", "password", "(maxCost=10)", NewParams().MaxCost(10), e{1, 4, 4, 8. / 9, 8.4 / 9}},
{"passXord", "password", "(S=3,maxCost=17)", NewParams().SubCost(3).MaxCost(17), e{2, 4, 3, 14. / 16, 14.8 / 16}},
// ... no change because the Calculate() result is determined without entering the main algorithm:
{"password", "pass", "(maxCost=1)", NewParams().MaxCost(1), e{4, 4, 0, 4. / 8, 4. / 8}},
{"password", "1234", " (D=2,maxCost=1)", NewParams().DelCost(2).MaxCost(1), e{8, 0, 0, 0, 0}},

// When setting a minScore (should not affect Calculate() and Distance())...
{"password", "pass1", "(minScore=0.3)", NewParams().MinScore(.3), e{4, 4, 0, 4. / 8, 4. / 8}},
Expand Down

0 comments on commit 58b6ed8

Please sign in to comment.