From 58b6ed8b40a5d83e32eb430fa7c2407f3c5fe5a6 Mon Sep 17 00:00:00 2001 From: Alex Date: Wed, 13 Apr 2016 15:22:30 -0400 Subject: [PATCH] Improved distance calculation speed when a maximum cost is set. - Reduced complexity from O(max(|s1|,|s2|)*maxCost) to O(min(|s1|,|s2|)*maxCost). - Bypass the calculation when distance is guaranteed to be greater than maxCost. - Added tests for more edge cases. --- README.md | 2 +- levenshtein.go | 83 +++++++++++++++++++++++++++++++-------------- levenshtein_test.go | 6 ++-- 3 files changed, 63 insertions(+), 28 deletions(-) diff --git a/README.md b/README.md index e3e9b78..3a43449 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ This package implements distance and similarity metrics for strings, based on th [![Build Status](https://travis-ci.org/agext/levenshtein.svg?branch=master)](https://travis-ci.org/agext/levenshtein) -v1.1 Stable: Guaranteed no breaking changes to the API in future v1.x releases. No known bugs or performance issues. Probably safe to use in production, though provided on "AS IS" basis. +v1.2 Stable: Guaranteed no breaking changes to the API in future v1.x releases. No known bugs or performance issues. Probably safe to use in production, though provided on "AS IS" basis. ## Overview diff --git a/levenshtein.go b/levenshtein.go index 69ca17d..1444a7b 100644 --- a/levenshtein.go +++ b/levenshtein.go @@ -66,44 +66,60 @@ func Calculate(str1, str2 []rune, maxCost, insCost, subCost, delCost int) (dist, return } - // prefer the shorter string first, to minimize space; - // a swap also transposes the meanings of insertion and deletion. - if l1 > l2 { - str1, str2, l1, l2, insCost, delCost = str2, str1, l2, l1, delCost, insCost - } - d := make([]int, l1+1) - // variables used in inner "for" loops var y, dy, c, l int - // if maxCost is higher than the maximum possible distance, it's equivalent to 'unlimited' + // if maxCost is greater than or equal to the maximum possible distance, it's equivalent to 'unlimited' if maxCost > 0 { if subCost < delCost+insCost { - if maxCost > l1*subCost+(l2-l1)*insCost { + if maxCost >= l1*subCost+(l2-l1)*insCost { maxCost = 0 } } else { - if maxCost > l1*delCost+l2*insCost { + if maxCost >= l1*delCost+l2*insCost { maxCost = 0 } } } if maxCost > 0 { + // prefer the longer string first, to minimize time; + // a swap also transposes the meanings of insertion and deletion. + if l1 < l2 { + str1, str2, l1, l2, insCost, delCost = str2, str1, l2, l1, delCost, insCost + } + + // the length differential times cost of deletion is a lower bound for the cost; + // if it is higher than the maxCost, there is no point going into the main calculation. + if dist = (l1 - l2) * delCost; dist > maxCost { + return + } + + d := make([]int, l1+1) + // offset and length of d in the current row - do, dl := 0, 1 - for y, dy = 1, delCost; y <= l1 && dy <= maxCost; dl++ { + doff, dlen := 0, 1 + for y, dy = 1, delCost; y <= l1 && dy <= maxCost; dlen++ { d[y] = dy y++ dy = y * delCost } + // fmt.Printf("%q -> %q: init doff=%d dlen=%d d[%d:%d]=%v\n", str1, str2, doff, dlen, doff, doff+dlen, d[doff:doff+dlen]) for x := 0; x < l2; x++ { - dy, d[do] = d[do], d[do]+insCost - if l = do + dl; l > l1 { - l = l1 + dy, d[doff] = d[doff], d[doff]+insCost + for d[doff] > maxCost && dlen > 0 { + if str1[doff] != str2[x] { + dy += subCost + } + doff++ + dlen-- + if c = d[doff] + insCost; c < dy { + dy = c + } + dy, d[doff] = d[doff], dy } - for y = do; y < l; dy, d[y] = d[y], dy { + for y, l = doff, doff+dlen-1; y < l; dy, d[y] = d[y], dy { if str1[y] != str2[x] { dy += subCost } @@ -114,25 +130,42 @@ func Calculate(str1, str2 []rune, maxCost, insCost, subCost, delCost int) (dist, if c = d[y] + insCost; c < dy { dy = c } - if dy > maxCost { - dl = y - do - break - } } - for d[do] > maxCost { - do++ - dl-- + if y < l1 { + if str1[y] != str2[x] { + dy += subCost + } + if c = d[y] + delCost; c < dy { + dy = c + } + for ; dy <= maxCost && y < l1; dy, d[y] = dy+delCost, dy { + y++ + dlen++ + } } - if dl == 0 { + // fmt.Printf("%q -> %q: x=%d doff=%d dlen=%d d[%d:%d]=%v\n", str1, str2, x, doff, dlen, doff, doff+dlen, d[doff:doff+dlen]) + if dlen == 0 { dist = maxCost + 1 return } } + if doff+dlen-1 < l1 { + dist = maxCost + 1 + return + } + dist = d[l1] } else { // ToDo: This is O(l1*l2) time and O(min(l1,l2)) space; investigate if it is // worth to implement diagonal approach - O(l1*(1+dist)) time, up to O(l1*l2) space // http://www.csse.monash.edu.au/~lloyd/tildeStrings/Alignment/92.IPL.html + // prefer the shorter string first, to minimize space; time is O(l1*l2) anyway; + // a swap also transposes the meanings of insertion and deletion. + if l1 > l2 { + str1, str2, l1, l2, insCost, delCost = str2, str1, l2, l1, delCost, insCost + } + d := make([]int, l1+1) + for y = 1; y <= l1; y++ { d[y] = y * delCost } @@ -151,9 +184,9 @@ func Calculate(str1, str2 []rune, maxCost, insCost, subCost, delCost int) (dist, } } } + dist = d[l1] } - dist = d[l1] return } diff --git a/levenshtein_test.go b/levenshtein_test.go index c544e88..7a16548 100644 --- a/levenshtein_test.go +++ b/levenshtein_test.go @@ -125,14 +125,16 @@ func Test_Metrics(t *testing.T) { {"passwor", "password", " (D=2)", NewParams().DelCost(2), e{1, 7, 0, 7.0 / 8, 7.4 / 8}}, // When setting a maxCost (should not affect Similarity() and Match())... - {"password", "pass1", "(maxCost=1)", NewParams().MaxCost(1), e{2, 4, 0, 4. / 8, 4. / 8}}, + {"password", "1password2", "(maxCost=6)", NewParams().MaxCost(6), e{2, 0, 0, 8. / 10, 8. / 10}}, + {"password", "pass1234", "(maxCost=1)", NewParams().MaxCost(1), e{2, 4, 0, 4. / 8, 4. / 8}}, {"pass1word", "passwords1", "(maxCost=2)", NewParams().MaxCost(2), e{3, 4, 0, 7. / 10, 8.2 / 10}}, - {"password", "1234", " (D=2,maxCost=1)", NewParams().DelCost(2).MaxCost(1), e{2, 0, 0, 0, 0}}, + {"password", "1passwo", " (D=2,maxCost=1)", NewParams().DelCost(2).MaxCost(1), e{2, 0, 0, 4. / 9, 4. / 9}}, {"pwd", "password", " (I=0,maxCost=0)", NewParams().InsCost(0).MaxCost(0), e{0, 1, 1, 1, 1}}, {"passXword", "password", "(maxCost=10)", NewParams().MaxCost(10), e{1, 4, 4, 8. / 9, 8.4 / 9}}, {"passXord", "password", "(S=3,maxCost=17)", NewParams().SubCost(3).MaxCost(17), e{2, 4, 3, 14. / 16, 14.8 / 16}}, // ... no change because the Calculate is calculated without getting into the main algorithm: {"password", "pass", "(maxCost=1)", NewParams().MaxCost(1), e{4, 4, 0, 4. / 8, 4. / 8}}, + {"password", "1234", " (D=2,maxCost=1)", NewParams().DelCost(2).MaxCost(1), e{8, 0, 0, 0, 0}}, // When setting a minScore (should not affect Calculate() and Distance())... {"password", "pass1", "(minScore=0.3)", NewParams().MinScore(.3), e{4, 4, 0, 4. / 8, 4. / 8}},