Skip to content

Commit f126bf0

Browse files
committed
refactor: Improve string comparison robustness
* Fix undefined token inclusion caused by token lists with differing lengths
* Always compare the longer string to the shorter string so the sameness result is invariant to parameter order
* Add comments to make the logic easier to understand
* Add tests for the new functionality
1 parent 064a435 commit f126bf0

File tree

3 files changed

+80
-18
lines changed

3 files changed

+80
-18
lines changed

src/backend/tests/scrobbler/scrobblers.test.ts

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -285,7 +285,6 @@ describe('Detects duplicate and unique scrobbles from client recent history', fu
285285

286286
const sonDiffPlay = clone(son);
287287
sonDiffPlay.data.playDate = sonDiffPlay.data.playDate.subtract(son.data.duration + 1, 's');
288-
sonDiffPlay.data.artists = [sonDiffPlay.data.artists[1]]
289288
assert.isTrue(await testScrobbler.alreadyScrobbled(sonDiffPlay));
290289
});
291290

src/backend/tests/strings.test.ts

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,4 +73,25 @@ describe('String Comparisons', function () {
7373
assert.isAtMost( result.highScore, 99, `Comparing: '${test[0]}' | '${test[1]}'`);
7474
}
7575
});
76+
77+
it('should handle strings with different lengths', async function () {
78+
const tests = [
79+
['The Amazing Bongo Hop', 'The Bongo Hop'],
80+
]
81+
82+
for(const test of tests) {
83+
const result = compareNormalizedStrings(test[0], test[1]);
84+
assert.isAtMost( result.highScore, 53, `Comparing: '${test[0]}' | '${test[1]}'`);
85+
}
86+
});
87+
88+
it('should be string parameter order invariant', async function () {
89+
const longerString = 'Nidia Gongora TEST';
90+
const shorterString = 'Nidia Gongora'
91+
92+
const result1 = compareNormalizedStrings(longerString, shorterString);
93+
const result2 = compareNormalizedStrings(shorterString, longerString);
94+
95+
assert.equal( result1.highScore, result2.highScore, `Comparing: '${longerString}' | '${shorterString}'`);
96+
});
7697
});

src/backend/utils/StringUtils.ts

Lines changed: 59 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -167,23 +167,55 @@ export const compareScrobbleArtists = (existing: PlayObject, candidate: PlayObje
167167
return compareNormalizedStrings(existingArtists.reduce((acc, curr) => `${acc} ${curr}`, ''), candidateArtists.reduce((acc, curr) => `${acc} ${curr}`, '')).highScore;
168168
}
169169

170+
/**
171+
* Compare the sameness of two strings after making them token-order independent
172+
*
173+
* Transform two strings before comparing in order to have as little difference between them as possible:
174+
*
175+
* * First, normalize (lower case, remove extraneous whitespace, remove punctuation, make all characters standard ANSI) strings and split into tokens
176+
* * Second, reorder tokens in the shorter list so that they mirror order of tokens in longer list as closely as possible
177+
* * Finally, concat back to strings and compare with sameness strategies
178+
*
179+
* */
170180
export const compareNormalizedStrings = (existing: string, candidate: string): StringSamenessResult => {
171181

182+
// there may be scenarios where a track differs in *ordering* of ancillary information between sources
183+
// EX My Track (feat. Art1, Art2) -- My Track (feat. Art2 Art1)
184+
185+
// first lower-case the strings, remove extraneous whitespace and punctuation, and replace non-ansi with ansi characters
172186
const normalExisting = normalizeStr(existing, {keepSingleWhitespace: true});
173187
const normalCandidate = normalizeStr(candidate, {keepSingleWhitespace: true});
174188

175-
// there may be scenarios where a track differs in *ordering* of ancillary information between sources
176-
// EX My Track (feat. Art1, Art2) -- My Track (feat. Art2 Art1)
177-
// so instead of naively comparing the entire track string against the candidate we
178-
// * first try to match up all white-space separated tokens
179-
// * recombine with closest tokens in order
180-
// * then check sameness
189+
// split by "token"
181190
const eTokens = normalExisting.split(' ');
182191
const cTokens = normalCandidate.split(' ');
183192

184-
const orderedCandidateTokens = eTokens.reduce((acc: { ordered: string[], remaining: string[] }, curr) => {
193+
194+
let longerTokens: string[],
195+
shorterTokens: string[];
196+
197+
if (eTokens.length > cTokens.length) {
198+
longerTokens = eTokens;
199+
shorterTokens = cTokens;
200+
} else {
201+
longerTokens = cTokens;
202+
shorterTokens = eTokens;
203+
}
204+
205+
// we will use the longer string (token list) as the reducer and order the shorter list to match it
206+
// so we don't have to deal with undefined positions in the shorter list
207+
208+
const orderedCandidateTokens = longerTokens.reduce((acc: { ordered: string[], remaining: string[] }, curr) => {
209+
// if we've run out of tokens in the shorter list just return
210+
if (acc.remaining.length === 0) {
211+
return acc;
212+
}
213+
214+
// on each iteration of tokens in the long list
215+
// we iterate through remaining tokens from the shorter list and find the token with the most sameness
216+
185217
let highScore = 0;
186-
let highIndex = undefined;
218+
let highIndex = 0;
187219
let index = 0;
188220
for (const token of acc.remaining) {
189221
const result = stringSameness(curr, token);
@@ -194,18 +226,28 @@ export const compareNormalizedStrings = (existing: string, candidate: string): S
194226
index++;
195227
}
196228

229+
// then remove the most same token from the remaining short list tokens
197230
const splicedRemaining = [...acc.remaining];
198231
splicedRemaining.splice(highIndex, 1);
199232

200-
return {ordered: acc.ordered.concat(acc.remaining[highIndex]), remaining: splicedRemaining};
201-
}, {ordered: [], remaining: cTokens});
202-
203-
const allOrderedCandidateTokens = orderedCandidateTokens.ordered.concat(orderedCandidateTokens.remaining);
204-
const orderedCandidateString = allOrderedCandidateTokens.join(' ');
205-
206-
// since we have already "matched" up words by order we don't want to use cosine strat
233+
return {
234+
// finally add the most same token to the ordered short list
235+
ordered: acc.ordered.concat(acc.remaining[highIndex]),
236+
// and return the remaining short list tokens
237+
remaining: splicedRemaining
238+
};
239+
}, {
240+
// "ordered" is the result of ordering tokens in the shorter list to match longer token order
241+
ordered: [],
242+
// remaining is the initial shorter list
243+
remaining: shorterTokens
244+
});
245+
246+
// since we have already "matched" up tokens by order we don't want to use cosine strat
207247
// bc it only does comparisons between whole words in a sentence (instead of all letters in a string)
208248
// which makes it inaccurate for small-n sentences and typos
209-
210-
return stringSameness(normalExisting, orderedCandidateString, {transforms: [], strategies: [levenStrategy, diceStrategy]});
249+
return stringSameness(longerTokens.join(' '), orderedCandidateTokens.ordered.join(' '), {
250+
transforms: [],
251+
strategies: [levenStrategy, diceStrategy]
252+
})
211253
}

0 commit comments

Comments
 (0)