@@ -167,23 +167,55 @@ export const compareScrobbleArtists = (existing: PlayObject, candidate: PlayObje
167
167
return compareNormalizedStrings ( existingArtists . reduce ( ( acc , curr ) => `${ acc } ${ curr } ` , '' ) , candidateArtists . reduce ( ( acc , curr ) => `${ acc } ${ curr } ` , '' ) ) . highScore ;
168
168
}
169
169
170
+ /**
171
+ * Compare the sameness of two strings after making them token-order independent
172
+ *
173
+ * Transform two strings before comparing in order to have as little difference between them as possible:
174
+ *
175
+ * * First, normalize (lower case, remove extraneous whitespace, remove punctuation, make all characters standard ANSI) strings and split into tokens
176
+ * * Second, reorder the tokens in the shorter list so that they mirror the order of the tokens in the longer list as closely as possible
177
+ * * Finally, concat back to strings and compare with sameness strategies
178
+ *
179
+ * */
170
180
export const compareNormalizedStrings = ( existing : string , candidate : string ) : StringSamenessResult => {
171
181
182
+ // there may be scenarios where a track differs in *ordering* of ancillary information between sources
183
+ // EX My Track (feat. Art1, Art2) -- My Track (feat. Art2 Art1)
184
+
185
+ // first convert to lower case, remove extraneous whitespace and punctuation, and replace non-ansi characters with ansi characters
172
186
const normalExisting = normalizeStr ( existing , { keepSingleWhitespace : true } ) ;
173
187
const normalCandidate = normalizeStr ( candidate , { keepSingleWhitespace : true } ) ;
174
188
175
- // there may be scenarios where a track differs in *ordering* of ancillary information between sources
176
- // EX My Track (feat. Art1, Art2) -- My Track (feat. Art2 Art1)
177
- // so instead of naively comparing the entire track string against the candidate we
178
- // * first try to match up all white-space separated tokens
179
- // * recombine with closest tokens in order
180
- // * then check sameness
189
+ // split by "token"
181
190
const eTokens = normalExisting . split ( ' ' ) ;
182
191
const cTokens = normalCandidate . split ( ' ' ) ;
183
192
184
- const orderedCandidateTokens = eTokens . reduce ( ( acc : { ordered : string [ ] , remaining : string [ ] } , curr ) => {
193
+
194
+ let longerTokens : string [ ] ,
195
+ shorterTokens : string [ ] ;
196
+
197
+ if ( eTokens . length > cTokens . length ) {
198
+ longerTokens = eTokens ;
199
+ shorterTokens = cTokens ;
200
+ } else {
201
+ longerTokens = cTokens ;
202
+ shorterTokens = eTokens ;
203
+ }
204
+
205
+ // we will use longest string (token list) as the reducer and order the shorter list to match it
206
+ // so we don't have to deal with undefined positions in the shorter list
207
+
208
+ const orderedCandidateTokens = longerTokens . reduce ( ( acc : { ordered : string [ ] , remaining : string [ ] } , curr ) => {
209
+ // if we've run out of tokens in the shorter list just return
210
+ if ( acc . remaining . length === 0 ) {
211
+ return acc ;
212
+ }
213
+
214
+ // on each iteration of tokens in the long list
215
+ // we iterate through remaining tokens from the shorter list and find the token with the most sameness
216
+
185
217
let highScore = 0 ;
186
- let highIndex = undefined ;
218
+ let highIndex = 0 ;
187
219
let index = 0 ;
188
220
for ( const token of acc . remaining ) {
189
221
const result = stringSameness ( curr , token ) ;
@@ -194,18 +226,28 @@ export const compareNormalizedStrings = (existing: string, candidate: string): S
194
226
index ++ ;
195
227
}
196
228
229
+ // then remove the most same token from the remaining short list tokens
197
230
const splicedRemaining = [ ...acc . remaining ] ;
198
231
splicedRemaining . splice ( highIndex , 1 ) ;
199
232
200
- return { ordered : acc . ordered . concat ( acc . remaining [ highIndex ] ) , remaining : splicedRemaining } ;
201
- } , { ordered : [ ] , remaining : cTokens } ) ;
202
-
203
- const allOrderedCandidateTokens = orderedCandidateTokens . ordered . concat ( orderedCandidateTokens . remaining ) ;
204
- const orderedCandidateString = allOrderedCandidateTokens . join ( ' ' ) ;
205
-
206
- // since we have already "matched" up words by order we don't want to use cosine strat
233
+ return {
234
+ // finally add the most same token to the ordered short list
235
+ ordered : acc . ordered . concat ( acc . remaining [ highIndex ] ) ,
236
+ // and return the remaining short list tokens
237
+ remaining : splicedRemaining
238
+ } ;
239
+ } , {
240
+ // "ordered" is the result of ordering tokens in the shorter list to match longer token order
241
+ ordered : [ ] ,
242
+ // remaining is the initial shorter list
243
+ remaining : shorterTokens
244
+ } ) ;
245
+
246
+ // since we have already "matched" up tokens by order we don't want to use cosine strat
207
247
// bc it only does comparisons between whole words in a sentence (instead of all letters in a string)
208
248
// which makes it inaccurate for small-n sentences and typos
209
-
210
- return stringSameness ( normalExisting , orderedCandidateString , { transforms : [ ] , strategies : [ levenStrategy , diceStrategy ] } ) ;
249
+ return stringSameness ( longerTokens . join ( ' ' ) , orderedCandidateTokens . ordered . join ( ' ' ) , {
250
+ transforms : [ ] ,
251
+ strategies : [ levenStrategy , diceStrategy ]
252
+ } )
211
253
}
0 commit comments