forked from hash-bang/compare-names
-
Notifications
You must be signed in to change notification settings - Fork 0
/
index.js
107 lines (92 loc) · 2.85 KB
/
index.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
var isArray = require('lodash.isarray');
var levenshtein = require('levenshtein-dist');
/**
 * Remove reference 'noise' from a string
 * Every run of non-alphanumeric characters is collapsed into a single space and the
 * lower-case articles 'the' / 'a' are dropped wherever they stand between two spaces.
 * Article matching is case-sensitive, so an upper-case initial such as 'A' is left alone.
 * @param string a The string to remove the noise from
 * @return string The input string with all noise removed
 */
function stripNoise(a) {
  return a
    .replace(/[^a-z0-9]+/gi, ' ') // Global flag: clean every run of noise, not only the first one
    .replace(/ (the|a)(?= )/g, ''); // Lookahead keeps the trailing space so back-to-back articles are all removed
}
/**
 * Fuzzily compare strings a and b
 * Identical inputs match immediately. Otherwise both strings are stripped of noise,
 * lower-cased and cut down to their first 255 characters before their levenshtein
 * distance is measured.
 * @param string a The first string to compare
 * @param string b The second string to compare
 * @param number tolerence The largest levenshtein distance (inclusive) that still counts as a match. When omitted the distance has to be strictly less than 10
 * @return boolean True if a ≈ b, otherwise false
 */
function fuzzyStringCompare(a, b, tolerence) {
  if (a == b) return true;

  var normA = stripNoise(a).toLowerCase().slice(0, 255);
  var normB = stripNoise(b).toLowerCase().slice(0, 255);
  var distance = levenshtein(normA, normB);

  // No tolerance supplied (undefined or null) - fall back to the built-in threshold
  if (tolerence == null) return distance < 10;

  // Compare explicitly so that a tolerance of 0 is honoured and a real boolean is always returned
  return distance <= tolerence;
}
/**
 * Break a single author string up into its individual name tokens
 * Tokens are separated by commas, periods or whitespace. Blank tokens and ordinal
 * suffixes (e.g. '1st', '23rd') are discarded.
 * @param string author The raw author string to split
 * @return array The remaining tokens in their original order (e.g. lastname, initial/name)
 */
function splitAuthor(author) {
  var separator = /\s*[,\.\s]\s*/;
  var ordinal = /^[0-9]+(st|nd|rd|th)$/;

  var tokens = author.split(separator);
  var parts = [];
  for (var i = 0; i < tokens.length; i++) {
    var token = tokens[i];
    if (!token) continue; // Strip out blanks
    if (ordinal.test(token)) continue; // Strip out descendant numerics (e.g. '1st', '23rd')
    parts.push(token);
  }
  return parts;
}
/**
 * Break a string listing several authors into one entry per author
 * Authors are separated by semicolons; whitespace directly around a semicolon is
 * swallowed together with it.
 * @param string str The string to split
 * @return array The array of extracted authors
 */
function splitAuthorString(str) {
  var separator = /\s*;\s*/;
  var authors = str.split(separator);
  return authors;
}
/**
 * Compare an array of authors against a second array
 * Either argument may also be a single string, which is split into authors first.
 * Authors are compared pairwise by position and only as many authors as the shorter
 * list contains are examined, so a truncated list can still match its complete counterpart.
 * Each pair is first compared as whole strings (direct or fuzzy); if that fails the
 * two authors are split into name tokens which are compared one by one.
 * @param array a The first array of authors
 * @param array b The second array of authors
 * @return boolean True if a ≈ b
 */
function compareNames(a, b) {
  // Work on local lists instead of overwriting the caller-supplied arguments
  var authorsA = isArray(a) ? a : splitAuthorString(a);
  var authorsB = isArray(b) ? b : splitAuthorString(b);
  var authorLimit = Math.min(authorsA.length, authorsB.length);

  for (var pos = 0; pos < authorLimit; pos++) {
    // Direct or fuzzy matching of entire strings
    if (fuzzyStringCompare(authorsA[pos], authorsB[pos])) continue;

    // Whole strings differ too much - fall back to comparing the individual name tokens
    var aAuth = splitAuthor(authorsA[pos]);
    var bAuth = splitAuthor(authorsB[pos]);
    var nameLimit = Math.min(aAuth.length, bAuth.length);
    for (var n = 0; n < nameLimit; n++) {
      // One mismatching token is enough to reject the whole comparison
      if (!namePartsMatch(aAuth[n], bAuth[n])) return false;
    }
    // All shared tokens agree - carry on with the next author rather than stopping here
  }

  return true;
}

/**
 * Decide whether two name tokens refer to the same name (private helper of compareNames)
 * @param string x A name token of the first author
 * @param string y The name token at the same position of the second author
 * @return boolean True if the tokens are equal, if an initial fits the other token's first letter, or if two full names are within a levenshtein tolerance of 3
 */
function namePartsMatch(x, y) {
  if (x === y) return true; // Direct match

  if (x.length === 1 || y.length === 1) {
    // At least one side is an initial - it has to equal the first letter of the other side (ignoring case)
    return x.charAt(0).toLowerCase() === y.charAt(0).toLowerCase();
  }

  // Both sides are full names - allow for small spelling differences
  return !!fuzzyStringCompare(x, y, 3);
}
// Expose compareNames as the module's sole export
module.exports = compareNames;