-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdlfa-236_go-version-of-v1-indexer-sanitize-gem-unsuitable-chars-regexp.go
62 lines (51 loc) · 3.76 KB
/
dlfa-236_go-version-of-v1-indexer-sanitize-gem-unsuitable-chars-regexp.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
// This is a toy script for testing the correctness of a Go port of the
// `REGEX_UNSUITABLE_CHARS` regular expression in the `Sanitize` gem used by the v1 indexer.
//
// For more details, see Jira ticket:
//
// "Determine if we need to port `Sanitize.preprocess` removal of `REGEX_UNSUITABLE_CHARS`"
// https://jira.nyu.edu/browse/DLFA-209
package main
import (
"fmt"
"regexp"
)
// Character classes ported from the Ruby Sanitize gem (v6.0.1). Ruby's
// `\uXXXX` / `\u{XXXXX}` escapes are spelled `\x{XXXX}` in Go regexp syntax.
// The trailing `+` quantifier that each Ruby pattern carries is deliberately
// left off these constants; it is appended when the combined expression is
// assembled in sanitizeRegexpString.
const (
	// regexpHTMLControlCharacters is the Go form of:
	//
	//	REGEX_HTML_CONTROL_CHARACTERS = /[\u0001-\u0008\u000b\u000e-\u001f\u007f-\u009f]+/u
	//
	// https://github.com/rgrove/sanitize/blob/v6.0.1/lib/sanitize.rb#L27
	regexpHTMLControlCharacters = `[\x{0001}-\x{0008}\x{000b}\x{000e}-\x{001f}\x{007f}-\x{009f}]`

	// regexpHTMLNonCharacters is the Go form of:
	//
	//	REGEX_HTML_NON_CHARACTERS = /[\ufdd0-\ufdef\ufffe\uffff\u{1fffe}\u{1ffff}\u{2fffe}\u{2ffff}\u{3fffe}\u{3ffff}\u{4fffe}\u{4ffff}\u{5fffe}\u{5ffff}\u{6fffe}\u{6ffff}\u{7fffe}\u{7ffff}\u{8fffe}\u{8ffff}\u{9fffe}\u{9ffff}\u{afffe}\u{affff}\u{bfffe}\u{bffff}\u{cfffe}\u{cffff}\u{dfffe}\u{dffff}\u{efffe}\u{effff}\u{ffffe}\u{fffff}\u{10fffe}\u{10ffff}]+/u
	//
	// https://github.com/rgrove/sanitize/blob/v6.0.1/lib/sanitize.rb#L34
	regexpHTMLNonCharacters = `[\x{fdd0}-\x{fdef}\x{fffe}\x{ffff}\x{1fffe}\x{1ffff}\x{2fffe}\x{2ffff}\x{3fffe}\x{3ffff}\x{4fffe}\x{4ffff}\x{5fffe}\x{5ffff}\x{6fffe}\x{6ffff}\x{7fffe}\x{7ffff}\x{8fffe}\x{8ffff}\x{9fffe}\x{9ffff}\x{afffe}\x{affff}\x{bfffe}\x{bffff}\x{cfffe}\x{cffff}\x{dfffe}\x{dffff}\x{efffe}\x{effff}\x{ffffe}\x{fffff}\x{10fffe}\x{10ffff}]`
)

var (
	// sanitizeRegexpString is the Go form of:
	//
	//	REGEX_UNSUITABLE_CHARS = /(?:#{REGEX_HTML_CONTROL_CHARACTERS}|#{REGEX_HTML_NON_CHARACTERS})/u
	//
	// Each alternative is one of the classes above followed by `+`, so a run
	// of consecutive characters from the same class is a single match.
	//
	// https://github.com/rgrove/sanitize/blob/v6.0.1/lib/sanitize.rb#L48
	sanitizeRegexpString = fmt.Sprintf(
		"(?:%s+|%s+)",
		regexpHTMLControlCharacters,
		regexpHTMLNonCharacters,
	)

	// sanitizeRegexp is sanitizeRegexpString compiled once at package
	// initialization; MustCompile panics if the pattern is invalid.
	sanitizeRegexp = regexp.MustCompile(sanitizeRegexpString)
)
// main runs each sample string through sanitizeRegexp, replacing every match
// with the empty string, and prints the input (labelled "A") next to the
// sanitized result (labelled "B") so the two can be compared by eye.
func main() {
	testStrings := []string{
		// The U+2013 En Dash character after "Civil Rights" should not be removed,
		// but the U+009D OSC characters in all of these <unittitle> tags should be removed.
		// NOTE(review): U+009D is invisible in most editors and is easily lost when
		// this file is copied through a web page or terminal -- confirm the
		// characters are still present in these literals, or respell them as
		// \u009d escapes.
		"<unittitle>Civil Rights – Monroe, North Carolina \"KISSING CASE\": David Fuzzy\" Simpson and Hanover Thompson, Mrs.Jennie Simpson, Mrs.Evelyn Thompson, Dr.A.E.Perry, Robert F.Williams, Mrs.Mary Ruth Reid Assault Case, Rev TH Harris</unittitle>",
		"<unittitle>\"MOVE\"</unittitle>",
		"<unittitle>\"Human Rights\"</unittitle>",
		"<unittitle>\"Movie Salt of the Earth Film of the Striking Miners of New Mexico\"</unittitle>",

		// "testing testing one two three" in Chinese, Amharic, and Greek
		// ...should not be changed in any way
		"測試 測試一二三",
		"አንድ ሁለት ሶስት መሞከር",
		"δοκιμή δοκιμής ένα δύο τρία",

		// Should not remove or change tags
		`Henry Draper: Memorial minute. By Geo. F. Barker, for the <emph render="italic">Proceedings of the American Philosophical Society</emph>.`,

		// This string contains every code point named in the v1 indexer regexp
		// (the endpoints of each range plus every single code point). It mirrors
		// this Ruby one-liner:
		// ruby -e 'print "#\u0001\u0008\u000b\u000e\u001f\u007f\u009f\ufdd0\ufdef\ufffe\uffff\u{1fffe}\u{1ffff}\u{2fffe}\u{2ffff}\u{3fffe}\u{3ffff}\u{4fffe}\u{4ffff}\u{5fffe}\u{5ffff}\u{6fffe}\u{6ffff}\u{7fffe}\u{7ffff}\u{8fffe}\u{8ffff}\u{9fffe}\u{9ffff}\u{afffe}\u{affff}\u{bfffe}\u{bffff}\u{cfffe}\u{cffff}\u{dfffe}\u{dffff}\u{efffe}\u{effff}\u{ffffe}\u{fffff}\u{10fffe}\u{10ffff}#"' | pbcopy
		// The code points are written as escapes rather than pasted literally
		// because they are invisible and do not reliably survive editors or
		// copy/paste. All of the characters between the beginning and end "#"
		// characters should be removed.
		"#\u0001\u0008\u000b\u000e\u001f\u007f\u009f\ufdd0\ufdef\ufffe\uffff" +
			"\U0001fffe\U0001ffff\U0002fffe\U0002ffff\U0003fffe\U0003ffff\U0004fffe\U0004ffff" +
			"\U0005fffe\U0005ffff\U0006fffe\U0006ffff\U0007fffe\U0007ffff\U0008fffe\U0008ffff" +
			"\U0009fffe\U0009ffff\U000afffe\U000affff\U000bfffe\U000bffff\U000cfffe\U000cffff" +
			"\U000dfffe\U000dffff\U000efffe\U000effff\U000ffffe\U000fffff\U0010fffe\U0010ffff#",
	}

	for _, testString := range testStrings {
		sanitizedString := sanitizeRegexp.ReplaceAllString(testString, "")
		// The doubled trailing newline leaves a blank line between reports.
		fmt.Printf("=======\nA: %s\n\nB: %s\n=======\n\n", testString, sanitizedString)
	}
}