// dlfa-236_go-version-of-v1-indexer-sanitize-gem-unsuitable-chars-regexp.go

// This is a toy script for testing the correctness of a Go port of the
// `REGEX_UNSUITABLE_CHARS` regular expression in the `Sanitize` gem used by the v1 indexer.
//
// For more details, see Jira ticket:
//
//	"Determine if we need to port `Sanitize.preprocess` removal of `REGEX_UNSUITABLE_CHARS`"
//	https://jira.nyu.edu/browse/DLFA-209
package main

import (
	"fmt"
	"regexp"
)

// Character classes ported from the Sanitize gem (v6.0.1). The Ruby originals
// end in a `+` quantifier; here the quantifier is left off each class and is
// appended when the combined pattern is assembled further down.
const (
	// regexpHTMLControlCharacters is the Go spelling of the character class in
	// REGEX_HTML_CONTROL_CHARACTERS:
	//
	//	/[\u0001-\u0008\u000b\u000e-\u001f\u007f-\u009f]+/u
	//
	// https://github.com/rgrove/sanitize/blob/v6.0.1/lib/sanitize.rb#L27
	regexpHTMLControlCharacters = `[\x{0001}-\x{0008}\x{000b}\x{000e}-\x{001f}\x{007f}-\x{009f}]`

	// regexpHTMLNonCharacters is the Go spelling of the character class in
	// REGEX_HTML_NON_CHARACTERS:
	//
	//	/[\ufdd0-\ufdef\ufffe\uffff\u{1fffe}\u{1ffff}\u{2fffe}\u{2ffff}\u{3fffe}\u{3ffff}\u{4fffe}\u{4ffff}\u{5fffe}\u{5ffff}\u{6fffe}\u{6ffff}\u{7fffe}\u{7ffff}\u{8fffe}\u{8ffff}\u{9fffe}\u{9ffff}\u{afffe}\u{affff}\u{bfffe}\u{bffff}\u{cfffe}\u{cffff}\u{dfffe}\u{dffff}\u{efffe}\u{effff}\u{ffffe}\u{fffff}\u{10fffe}\u{10ffff}]+/u
	//
	// https://github.com/rgrove/sanitize/blob/v6.0.1/lib/sanitize.rb#L34
	regexpHTMLNonCharacters = `[\x{fdd0}-\x{fdef}\x{fffe}\x{ffff}\x{1fffe}\x{1ffff}\x{2fffe}\x{2ffff}\x{3fffe}\x{3ffff}\x{4fffe}\x{4ffff}\x{5fffe}\x{5ffff}\x{6fffe}\x{6ffff}\x{7fffe}\x{7ffff}\x{8fffe}\x{8ffff}\x{9fffe}\x{9ffff}\x{afffe}\x{affff}\x{bfffe}\x{bffff}\x{cfffe}\x{cffff}\x{dfffe}\x{dffff}\x{efffe}\x{effff}\x{ffffe}\x{fffff}\x{10fffe}\x{10ffff}]`
)

var (
	// sanitizeRegexpString is the pattern source for the Go port of
	// REGEX_UNSUITABLE_CHARS:
	//
	//	/(?:#{REGEX_HTML_CONTROL_CHARACTERS}|#{REGEX_HTML_NON_CHARACTERS})/u
	//
	// Each alternative is one of the classes above followed by `+`.
	//
	// https://github.com/rgrove/sanitize/blob/v6.0.1/lib/sanitize.rb#L48
	sanitizeRegexpString = fmt.Sprintf(
		"(?:%s+|%s+)",
		regexpHTMLControlCharacters,
		regexpHTMLNonCharacters,
	)

	// sanitizeRegexp is sanitizeRegexpString compiled once at package
	// initialization; MustCompile panics if the pattern is invalid.
	sanitizeRegexp = regexp.MustCompile(sanitizeRegexpString)
)

// main runs each sample string through sanitizeRegexp, deleting every match,
// and prints the input (A) next to the sanitized result (B). Nothing is
// asserted programmatically: correctness is judged by reading the output.
func main() {
	testStrings := []string{
		// The U+2013 En Dash character after "Civil Rights" should not be removed,
		// but the U+009D OSC characters in all of these <unittitle> tags should be removed.
		// NOTE(review): U+009D is non-printing and is expected to be embedded
		// literally in these literals; editors hide it and some tools strip it.
		// Confirm it is still present, or switch to explicit \u009d escapes.
		"<unittitle>Civil Rights – Monroe, North Carolina \"KISSING CASE\": David Fuzzy\" Simpson and Hanover Thompson, Mrs.Jennie Simpson, Mrs.Evelyn Thompson, Dr.A.E.Perry, Robert F.Williams, Mrs.Mary Ruth Reid Assault Case, Rev TH Harris</unittitle>",
		"<unittitle>\"MOVE\"</unittitle>",
		"<unittitle>\"Human Rights\"</unittitle>",
		"<unittitle>\"Movie Salt of the Earth Film of the Striking Miners of New Mexico\"</unittitle>",

		// "testing testing one two three" in Chinese, Amharic, and Greek
		// ...should not be changed in any way
		"測試 測試一二三",
		"አንድ ሁለት ሶስት መሞከር",
		"δοκιμή δοκιμής ένα δύο τρία",

		// Should not remove or change tags
		`Henry Draper: Memorial minute. By Geo. F. Barker, for the <emph render="italic">Proceedings of the American Philosophical Society</emph>.`,

		// This string contains every code point named in the v1 indexer regexp
		// (the endpoints of each range plus every individually listed code
		// point). It is the same string produced by this one-liner:
		//     ruby -e 'print "#\u0001\u0008\u000b\u000e\u001f\u007f\u009f\ufdd0\ufdef\ufffe\uffff\u{1fffe}\u{1ffff}\u{2fffe}\u{2ffff}\u{3fffe}\u{3ffff}\u{4fffe}\u{4ffff}\u{5fffe}\u{5ffff}\u{6fffe}\u{6ffff}\u{7fffe}\u{7ffff}\u{8fffe}\u{8ffff}\u{9fffe}\u{9ffff}\u{afffe}\u{affff}\u{bfffe}\u{bffff}\u{cfffe}\u{cffff}\u{dfffe}\u{dffff}\u{efffe}\u{effff}\u{ffffe}\u{fffff}\u{10fffe}\u{10ffff}#"' | pbcopy
		// It is written with explicit escapes rather than pasted literally so
		// that the non-printing characters cannot be silently dropped by an
		// editor or other tooling.
		// All of the characters between the beginning and end "#" characters
		// should be removed.
		"#" +
			"\u0001\u0008\u000b\u000e\u001f\u007f\u009f" +
			"\ufdd0\ufdef\ufffe\uffff" +
			"\U0001fffe\U0001ffff\U0002fffe\U0002ffff\U0003fffe\U0003ffff\U0004fffe\U0004ffff" +
			"\U0005fffe\U0005ffff\U0006fffe\U0006ffff\U0007fffe\U0007ffff\U0008fffe\U0008ffff" +
			"\U0009fffe\U0009ffff\U000afffe\U000affff\U000bfffe\U000bffff\U000cfffe\U000cffff" +
			"\U000dfffe\U000dffff\U000efffe\U000effff\U000ffffe\U000fffff\U0010fffe\U0010ffff" +
			"#",
	}

	for _, testString := range testStrings {
		sanitizedString := sanitizeRegexp.ReplaceAllString(testString, "")

		// The extra trailing "\n" stands in for the newline that the previous
		// fmt.Println(fmt.Sprintf(...)) form appended, so the output is unchanged.
		fmt.Printf("=======\nA: %s\n\nB: %s\n=======\n\n", testString, sanitizedString)
	}
}