From 527d3b2d1c847ee3e36df277b79471dfe000bc93 Mon Sep 17 00:00:00 2001 From: auvred Date: Sat, 13 Sep 2025 11:08:05 +0300 Subject: [PATCH 1/4] migrate to regonaut --- builtin_regexp.go | 389 ++++++++++++---------------- compiler_expr.go | 5 +- go.mod | 2 +- go.sum | 6 +- parser/parser.go | 2 +- parser/parser_test.go | 13 - parser/regexp.go | 472 ---------------------------------- parser/regexp_test.go | 191 -------------- regexp.go | 574 +++--------------------------------------- regexp_test.go | 232 +++++++---------- string.go | 1 + string_ascii.go | 9 + string_imported.go | 8 + string_unicode.go | 4 + tc39_test.go | 34 --- 15 files changed, 323 insertions(+), 1619 deletions(-) delete mode 100644 parser/regexp.go delete mode 100644 parser/regexp_test.go diff --git a/builtin_regexp.go b/builtin_regexp.go index cdc0d9db2..18b19acc3 100644 --- a/builtin_regexp.go +++ b/builtin_regexp.go @@ -2,11 +2,11 @@ package goja import ( "fmt" - "github.com/dop251/goja/parser" - "regexp" "strings" "unicode/utf16" "unicode/utf8" + + "github.com/auvred/regonaut" ) func (r *Runtime) newRegexpObject(proto *Object) *regexpObject { @@ -31,26 +31,6 @@ func (r *Runtime) newRegExpp(pattern *regexpPattern, patternStr String, proto *O return o } -func decodeHex(s string) (int, bool) { - var hex int - for i := 0; i < len(s); i++ { - var n byte - chr := s[i] - switch { - case '0' <= chr && chr <= '9': - n = chr - '0' - case 'a' <= chr && chr <= 'f': - n = chr - 'a' + 10 - case 'A' <= chr && chr <= 'F': - n = chr - 'A' + 10 - default: - return 0, false - } - hex = hex*16 + int(n) - } - return hex, true -} - func writeHex4(b *strings.Builder, i int) { b.WriteByte(hex[i>>12]) b.WriteByte(hex[(i>>8)&0xF]) @@ -58,71 +38,6 @@ func writeHex4(b *strings.Builder, i int) { b.WriteByte(hex[i&0xF]) } -// Convert any valid surrogate pairs in the form of \uXXXX\uXXXX to unicode characters -func convertRegexpToUnicode(patternStr string) string { - var sb strings.Builder - pos := 0 - for i := 0; i < len(patternStr)-11; { - r, size := utf8.DecodeRuneInString(patternStr[i:]) - if r == '\\' { - i++ - if patternStr[i] == 'u' && patternStr[i+5] == '\\' && patternStr[i+6] == 'u' { - if first, ok := decodeHex(patternStr[i+1 : i+5]); ok { - if isUTF16FirstSurrogate(uint16(first)) { - if second, ok := decodeHex(patternStr[i+7 : i+11]); ok { - if isUTF16SecondSurrogate(uint16(second)) { - r = utf16.DecodeRune(rune(first), rune(second)) - sb.WriteString(patternStr[pos : i-1]) - sb.WriteRune(r) - i += 11 - pos = i - continue - } - } - } - } - } - i++ - } else { - i += size - } - } - if pos > 0 { - sb.WriteString(patternStr[pos:]) - return sb.String() - } - return patternStr -} - -// Convert any extended unicode characters to UTF-16 in the form of \uXXXX\uXXXX -func convertRegexpToUtf16(patternStr string) string { - var sb strings.Builder - pos := 0 - var prevRune rune - for i := 0; i < len(patternStr); { - r, size := utf8.DecodeRuneInString(patternStr[i:]) - if r > 0xFFFF { - sb.WriteString(patternStr[pos:i]) - if prevRune == '\\' { - sb.WriteRune('\\') - } - first, second := utf16.EncodeRune(r) - sb.WriteString(`\u`) - writeHex4(&sb, int(first)) - sb.WriteString(`\u`) - writeHex4(&sb, int(second)) - pos = i + size - } - i += size - prevRune = r - } - if pos > 0 { - sb.WriteString(patternStr[pos:]) - return sb.String() - } - return patternStr -} - // convert any broken UTF-16 surrogate pairs to \uXXXX func escapeInvalidUtf16(s String) string { if imported, ok := s.(*importedString); ok { @@ -178,14 +93,12 @@ func escapeInvalidUtf16(s String) string { return s.String() } -func compileRegexpFromValueString(patternStr String, flags string) (*regexpPattern, error) { - return compileRegexp(escapeInvalidUtf16(patternStr), flags) -} +func compileRegexp(patternStr String, flags string) (p *regexpPattern, err error) { + patternUtf16 := patternStr.toUnicode() + + var global, ignoreCase, multiline, dotAll, sticky, unicode, unicodeSets bool -func compileRegexp(patternStr, flags string) (p *regexpPattern, err error) { - var global, ignoreCase, multiline, dotAll, sticky, unicode bool - var wrapper *regexpWrapper - var wrapper2 *regexp2Wrapper + reFlags := regonaut.FlagAnnexB if flags != "" { invalidFlags := func() { @@ -204,30 +117,41 @@ func compileRegexp(patternStr, flags string) (p *regexpPattern, err error) { invalidFlags() return } + reFlags |= regonaut.FlagMultiline multiline = true case 's': if dotAll { invalidFlags() return } + reFlags |= regonaut.FlagDotAll dotAll = true case 'i': if ignoreCase { invalidFlags() return } + reFlags |= regonaut.FlagIgnoreCase ignoreCase = true case 'y': if sticky { invalidFlags() return } + reFlags |= regonaut.FlagSticky sticky = true case 'u': if unicode { invalidFlags() } + reFlags |= regonaut.FlagUnicode unicode = true + case 'v': + if unicodeSets { + invalidFlags() + } + reFlags |= regonaut.FlagUnicodeSets + unicodeSets = true default: invalidFlags() return @@ -235,62 +159,28 @@ func compileRegexp(patternStr, flags string) (p *regexpPattern, err error) { } } - if unicode { - patternStr = convertRegexpToUnicode(patternStr) - } else { - patternStr = convertRegexpToUtf16(patternStr) - } - - re2Str, err1 := parser.TransformRegExp(patternStr, dotAll, unicode) - if err1 == nil { - re2flags := "" - if multiline { - re2flags += "m" - } - if dotAll { - re2flags += "s" - } - if ignoreCase { - re2flags += "i" - } - if len(re2flags) > 0 { - re2Str = fmt.Sprintf("(?%s:%s)", re2flags, re2Str) - } - - pattern, err1 := regexp.Compile(re2Str) - if err1 != nil { - err = fmt.Errorf("Invalid regular expression (re2): %s (%v)", re2Str, err1) - return - } - wrapper = (*regexpWrapper)(pattern) - } else { - if _, incompat := err1.(parser.RegexpErrorIncompatible); !incompat { - err = err1 - return - } - wrapper2, err = compileRegexp2(patternStr, multiline, dotAll, ignoreCase, unicode) - if err != nil { - err = fmt.Errorf("Invalid regular expression (regexp2): %s (%v)", patternStr, err) - return - } + var re *regonaut.RegExpUtf16 + re, err = regonaut.CompileUtf16(patternUtf16[1:], reFlags) + if err != nil { + return } p = ®expPattern{ - src: patternStr, - regexpWrapper: wrapper, - regexp2Wrapper: wrapper2, - global: global, - ignoreCase: ignoreCase, - multiline: multiline, - dotAll: dotAll, - sticky: sticky, - unicode: unicode, + src: patternUtf16, + re: re, + global: global, + ignoreCase: ignoreCase, + multiline: multiline, + dotAll: dotAll, + sticky: sticky, + unicode: unicode, + unicodeSets: unicodeSets, } return } func (r *Runtime) _newRegExp(patternStr String, flags string, proto *Object) *regexpObject { - pattern, err := compileRegexpFromValueString(patternStr, flags) + pattern, err := compileRegexp(patternStr, flags) if err != nil { panic(r.newSyntaxError(err.Error(), -1)) } @@ -388,7 +278,7 @@ func (r *Runtime) regexpproto_compile(call FunctionCall) Value { if flagsVal != _undefined { flags = flagsVal.toString().String() } - pattern, err = compileRegexpFromValueString(source, flags) + pattern, err = compileRegexp(source, flags) if err != nil { panic(r.newSyntaxError(err.Error(), -1)) } @@ -447,6 +337,9 @@ func (r *Runtime) regexpproto_toString(call FunctionCall) Value { if this.pattern.unicode { sb.WriteRune('u') } + if this.pattern.unicodeSets { + sb.WriteRune('v') + } if this.pattern.sticky { sb.WriteRune('y') } @@ -593,6 +486,20 @@ func (r *Runtime) regexpproto_getUnicode(call FunctionCall) Value { } } +func (r *Runtime) regexpproto_getUnicodeSets(call FunctionCall) Value { + if this, ok := r.toObject(call.This).self.(*regexpObject); ok { + if this.pattern.unicodeSets { + return valueTrue + } else { + return valueFalse + } + } else if call.This == r.global.RegExpPrototype { + return _undefined + } else { + panic(r.NewTypeError("Method RegExp.prototype.unicodeSets getter called on incompatible receiver %s", r.objectproto_toString(FunctionCall{This: call.This}))) + } +} + func (r *Runtime) regexpproto_getSticky(call FunctionCall) Value { if this, ok := r.toObject(call.This).self.(*regexpObject); ok { if this.pattern.sticky { @@ -608,7 +515,7 @@ func (r *Runtime) regexpproto_getSticky(call FunctionCall) Value { } func (r *Runtime) regexpproto_getFlags(call FunctionCall) Value { - var global, ignoreCase, multiline, dotAll, sticky, unicode bool + var global, ignoreCase, multiline, dotAll, sticky, unicode, unicodeSets bool thisObj := r.toObject(call.This) size := 0 @@ -648,6 +555,12 @@ func (r *Runtime) regexpproto_getFlags(call FunctionCall) Value { size++ } } + if v := thisObj.self.getStr("unicodeSets", nil); v != nil { + unicodeSets = v.ToBoolean() + if unicodeSets { + size++ + } + } var sb strings.Builder sb.Grow(size) @@ -666,6 +579,9 @@ func (r *Runtime) regexpproto_getFlags(call FunctionCall) Value { if unicode { sb.WriteByte('u') } + if unicodeSets { + sb.WriteByte('v') + } if sticky { sb.WriteByte('y') } @@ -762,16 +678,24 @@ func (r *Runtime) regexpproto_stdMatcher(call FunctionCall) Value { return r.regexpproto_stdMatcherGeneric(thisObj, s) } if rx.pattern.global { - res := rx.pattern.findAllSubmatchIndex(s, 0, -1, rx.pattern.sticky) - if len(res) == 0 { - rx.setOwnStr("lastIndex", intToValue(0), true) - return _null + sUtf16 := s.toUnicode() + var a []Value + rx.setOwnStr("lastIndex", valueInt(0), true) + for { + match := rx.execRegexp(rx.pattern, sUtf16, false) + if match == nil { + break + } + a = append(a, regexpGroupToValue(sUtf16, match.Groups[0])) + if match.Groups[0].Start == match.Groups[0].End { + thisIndex := toLength(rx.getStr("lastIndex", nil)) + rx.setOwnStr("lastIndex", valueInt(advanceStringIndex64(s, thisIndex, rx.pattern.unicode || rx.pattern.unicodeSets)), true) + } } - a := make([]Value, 0, len(res)) - for _, result := range res { - a = append(a, s.Substring(result[0], result[1])) + + if len(a) == 0 { + return _null } - rx.setOwnStr("lastIndex", intToValue(int64(res[len(res)-1][1])), true) return r.newArrayValues(a) } else { return rx.exec(s) @@ -811,8 +735,8 @@ func (r *Runtime) regexpproto_stdMatcherAll(call FunctionCall) Value { matcher := r.toConstructor(c)([]Value{call.This, flags}, nil) matcher.self.setOwnStr("lastIndex", valueInt(toLength(thisObj.self.getStr("lastIndex", nil))), true) flagsStr := flags.String() - global := strings.Contains(flagsStr, "g") - fullUnicode := strings.Contains(flagsStr, "u") + global := strings.ContainsRune(flagsStr, 'g') + fullUnicode := strings.ContainsRune(flagsStr, 'u') || strings.ContainsRune(flagsStr, 'v') return r.createRegExpStringIterator(matcher, s, global, fullUnicode) } @@ -890,13 +814,13 @@ func (r *Runtime) regexpproto_stdSearch(call FunctionCall) Value { previousLastIndex := rx.getStr("lastIndex", nil) rx.setOwnStr("lastIndex", intToValue(0), true) - match, result := rx.execRegexp(s) + match := rx.execRegexp(rx.pattern, s, false) rx.setOwnStr("lastIndex", previousLastIndex, true) - if !match { + if match == nil { return intToValue(-1) } - return intToValue(int64(result[0])) + return intToValue(int64(match.Groups[0].Start)) } func (r *Runtime) regexpproto_stdSplitterGeneric(splitter *Object, s String, limit Value, unicodeMatching bool) Value { @@ -1011,90 +935,78 @@ func (r *Runtime) regexpproto_stdSplitter(call FunctionCall) Value { splitter = r.toConstructor(c)([]Value{rxObj, flags}, nil) search = r.checkStdRegexp(splitter) if search == nil { - return r.regexpproto_stdSplitterGeneric(splitter, s, limitValue, strings.Contains(flagsStr, "u")) + return r.regexpproto_stdSplitterGeneric(splitter, s, limitValue, strings.ContainsRune(flagsStr, 'u') || strings.ContainsRune(flagsStr, 'v')) } } pattern := search.pattern // toUint32() may recompile the pattern, but we still need to use the original - limit := -1 - if limitValue != _undefined { - limit = int(toUint32(limitValue)) + + var lim int64 + if limitValue == nil || limitValue == _undefined { + lim = maxInt - 1 + } else { + lim = int64(toUint32(limitValue)) } - if limit == 0 { + if lim == 0 { return r.newArrayValues(nil) } - targetLength := s.Length() - var valueArray []Value - lastIndex := 0 - found := 0 + size := s.Length() + var a []Value + + sUtf16 := s.toUnicode() + p := 0 + q := p - result := pattern.findAllSubmatchIndex(s, 0, -1, false) - if targetLength == 0 { - if result == nil { - valueArray = append(valueArray, s) + if size == 0 { + if search.execRegexp(pattern, s, true) == nil { + a = append(a, s) } goto RETURN } - for _, match := range result { - if match[0] == match[1] { - // FIXME Ugh, this is a hack - if match[0] == 0 || match[0] == targetLength { - continue - } - } - - if lastIndex != match[0] { - valueArray = append(valueArray, s.Substring(lastIndex, match[0])) - found++ - } else if lastIndex == match[0] { - if lastIndex != -1 { - valueArray = append(valueArray, stringEmpty) - found++ - } - } - - lastIndex = match[1] - if found == limit { - goto RETURN - } - - captureCount := len(match) / 2 - for index := 1; index < captureCount; index++ { - offset := index * 2 - var value Value - if match[offset] != -1 { - value = s.Substring(match[offset], match[offset+1]) + for q < size { + search.setOwnStr("lastIndex", intToValue(int64(q)), true) + z := search.execRegexp(pattern, s, true) + if z == nil { + q = advanceStringIndex(s, q, search.pattern.unicode || search.pattern.unicodeSets) + } else { + e := toLength(search.getStr("lastIndex", nil)) + if e == int64(p) { + q = advanceStringIndex(s, q, search.pattern.unicode || search.pattern.unicodeSets) } else { - value = _undefined - } - valueArray = append(valueArray, value) - found++ - if found == limit { - goto RETURN + a = append(a, s.Substring(p, q)) + if int64(len(a)) == lim { + return r.newArrayValues(a) + } + if e > int64(size) { + p = size + } else { + p = int(e) + } + numberOfCaptures := max(int64(len(z.Groups))-1, 0) + for i := int64(1); i <= numberOfCaptures; i++ { + a = append(a, regexpGroupToValue(sUtf16, z.Groups[i])) + if int64(len(a)) == lim { + return r.newArrayValues(a) + } + } + q = p } } } - - if found != limit { - if lastIndex != targetLength { - valueArray = append(valueArray, s.Substring(lastIndex, targetLength)) - } else { - valueArray = append(valueArray, stringEmpty) - } - } + a = append(a, s.Substring(p, size)) RETURN: - return r.newArrayValues(valueArray) + return r.newArrayValues(a) } func (r *Runtime) regexpproto_stdReplacerGeneric(rxObj *Object, s, replaceStr String, rcall func(FunctionCall) Value) Value { var results []Value flags := nilSafe(rxObj.self.getStr("flags", nil)).String() isGlobal := strings.ContainsRune(flags, 'g') - isUnicode := strings.ContainsRune(flags, 'u') + isUnicode := strings.ContainsRune(flags, 'u') || strings.ContainsRune(flags, 'v') if isGlobal { results = r.getGlobalRegexpMatches(rxObj, s, isUnicode) } else { @@ -1222,24 +1134,38 @@ func (r *Runtime) regexpproto_stdReplacer(call FunctionCall) Value { return r.regexpproto_stdReplacerGeneric(rxObj, s, replaceStr, rcall) } - var index int64 - find := 1 if rx.pattern.global { - find = -1 rx.setOwnStr("lastIndex", intToValue(0), true) - } else { - index = rx.getLastIndex() } - found := rx.pattern.findAllSubmatchIndex(s, toIntStrict(index), find, rx.pattern.sticky) - if len(found) > 0 { - if !rx.updateLastIndex(index, found[0], found[len(found)-1]) { - found = nil + sUtf16 := s.toUnicode() + a := [][]int{} + for { + match := rx.execRegexp(rx.pattern, sUtf16, false) + if match == nil { + break + } + result := make([]int, len(match.Groups)<<1) + for i, group := range match.Groups { + if group.Start == -1 { + result[i*2] = -1 + result[i*2+1] = 0 + } else { + result[i*2] = group.Start + result[i*2+1] = group.End + } + } + a = append(a, result) + if !rx.pattern.global { + break + } + + if match.Groups[0].Start == match.Groups[0].End { + thisIndex := toLength(rx.getStr("lastIndex", nil)) + rx.setOwnStr("lastIndex", valueInt(advanceStringIndex64(s, thisIndex, rx.pattern.unicode || rx.pattern.unicodeSets)), true) } - } else { - rx.updateLastIndex(index, nil, nil) - } - return stringReplace(s, found, replaceStr, rcall) + } + return stringReplace(s, a, replaceStr, rcall) } func (r *Runtime) regExpStringIteratorProto_next(call FunctionCall) Value { @@ -1326,6 +1252,11 @@ func (r *Runtime) getRegExpPrototype() *Object { getterFunc: r.newNativeFunc(r.regexpproto_getUnicode, "get unicode", 0), accessor: true, }, false) + o.setOwnStr("unicodeSets", &valueProperty{ + configurable: true, + getterFunc: r.newNativeFunc(r.regexpproto_getUnicodeSets, "get unicodeSets", 0), + accessor: true, + }, false) o.setOwnStr("sticky", &valueProperty{ configurable: true, getterFunc: r.newNativeFunc(r.regexpproto_getSticky, "get sticky", 0), @@ -1342,7 +1273,7 @@ func (r *Runtime) getRegExpPrototype() *Object { o._putSym(SymSearch, valueProp(r.newNativeFunc(r.regexpproto_stdSearch, "[Symbol.search]", 1), true, false, true)) o._putSym(SymSplit, valueProp(r.newNativeFunc(r.regexpproto_stdSplitter, "[Symbol.split]", 2), true, false, true)) o._putSym(SymReplace, valueProp(r.newNativeFunc(r.regexpproto_stdReplacer, "[Symbol.replace]", 2), true, false, true)) - o.guard("exec", "global", "multiline", "ignoreCase", "unicode", "sticky") + o.guard("exec", "global", "multiline", "ignoreCase", "unicode", "unicodeSets", "sticky") } return ret } diff --git a/compiler_expr.go b/compiler_expr.go index 3f537415d..07054cd30 100644 --- a/compiler_expr.go +++ b/compiler_expr.go @@ -3068,12 +3068,13 @@ func (c *compiler) compileArrayLiteral(v *ast.ArrayLiteral) compiledExpr { func (e *compiledRegexpLiteral) emitGetter(putOnStack bool) { if putOnStack { - pattern, err := compileRegexp(e.expr.Pattern, e.expr.Flags) + src := newStringValue(e.expr.Pattern) + pattern, err := compileRegexp(src, e.expr.Flags) if err != nil { e.c.throwSyntaxError(e.offset, err.Error()) } - e.c.emit(&newRegexp{pattern: pattern, src: newStringValue(e.expr.Pattern)}) + e.c.emit(&newRegexp{pattern: pattern, src: src}) } } diff --git a/go.mod b/go.mod index 103a3f125..1ca888461 100644 --- a/go.mod +++ b/go.mod @@ -4,7 +4,7 @@ go 1.20 require ( github.com/Masterminds/semver/v3 v3.2.1 - github.com/dlclark/regexp2 v1.11.4 + github.com/auvred/regonaut v0.0.1 github.com/dop251/goja_nodejs v0.0.0-20211022123610-8dd9abb0616d github.com/go-sourcemap/sourcemap v2.1.3+incompatible github.com/google/pprof v0.0.0-20230207041349-798e818bf904 diff --git a/go.sum b/go.sum index 14a87f46d..60e2ed997 100644 --- a/go.sum +++ b/go.sum @@ -1,15 +1,16 @@ github.com/Masterminds/semver/v3 v3.2.1 h1:RN9w6+7QoMeJVGyfmbcgs28Br8cvmnucEXnY0rYXWg0= github.com/Masterminds/semver/v3 v3.2.1/go.mod h1:qvl/7zhW3nngYb5+80sSMF+FG2BjYrf8m9wsX0PNOMQ= +github.com/auvred/regonaut v0.0.1 h1:308+62qAZlIJ9Uq8R9KeLscLFBu/gENl1ZlgdlwWM1E= +github.com/auvred/regonaut v0.0.1/go.mod h1:XetEYtndfNYNqp0i+DZKtY37RdUp6L6kq4rFx1f+lic= github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= github.com/dlclark/regexp2 v1.4.1-0.20201116162257-a2a8dda75c91/go.mod h1:2pZnwuY/m+8K6iRw6wQdMtk+rH5tNGR1i55kozfMjCc= -github.com/dlclark/regexp2 v1.11.4 h1:rPYF9/LECdNymJufQKmri9gV604RvvABwgOA8un7yAo= -github.com/dlclark/regexp2 v1.11.4/go.mod h1:DHkYz0B9wPfa6wondMfaivmHpzrQ3v9q8cnmRbL6yW8= github.com/dop251/goja v0.0.0-20211022113120-dc8c55024d06/go.mod h1:R9ET47fwRVRPZnOGvHxxhuZcbrMCuiqOz3Rlrh4KSnk= github.com/dop251/goja_nodejs v0.0.0-20210225215109-d91c329300e7/go.mod h1:hn7BA7c8pLvoGndExHudxTDKZ84Pyvv+90pbBjbTz0Y= github.com/dop251/goja_nodejs v0.0.0-20211022123610-8dd9abb0616d h1:W1n4DvpzZGOISgp7wWNtraLcHtnmnTwBlJidqtMIuwQ= github.com/dop251/goja_nodejs v0.0.0-20211022123610-8dd9abb0616d/go.mod h1:DngW8aVqWbuLRMHItjPUyqdj+HWPvnQe8V8y1nDpIbM= github.com/go-sourcemap/sourcemap v2.1.3+incompatible h1:W1iEw64niKVGogNgBN3ePyLFfuisuzeidWPMPWmECqU= github.com/go-sourcemap/sourcemap v2.1.3+incompatible/go.mod h1:F8jJfvm2KbVjc5NqelyYJmf/v5J0dwNLS2mL4sNA1Jg= +github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= github.com/google/pprof v0.0.0-20230207041349-798e818bf904 h1:4/hN5RUoecvl+RmJRE2YxKWtnnQls6rQjjW5oV7qg2U= github.com/google/pprof v0.0.0-20230207041349-798e818bf904/go.mod h1:uglQLonpP8qtYCYyzA+8c/9qtqgA3qsXGYqCPKARAFg= github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo= @@ -33,3 +34,4 @@ gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EV gopkg.in/errgo.v2 v2.1.0/go.mod h1:hNsd1EY+bozCKY1Ytp96fpM3vjJbqLJn88ws8XvfDNI= gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY= gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ= +gotest.tools/v3 v3.5.2 h1:7koQfIKdy+I8UTetycgUqXWSDwpgv193Ka+qRsmBY8Q= diff --git a/parser/parser.go b/parser/parser.go index 24b380249..0ed63fbe3 100644 --- a/parser/parser.go +++ b/parser/parser.go @@ -48,7 +48,7 @@ import ( type Mode uint const ( - IgnoreRegExpErrors Mode = 1 << iota // Ignore RegExp compatibility errors (allow backtracking) + WhitespaceChars = " \f\n\r\t\v\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000\ufeff" ) type options struct { diff --git a/parser/parser_test.go b/parser/parser_test.go index e09a7d6cc..9f02123ce 100644 --- a/parser/parser_test.go +++ b/parser/parser_test.go @@ -43,19 +43,6 @@ func testParse(src string) (parser *_parser, program *ast.Program, err error) { return } -func TestParseFile(t *testing.T) { - tt(t, func() { - _, err := ParseFile(nil, "", `/abc/`, 0) - is(err, nil) - - _, err = ParseFile(nil, "", `/(?!def)abc/`, IgnoreRegExpErrors) - is(err, nil) - - _, err = ParseFile(nil, "", `/(?!def)abc/; return`, IgnoreRegExpErrors) - is(err, "(anonymous): Line 1:15 Illegal return statement") - }) -} - func TestParseFunction(t *testing.T) { tt(t, func() { test := func(prm, bdy string, expect interface{}) *ast.FunctionLiteral { diff --git a/parser/regexp.go b/parser/regexp.go deleted file mode 100644 index f455d0da2..000000000 --- a/parser/regexp.go +++ /dev/null @@ -1,472 +0,0 @@ -package parser - -import ( - "fmt" - "strconv" - "strings" - "unicode/utf8" -) - -const ( - WhitespaceChars = " \f\n\r\t\v\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000\ufeff" - Re2Dot = "[^\r\n\u2028\u2029]" -) - -type regexpParseError struct { - offset int - err string -} - -type RegexpErrorIncompatible struct { - regexpParseError -} -type RegexpSyntaxError struct { - regexpParseError -} - -func (s regexpParseError) Error() string { - return s.err -} - -type _RegExp_parser struct { - str string - length int - - chr rune // The current character - chrOffset int // The offset of current character - offset int // The offset after current character (may be greater than 1) - - err error - - goRegexp strings.Builder - passOffset int - - dotAll bool // Enable dotAll mode - unicode bool -} - -// TransformRegExp transforms a JavaScript pattern into a Go "regexp" pattern. -// -// re2 (Go) cannot do backtracking, so the presence of a lookahead (?=) (?!) or -// backreference (\1, \2, ...) will cause an error. -// -// re2 (Go) has a different definition for \s: [\t\n\f\r ]. -// The JavaScript definition, on the other hand, also includes \v, Unicode "Separator, Space", etc. -// -// If the pattern is valid, but incompatible (contains a lookahead or backreference), -// then this function returns an empty string an error of type RegexpErrorIncompatible. -// -// If the pattern is invalid (not valid even in JavaScript), then this function -// returns an empty string and a generic error. -func TransformRegExp(pattern string, dotAll, unicode bool) (transformed string, err error) { - - if pattern == "" { - return "", nil - } - - parser := _RegExp_parser{ - str: pattern, - length: len(pattern), - dotAll: dotAll, - unicode: unicode, - } - err = parser.parse() - if err != nil { - return "", err - } - - return parser.ResultString(), nil -} - -func (self *_RegExp_parser) ResultString() string { - if self.passOffset != -1 { - return self.str[:self.passOffset] - } - return self.goRegexp.String() -} - -func (self *_RegExp_parser) parse() (err error) { - self.read() // Pull in the first character - self.scan() - return self.err -} - -func (self *_RegExp_parser) read() { - if self.offset < self.length { - self.chrOffset = self.offset - chr, width := rune(self.str[self.offset]), 1 - if chr >= utf8.RuneSelf { // !ASCII - chr, width = utf8.DecodeRuneInString(self.str[self.offset:]) - if chr == utf8.RuneError && width == 1 { - self.error(true, "Invalid UTF-8 character") - return - } - } - self.offset += width - self.chr = chr - } else { - self.chrOffset = self.length - self.chr = -1 // EOF - } -} - -func (self *_RegExp_parser) stopPassing() { - self.goRegexp.Grow(3 * len(self.str) / 2) - self.goRegexp.WriteString(self.str[:self.passOffset]) - self.passOffset = -1 -} - -func (self *_RegExp_parser) write(p []byte) { - if self.passOffset != -1 { - self.stopPassing() - } - self.goRegexp.Write(p) -} - -func (self *_RegExp_parser) writeByte(b byte) { - if self.passOffset != -1 { - self.stopPassing() - } - self.goRegexp.WriteByte(b) -} - -func (self *_RegExp_parser) writeString(s string) { - if self.passOffset != -1 { - self.stopPassing() - } - self.goRegexp.WriteString(s) -} - -func (self *_RegExp_parser) scan() { - for self.chr != -1 { - switch self.chr { - case '\\': - self.read() - self.scanEscape(false) - case '(': - self.pass() - self.scanGroup() - case '[': - self.scanBracket() - case ')': - self.error(true, "Unmatched ')'") - return - case '.': - if self.dotAll { - self.pass() - break - } - self.writeString(Re2Dot) - self.read() - default: - self.pass() - } - } -} - -// (...) -func (self *_RegExp_parser) scanGroup() { - str := self.str[self.chrOffset:] - if len(str) > 1 { // A possibility of (?= or (?! - if str[0] == '?' { - ch := str[1] - switch { - case ch == '=' || ch == '!': - self.error(false, "re2: Invalid (%s) ", self.str[self.chrOffset:self.chrOffset+2]) - return - case ch == '<': - self.error(false, "re2: Invalid (%s) ", self.str[self.chrOffset:self.chrOffset+2]) - return - case ch != ':': - self.error(true, "Invalid group") - return - } - } - } - for self.chr != -1 && self.chr != ')' { - switch self.chr { - case '\\': - self.read() - self.scanEscape(false) - case '(': - self.pass() - self.scanGroup() - case '[': - self.scanBracket() - case '.': - if self.dotAll { - self.pass() - break - } - self.writeString(Re2Dot) - self.read() - default: - self.pass() - continue - } - } - if self.chr != ')' { - self.error(true, "Unterminated group") - return - } - self.pass() -} - -// [...] -func (self *_RegExp_parser) scanBracket() { - str := self.str[self.chrOffset:] - if strings.HasPrefix(str, "[]") { - // [] -- Empty character class - self.writeString("[^\u0000-\U0001FFFF]") - self.offset += 1 - self.read() - return - } - - if strings.HasPrefix(str, "[^]") { - self.writeString("[\u0000-\U0001FFFF]") - self.offset += 2 - self.read() - return - } - - self.pass() - for self.chr != -1 { - if self.chr == ']' { - break - } else if self.chr == '\\' { - self.read() - self.scanEscape(true) - continue - } - self.pass() - } - if self.chr != ']' { - self.error(true, "Unterminated character class") - return - } - self.pass() -} - -// \... -func (self *_RegExp_parser) scanEscape(inClass bool) { - offset := self.chrOffset - - var length, base uint32 - switch self.chr { - - case '0', '1', '2', '3', '4', '5', '6', '7': - var value int64 - size := 0 - for { - digit := int64(digitValue(self.chr)) - if digit >= 8 { - // Not a valid digit - break - } - value = value*8 + digit - self.read() - size += 1 - } - if size == 1 { // The number of characters read - if value != 0 { - // An invalid backreference - self.error(false, "re2: Invalid \\%d ", value) - return - } - self.passString(offset-1, self.chrOffset) - return - } - tmp := []byte{'\\', 'x', '0', 0} - if value >= 16 { - tmp = tmp[0:2] - } else { - tmp = tmp[0:3] - } - tmp = strconv.AppendInt(tmp, value, 16) - self.write(tmp) - return - - case '8', '9': - self.read() - self.error(false, "re2: Invalid \\%s ", self.str[offset:self.chrOffset]) - return - - case 'x': - self.read() - length, base = 2, 16 - - case 'u': - self.read() - if self.chr == '{' && self.unicode { - self.read() - length, base = 0, 16 - } else { - length, base = 4, 16 - } - - case 'b': - if inClass { - self.write([]byte{'\\', 'x', '0', '8'}) - self.read() - return - } - fallthrough - - case 'B': - fallthrough - - case 'd', 'D', 'w', 'W': - // This is slightly broken, because ECMAScript - // includes \v in \s, \S, while re2 does not - fallthrough - - case '\\': - fallthrough - - case 'f', 'n', 'r', 't', 'v': - self.passString(offset-1, self.offset) - self.read() - return - - case 'c': - self.read() - var value int64 - if 'a' <= self.chr && self.chr <= 'z' { - value = int64(self.chr - 'a' + 1) - } else if 'A' <= self.chr && self.chr <= 'Z' { - value = int64(self.chr - 'A' + 1) - } else { - self.writeByte('c') - return - } - tmp := []byte{'\\', 'x', '0', 0} - if value >= 16 { - tmp = tmp[0:2] - } else { - tmp = tmp[0:3] - } - tmp = strconv.AppendInt(tmp, value, 16) - self.write(tmp) - self.read() - return - case 's': - if inClass { - self.writeString(WhitespaceChars) - } else { - self.writeString("[" + WhitespaceChars + "]") - } - self.read() - return - case 'S': - if inClass { - self.error(false, "S in class") - return - } else { - self.writeString("[^" + WhitespaceChars + "]") - } - self.read() - return - default: - // $ is an identifier character, so we have to have - // a special case for it here - if self.chr == '$' || self.chr < utf8.RuneSelf && !isIdentifierPart(self.chr) { - // A non-identifier character needs escaping - self.passString(offset-1, self.offset) - self.read() - return - } - // Unescape the character for re2 - self.pass() - return - } - - // Otherwise, we're a \u.... or \x... - valueOffset := self.chrOffset - - if length > 0 { - for length := length; length > 0; length-- { - digit := uint32(digitValue(self.chr)) - if digit >= base { - // Not a valid digit - goto skip - } - self.read() - } - } else { - for self.chr != '}' && self.chr != -1 { - digit := uint32(digitValue(self.chr)) - if digit >= base { - // Not a valid digit - self.error(true, "Invalid Unicode escape") - return - } - self.read() - } - } - - if length == 4 || length == 0 { - self.write([]byte{ - '\\', - 'x', - '{', - }) - self.passString(valueOffset, self.chrOffset) - if length != 0 { - self.writeByte('}') - } - } else if length == 2 { - self.passString(offset-1, valueOffset+2) - } else { - // Should never, ever get here... - self.error(true, "re2: Illegal branch in scanEscape") - return - } - - return - -skip: - self.passString(offset, self.chrOffset) -} - -func (self *_RegExp_parser) pass() { - if self.passOffset == self.chrOffset { - self.passOffset = self.offset - } else { - if self.passOffset != -1 { - self.stopPassing() - } - if self.chr != -1 { - self.goRegexp.WriteRune(self.chr) - } - } - self.read() -} - -func (self *_RegExp_parser) passString(start, end int) { - if self.passOffset == start { - self.passOffset = end - return - } - if self.passOffset != -1 { - self.stopPassing() - } - self.goRegexp.WriteString(self.str[start:end]) -} - -func (self *_RegExp_parser) error(fatal bool, msg string, msgValues ...interface{}) { - if self.err != nil { - return - } - e := regexpParseError{ - offset: self.offset, - err: fmt.Sprintf(msg, msgValues...), - } - if fatal { - self.err = RegexpSyntaxError{e} - } else { - self.err = RegexpErrorIncompatible{e} - } - self.offset = self.length - self.chr = -1 -} diff --git a/parser/regexp_test.go b/parser/regexp_test.go deleted file mode 100644 index 3be77a39a..000000000 --- a/parser/regexp_test.go +++ /dev/null @@ -1,191 +0,0 @@ -package parser - -import ( - "regexp" - "testing" -) - -func TestRegExp(t *testing.T) { - tt(t, func() { - { - // err - test := func(input string, expect interface{}) { - _, err := TransformRegExp(input, false, false) - _, incompat := err.(RegexpErrorIncompatible) - is(incompat, false) - is(err, expect) - } - - test("[", "Unterminated character class") - - test("(", "Unterminated group") - - test("\\(?=)", "Unmatched ')'") - - test(")", "Unmatched ')'") - test("0:(?)", "Invalid group") - test("(?)", "Invalid group") - test("(?U)", "Invalid group") - test("(?)|(?i)", "Invalid group") - test("(?P)(?P)(?P)", "Invalid group") - } - - { - // incompatible - test := func(input string, expectErr interface{}) { - _, err := TransformRegExp(input, false, false) - _, incompat := err.(RegexpErrorIncompatible) - is(incompat, true) - is(err, expectErr) - } - - test(`<%([\s\S]+?)%>`, "S in class") - - test("(?<=y)x", "re2: Invalid (?<) ") - - test(`(?!test)`, "re2: Invalid (?!) ") - - test(`\1`, "re2: Invalid \\1 ") - - test(`\8`, "re2: Invalid \\8 ") - - } - - { - // err - test := func(input string, expect string) { - result, err := TransformRegExp(input, false, false) - is(err, nil) - _, incompat := err.(RegexpErrorIncompatible) - is(incompat, false) - is(result, expect) - _, err = regexp.Compile(result) - is(err, nil) - } - - test("", "") - - test("abc", "abc") - - test(`\abc`, `abc`) - - test(`\a\b\c`, `a\bc`) - - test(`\x`, `x`) - - test(`\c`, `c`) - - test(`\cA`, `\x01`) - - test(`\cz`, `\x1a`) - - test(`\ca`, `\x01`) - - test(`\cj`, `\x0a`) - - test(`\ck`, `\x0b`) - - test(`\+`, `\+`) - - test(`[\b]`, `[\x08]`) - - test(`\u0z01\x\undefined`, `u0z01xundefined`) - - test(`\\|'|\r|\n|\t|\u2028|\u2029`, `\\|'|\r|\n|\t|\x{2028}|\x{2029}`) - - test("]", "]") - - test("}", "}") - - test("%", "%") - - test("(%)", "(%)") - - test("(?:[%\\s])", "(?:[%"+WhitespaceChars+"])") - - test("[[]", "[[]") - - test("\\101", "\\x41") - - test("\\51", "\\x29") - - test("\\051", "\\x29") - - test("\\175", "\\x7d") - - test("\\0", "\\0") - - test("\\04", "\\x04") - - test(`(.)^`, "("+Re2Dot+")^") - - test(`\$`, `\$`) - - test(`[G-b]`, `[G-b]`) - - test(`[G-b\0]`, `[G-b\0]`) - - test(`\k`, `k`) - - test(`\x20`, `\x20`) - - test(`๐Ÿ˜Š`, `๐Ÿ˜Š`) - - test(`^.*`, `^`+Re2Dot+`*`) - - test(`(\n)`, `(\n)`) - - test(`(a(bc))`, `(a(bc))`) - - test(`[]`, "[^\u0000-\U0001FFFF]") - - test(`[^]`, "[\u0000-\U0001FFFF]") - - test(`\s+`, "["+WhitespaceChars+"]+") - - test(`\S+`, "[^"+WhitespaceChars+"]+") - - } - }) -} - -func TestTransformRegExp(t *testing.T) { - tt(t, func() { - pattern, err := TransformRegExp(`\s+abc\s+`, false, false) - is(err, nil) - is(pattern, `[`+WhitespaceChars+`]+abc[`+WhitespaceChars+`]+`) - is(regexp.MustCompile(pattern).MatchString("\t abc def"), true) - }) - tt(t, func() { - pattern, err := TransformRegExp(`\u{1d306}`, false, true) - is(err, nil) - is(pattern, `\x{1d306}`) - }) - tt(t, func() { - pattern, err := TransformRegExp(`\u1234`, false, false) - is(err, nil) - is(pattern, `\x{1234}`) - }) -} - -func BenchmarkTransformRegExp(b *testing.B) { - f := func(reStr string, b *testing.B) { - b.ResetTimer() - b.ReportAllocs() - for i := 0; i < b.N; i++ { - _, _ = TransformRegExp(reStr, false, false) - } - } - - b.Run("Re", func(b *testing.B) { - f(`^(([^<>()\[\]\\.,;:\s@"]+(\.[^<>()\[\]\\.,;:\s@"]+)*)|(".+"))@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}])|(([a-zA-Z\-0-9]+\.)+[a-zA-Z]{2,}))$`, b) - }) - - b.Run("Re2-1", func(b *testing.B) { - f(`(?=)^(([^<>()\[\]\\.,;:\s@"]+(\.[^<>()\[\]\\.,;:\s@"]+)*)|(".+"))@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}])|(([a-zA-Z\-0-9]+\.)+[a-zA-Z]{2,}))$`, b) - }) - - b.Run("Re2-1", func(b *testing.B) { - f(`^(([^<>()\[\]\\.,;:\s@"]+(\.[^<>()\[\]\\.,;:\s@"]+)*)|(".+"))@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}])|(([a-zA-Z\-0-9]+\.)+[a-zA-Z]{2,}))$(?=)`, b) - }) -} diff --git a/regexp.go b/regexp.go index f70c34d95..6f0ef13f0 100644 --- a/regexp.go +++ b/regexp.go @@ -1,197 +1,23 @@ package goja import ( - "fmt" - "github.com/dlclark/regexp2" + "github.com/auvred/regonaut" "github.com/dop251/goja/unistring" - "io" - "regexp" - "sort" - "strings" - "unicode/utf16" ) -type regexp2MatchCache struct { - target String - runes []rune - posMap []int -} - -// Not goroutine-safe. Use regexp2Wrapper.clone() -type regexp2Wrapper struct { - rx *regexp2.Regexp - cache *regexp2MatchCache -} - -type regexpWrapper regexp.Regexp - -type positionMapItem struct { - src, dst int -} -type positionMap []positionMapItem - -func (m positionMap) get(src int) int { - if src <= 0 { - return src - } - res := sort.Search(len(m), func(n int) bool { return m[n].src >= src }) - if res >= len(m) || m[res].src != src { - panic("index not found") - } - return m[res].dst -} - -type arrayRuneReader struct { - runes []rune - pos int -} - -func (rd *arrayRuneReader) ReadRune() (r rune, size int, err error) { - if rd.pos < len(rd.runes) { - r = rd.runes[rd.pos] - size = 1 - rd.pos++ - } else { - err = io.EOF - } - return -} - // Not goroutine-safe. Use regexpPattern.clone() type regexpPattern struct { - src string - - global, ignoreCase, multiline, dotAll, sticky, unicode bool - - regexpWrapper *regexpWrapper - regexp2Wrapper *regexp2Wrapper -} - -func compileRegexp2(src string, multiline, dotAll, ignoreCase, unicode bool) (*regexp2Wrapper, error) { - var opts regexp2.RegexOptions = regexp2.ECMAScript - if multiline { - opts |= regexp2.Multiline - } - if dotAll { - opts |= regexp2.Singleline - } - if ignoreCase { - opts |= regexp2.IgnoreCase - } - if unicode { - opts |= regexp2.Unicode - } - regexp2Pattern, err1 := regexp2.Compile(src, opts) - if err1 != nil { - return nil, fmt.Errorf("Invalid regular expression (regexp2): %s (%v)", src, err1) - } + src unicodeString - return ®exp2Wrapper{rx: regexp2Pattern}, nil -} + global, ignoreCase, multiline, dotAll, sticky, unicode, unicodeSets bool -func (p *regexpPattern) createRegexp2() { - if p.regexp2Wrapper != nil { - return - } - rx, err := compileRegexp2(p.src, p.multiline, p.dotAll, p.ignoreCase, p.unicode) - if err != nil { - // At this point the regexp should have been successfully converted to re2, if it fails now, it's a bug. - panic(err) - } - p.regexp2Wrapper = rx -} - -func buildUTF8PosMap(s unicodeString) (positionMap, string) { - pm := make(positionMap, 0, s.Length()) - rd := s.Reader() - sPos, utf8Pos := 0, 0 - var sb strings.Builder - for { - r, size, err := rd.ReadRune() - if err == io.EOF { - break - } - if err != nil { - // the string contains invalid UTF-16, bailing out - return nil, "" - } - utf8Size, _ := sb.WriteRune(r) - sPos += size - utf8Pos += utf8Size - pm = append(pm, positionMapItem{src: utf8Pos, dst: sPos}) - } - return pm, sb.String() -} - -func (p *regexpPattern) findSubmatchIndex(s String, start int) []int { - if p.regexpWrapper == nil { - return p.regexp2Wrapper.findSubmatchIndex(s, start, p.unicode, p.global || p.sticky) - } - if start != 0 { - // Unfortunately Go's regexp library does not allow starting from an arbitrary position. - // If we just drop the first _start_ characters of the string the assertions (^, $, \b and \B) will not - // work correctly. - p.createRegexp2() - return p.regexp2Wrapper.findSubmatchIndex(s, start, p.unicode, p.global || p.sticky) - } - return p.regexpWrapper.findSubmatchIndex(s, p.unicode) -} - -func (p *regexpPattern) findAllSubmatchIndex(s String, start int, limit int, sticky bool) [][]int { - if p.regexpWrapper == nil { - return p.regexp2Wrapper.findAllSubmatchIndex(s, start, limit, sticky, p.unicode) - } - if start == 0 { - a, u := devirtualizeString(s) - if u == nil { - return p.regexpWrapper.findAllSubmatchIndex(string(a), limit, sticky) - } - if limit == 1 { - result := p.regexpWrapper.findSubmatchIndexUnicode(u, p.unicode) - if result == nil { - return nil - } - return [][]int{result} - } - // Unfortunately Go's regexp library lacks FindAllReaderSubmatchIndex(), so we have to use a UTF-8 string as an - // input. - if p.unicode { - // Try to convert s to UTF-8. If it does not contain any invalid UTF-16 we can do the matching in UTF-8. - pm, str := buildUTF8PosMap(u) - if pm != nil { - res := p.regexpWrapper.findAllSubmatchIndex(str, limit, sticky) - for _, result := range res { - for i, idx := range result { - result[i] = pm.get(idx) - } - } - return res - } - } - } - - p.createRegexp2() - return p.regexp2Wrapper.findAllSubmatchIndex(s, start, limit, sticky, p.unicode) + re *regonaut.RegExpUtf16 } +// TODO: regonaut's RegExp is safe for concurrent use // clone creates a copy of the regexpPattern which can be used concurrently. func (p *regexpPattern) clone() *regexpPattern { - ret := ®expPattern{ - src: p.src, - global: p.global, - ignoreCase: p.ignoreCase, - multiline: p.multiline, - dotAll: p.dotAll, - sticky: p.sticky, - unicode: p.unicode, - } - if p.regexpWrapper != nil { - ret.regexpWrapper = p.regexpWrapper.clone() - } - if p.regexp2Wrapper != nil { - ret.regexp2Wrapper = p.regexp2Wrapper.clone() - } - return ret + return p } type regexpObject struct { @@ -202,383 +28,63 @@ type regexpObject struct { standard bool } -func (r *regexp2Wrapper) findSubmatchIndex(s String, start int, fullUnicode, doCache bool) (result []int) { - if fullUnicode { - return r.findSubmatchIndexUnicode(s, start, doCache) - } - return r.findSubmatchIndexUTF16(s, start, doCache) -} - -func (r *regexp2Wrapper) findUTF16Cached(s String, start int, doCache bool) (match *regexp2.Match, runes []rune, err error) { - wrapped := r.rx - cache := r.cache - if cache != nil && cache.posMap == nil && cache.target.SameAs(s) { - runes = cache.runes +func regexpGroupToValue(str unicodeString, group regonaut.GroupUtf16) Value { + if group.Start >= 0 { + return str.Substring(group.Start, group.End) } else { - runes = s.utf16Runes() - cache = nil + return _undefined } - match, err = wrapped.FindRunesMatchStartingAt(runes, start) - if doCache && match != nil && err == nil { - if cache == nil { - if r.cache == nil { - r.cache = new(regexp2MatchCache) - } - *r.cache = regexp2MatchCache{ - target: s, - runes: runes, - } - } - } else { - r.cache = nil - } - return } - -func (r *regexp2Wrapper) findSubmatchIndexUTF16(s String, start int, doCache bool) (result []int) { - match, _, err := r.findUTF16Cached(s, start, doCache) - if err != nil { - return +func (r *regexpObject) execResultToArray(target String, targetUtf16 unicodeString, match *regonaut.MatchUtf16) Value { + valueArray := make([]Value, len(match.Groups)) + matchIndex := match.Groups[0].Start + for index, group := range match.Groups { + valueArray[index] = regexpGroupToValue(targetUtf16, group) } - - if match == nil { - return - } - groups := match.Groups() - - result = make([]int, 0, len(groups)<<1) - for _, group := range groups { - if len(group.Captures) > 0 { - result = append(result, group.Index, group.Index+group.Length) - } else { - result = append(result, -1, 0) - } - } - return + result := r.val.runtime.newArrayValues(valueArray) + result.self.setOwnStr("input", target, false) + result.self.setOwnStr("index", intToValue(int64(matchIndex)), false) + return result } -func (r *regexp2Wrapper) findUnicodeCached(s String, start int, doCache bool) (match *regexp2.Match, posMap []int, err error) { - var ( - runes []rune - mappedStart int - splitPair bool - savedRune rune - ) - wrapped := r.rx - cache := r.cache - if cache != nil && cache.posMap != nil && cache.target.SameAs(s) { - runes, posMap = cache.runes, cache.posMap - mappedStart, splitPair = posMapReverseLookup(posMap, start) - } else { - posMap, runes, mappedStart, splitPair = buildPosMap(&lenientUtf16Decoder{utf16Reader: s.utf16Reader()}, s.Length(), start) - cache = nil - } - if splitPair { - // temporarily set the rune at mappedStart to the second code point of the pair - _, second := utf16.EncodeRune(runes[mappedStart]) - savedRune, runes[mappedStart] = runes[mappedStart], second - } - match, err = wrapped.FindRunesMatchStartingAt(runes, mappedStart) - if doCache && match != nil && err == nil { - if splitPair { - runes[mappedStart] = savedRune - } - if cache == nil { - if r.cache == nil { - r.cache = new(regexp2MatchCache) - } - *r.cache = regexp2MatchCache{ - target: s, - runes: runes, - posMap: posMap, - } - } - } else { - r.cache = nil - } +func (r *regexpObject) execRegexp(pattern *regexpPattern, target String, sticky bool) *regonaut.MatchUtf16 { + globalOrSticky := pattern.global || sticky || pattern.sticky + index := toLength(r.getStr("lastIndex", nil)) - return -} + var match *regonaut.MatchUtf16 -func (r *regexp2Wrapper) findSubmatchIndexUnicode(s String, start int, doCache bool) (result []int) { - match, posMap, err := r.findUnicodeCached(s, start, doCache) - if match == nil || err != nil { - return + if !globalOrSticky || index < 0 { + index = 0 } - groups := match.Groups() - - result = make([]int, 0, len(groups)<<1) - for _, group := range groups { - if len(group.Captures) > 0 { - result = append(result, posMap[group.Index], posMap[group.Index+group.Length]) - } else { - result = append(result, -1, 0) - } - } - return -} - -func (r *regexp2Wrapper) findAllSubmatchIndexUTF16(s String, start, limit int, sticky bool) [][]int { - wrapped := r.rx - match, runes, err := r.findUTF16Cached(s, start, false) - if match == nil || err != nil { - return nil - } - if limit < 0 { - limit = len(runes) + 1 - } - results := make([][]int, 0, limit) - for match != nil { - groups := match.Groups() - - result := make([]int, 0, len(groups)<<1) - - for _, group := range groups { - if len(group.Captures) > 0 { - startPos := group.Index - endPos := group.Index + group.Length - result = append(result, startPos, endPos) - } else { - result = append(result, -1, 0) - } - } - - if sticky && len(result) > 1 { - if result[0] != start { - break - } - start = result[1] - } - - results = append(results, result) - limit-- - if limit <= 0 { - break - } - match, err = wrapped.FindNextMatch(match) - if err != nil { - return nil - } - } - return results -} - -func buildPosMap(rd io.RuneReader, l, start int) (posMap []int, runes []rune, mappedStart int, splitPair bool) { - posMap = make([]int, 0, l+1) - curPos := 0 - runes = make([]rune, 0, l) - startFound := false - for { - if !startFound { - if curPos == start { - mappedStart = len(runes) - startFound = true - } - if curPos > start { - // start position splits a surrogate pair - mappedStart = len(runes) - 1 - splitPair = true - startFound = true - } - } - rn, size, err := rd.ReadRune() - if err != nil { - break - } - runes = append(runes, rn) - posMap = append(posMap, curPos) - curPos += size - } - posMap = append(posMap, curPos) - return -} - -func posMapReverseLookup(posMap []int, pos int) (int, bool) { - mapped := sort.SearchInts(posMap, pos) - if mapped < len(posMap) && posMap[mapped] != pos { - return mapped - 1, true - } - return mapped, false -} - -func (r *regexp2Wrapper) findAllSubmatchIndexUnicode(s unicodeString, start, limit int, sticky bool) [][]int { - wrapped := r.rx - if limit < 0 { - limit = len(s) + 1 - } - results := make([][]int, 0, limit) - match, posMap, err := r.findUnicodeCached(s, start, false) - if err != nil { - return nil - } - for match != nil { - groups := match.Groups() - - result := make([]int, 0, len(groups)<<1) - - for _, group := range groups { - if len(group.Captures) > 0 { - start := posMap[group.Index] - end := posMap[group.Index+group.Length] - result = append(result, start, end) - } else { - result = append(result, -1, 0) - } - } - - if sticky && len(result) > 1 { - if result[0] != start { - break - } - start = result[1] - } - - results = append(results, result) - match, err = wrapped.FindNextMatch(match) - if err != nil { - return nil - } - } - return results -} - -func (r *regexp2Wrapper) findAllSubmatchIndex(s String, start, limit int, sticky, fullUnicode bool) [][]int { - a, u := devirtualizeString(s) - if u != nil { - if fullUnicode { - return r.findAllSubmatchIndexUnicode(u, start, limit, sticky) - } - return r.findAllSubmatchIndexUTF16(u, start, limit, sticky) - } - return r.findAllSubmatchIndexUTF16(a, start, limit, sticky) -} - -func (r *regexp2Wrapper) clone() *regexp2Wrapper { - return ®exp2Wrapper{ - rx: r.rx, - } -} - -func (r *regexpWrapper) findAllSubmatchIndex(s string, limit int, sticky bool) (results [][]int) { - wrapped := (*regexp.Regexp)(r) - results = wrapped.FindAllStringSubmatchIndex(s, limit) - pos := 0 if sticky { - for i, result := range results { - if len(result) > 1 { - if result[0] != pos { - return results[:i] - } - pos = result[1] - } - } - } - return -} - -func (r *regexpWrapper) findSubmatchIndex(s String, fullUnicode bool) []int { - a, u := devirtualizeString(s) - if u != nil { - return r.findSubmatchIndexUnicode(u, fullUnicode) - } - return r.findSubmatchIndexASCII(string(a)) -} - -func (r *regexpWrapper) findSubmatchIndexASCII(s string) []int { - wrapped := (*regexp.Regexp)(r) - return wrapped.FindStringSubmatchIndex(s) -} - -func (r *regexpWrapper) findSubmatchIndexUnicode(s unicodeString, fullUnicode bool) (result []int) { - wrapped := (*regexp.Regexp)(r) - if fullUnicode { - posMap, runes, _, _ := buildPosMap(&lenientUtf16Decoder{utf16Reader: s.utf16Reader()}, s.Length(), 0) - res := wrapped.FindReaderSubmatchIndex(&arrayRuneReader{runes: runes}) - for i, item := range res { - if item >= 0 { - res[i] = posMap[item] - } - } - return res + match = pattern.re.FindMatchStartingAtSticky(target.toUnicode()[1:], int(index)) + } else { + match = pattern.re.FindMatchStartingAt(target.toUnicode()[1:], int(index)) } - return wrapped.FindReaderSubmatchIndex(s.utf16RuneReader()) -} - -func (r *regexpWrapper) clone() *regexpWrapper { - return r -} - -func (r *regexpObject) execResultToArray(target String, result []int) Value { - captureCount := len(result) >> 1 - valueArray := make([]Value, captureCount) - matchIndex := result[0] - valueArray[0] = target.Substring(result[0], result[1]) - lowerBound := 0 - for index := 1; index < captureCount; index++ { - offset := index << 1 - if result[offset] >= 0 && result[offset+1] >= lowerBound { - valueArray[index] = target.Substring(result[offset], result[offset+1]) - lowerBound = result[offset] + if globalOrSticky { + if match == nil { + index = 0 } else { - valueArray[index] = _undefined - } - } - match := r.val.runtime.newArrayValues(valueArray) - match.self.setOwnStr("input", target, false) - match.self.setOwnStr("index", intToValue(int64(matchIndex)), false) - return match -} - -func (r *regexpObject) getLastIndex() int64 { - lastIndex := toLength(r.getStr("lastIndex", nil)) - if !r.pattern.global && !r.pattern.sticky { - return 0 - } - return lastIndex -} - -func (r *regexpObject) updateLastIndex(index int64, firstResult, lastResult []int) bool { - if r.pattern.sticky { - if firstResult == nil || int64(firstResult[0]) != index { - r.setOwnStr("lastIndex", intToValue(0), true) - return false - } - } else { - if firstResult == nil { - if r.pattern.global { - r.setOwnStr("lastIndex", intToValue(0), true) - } - return false + index = int64(match.Groups[0].End) } + r.setOwnStr("lastIndex", intToValue(index), true) } - if r.pattern.global || r.pattern.sticky { - r.setOwnStr("lastIndex", intToValue(int64(lastResult[1])), true) - } - return true -} - -func (r *regexpObject) execRegexp(target String) (match bool, result []int) { - index := r.getLastIndex() - if index >= 0 && index <= int64(target.Length()) { - result = r.pattern.findSubmatchIndex(target, int(index)) - } - match = r.updateLastIndex(index, result, result) - return + return match } func (r *regexpObject) exec(target String) Value { - match, result := r.execRegexp(target) - if match { - return r.execResultToArray(target, result) + targetUtf16 := target.toUnicode() + match := r.execRegexp(r.pattern, targetUtf16, false) + if match == nil { + return _null } - return _null + return r.execResultToArray(target, targetUtf16, match) } func (r *regexpObject) test(target String) bool { - match, _ := r.execRegexp(target) - return match + return r.execRegexp(r.pattern, target, false) != nil } func (r *regexpObject) clone() *regexpObject { diff --git a/regexp_test.go b/regexp_test.go index 04498e51b..b19b2f361 100644 --- a/regexp_test.go +++ b/regexp_test.go @@ -259,45 +259,6 @@ func TestRegexpUnicode(t *testing.T) { testScriptWithTestLib(SCRIPT, _undefined, t) } -func TestConvertRegexpToUnicode(t *testing.T) { - if s := convertRegexpToUnicode(`test\uD800\u0C00passed`); s != `test\uD800\u0C00passed` { - t.Fatal(s) - } - if s := convertRegexpToUnicode(`test\uD800\uDC00passed`); s != `test๐€€passed` { - t.Fatal(s) - } - if s := convertRegexpToUnicode(`test\u0023passed`); s != `test\u0023passed` { - t.Fatal(s) - } - if s := convertRegexpToUnicode(`test\u0passed`); s != `test\u0passed` { - t.Fatal(s) - } - if s := convertRegexpToUnicode(`test\uD800passed`); s != `test\uD800passed` { - t.Fatal(s) - } - if s := convertRegexpToUnicode(`test\uD800`); s != `test\uD800` { - t.Fatal(s) - } - if s := convertRegexpToUnicode(`test\uD80`); s != `test\uD80` { - t.Fatal(s) - } - if s := convertRegexpToUnicode(`\\uD800\uDC00passed`); s != `\\uD800\uDC00passed` { - t.Fatal(s) - } - if s := convertRegexpToUnicode(`testpassed`); s != `testpassed` { - t.Fatal(s) - } -} - -func TestConvertRegexpToUtf16(t *testing.T) { - if s := convertRegexpToUtf16(`๐€€`); s != `\ud800\udc00` { - t.Fatal(s) - } - if s := convertRegexpToUtf16(`\๐€€`); s != `\\\ud800\udc00` { - t.Fatal(s) - } -} - func TestEscapeInvalidUtf16(t *testing.T) { if s := escapeInvalidUtf16(asciiString("test")); s != "test" { t.Fatal(s) @@ -390,98 +351,98 @@ func TestRegexpEscapeSource(t *testing.T) { testScript(SCRIPT, asciiString(`href="(.+?)(\/.*\/\S+?)\/"`), t) } -func TestRegexpConsecutiveMatchCache(t *testing.T) { - const SCRIPT = ` - (function test(unicode) { - var regex = new RegExp('t(e)(st(\\d?))', unicode?'gu':'g'); - var string = 'test1test2'; - var match; - var matches = []; - while (match = regex.exec(string)) { - matches.push(match); - } - var expectedMatches = [ - [ - 'test1', - 'e', - 'st1', - '1' - ], - [ - 'test2', - 'e', - 'st2', - '2' - ] - ]; - expectedMatches[0].index = 0; - expectedMatches[0].input = 'test1test2'; - expectedMatches[1].index = 5; - expectedMatches[1].input = 'test1test2'; - - assert(deepEqual(matches, expectedMatches), "#1"); - - // try the same regexp with a different string - regex.lastIndex = 0; - match = regex.exec(' test5'); - var expectedMatch = [ - 'test5', - 'e', - 'st5', - '5' - ]; - expectedMatch.index = 1; - expectedMatch.input = ' test5'; - assert(deepEqual(match, expectedMatch), "#2"); - assert.sameValue(regex.lastIndex, 6, "#3"); - - // continue matching with a different string - match = regex.exec(' test5test6'); - expectedMatch = [ - 'test6', - 'e', - 'st6', - '6' - ]; - expectedMatch.index = 6; - expectedMatch.input = ' test5test6'; - assert(deepEqual(match, expectedMatch), "#4"); - assert.sameValue(regex.lastIndex, 11, "#5"); - - match = regex.exec(' test5test6'); - assert.sameValue(match, null, "#6"); - return regex; - }); - ` - vm := New() - _, _ = vm.RunProgram(testLib()) - _, _ = vm.RunProgram(testLibX()) - v, err := vm.RunString(SCRIPT) - if err != nil { - t.Fatal(err) - } - var f func(bool) (*Object, error) - err = vm.ExportTo(v, &f) - if err != nil { - t.Fatal(err) - } - - regex, err := f(false) - if err != nil { - t.Fatal(err) - } - if regex.self.(*regexpObject).pattern.regexp2Wrapper.cache != nil { - t.Fatal("Cache is not nil (non-unicode)") - } - - regex, err = f(true) - if err != nil { - t.Fatal(err) - } - if regex.self.(*regexpObject).pattern.regexp2Wrapper.cache != nil { - t.Fatal("Cache is not nil (unicode)") - } -} +// func TestRegexpConsecutiveMatchCache(t *testing.T) { +// const SCRIPT = ` +// (function test(unicode) { +// var regex = new RegExp('t(e)(st(\\d?))', unicode?'gu':'g'); +// var string = 'test1test2'; +// var match; +// var matches = []; +// while (match = regex.exec(string)) { +// matches.push(match); +// } +// var expectedMatches = [ +// [ +// 'test1', +// 'e', +// 'st1', +// '1' +// ], +// [ +// 'test2', +// 'e', +// 'st2', +// '2' +// ] +// ]; +// expectedMatches[0].index = 0; +// expectedMatches[0].input = 'test1test2'; +// expectedMatches[1].index = 5; +// expectedMatches[1].input = 'test1test2'; +// +// assert(deepEqual(matches, expectedMatches), "#1"); +// +// // try the same regexp with a different string +// regex.lastIndex = 0; +// match = regex.exec(' test5'); +// var expectedMatch = [ +// 'test5', +// 'e', +// 'st5', +// '5' +// ]; +// expectedMatch.index = 1; +// expectedMatch.input = ' test5'; +// assert(deepEqual(match, expectedMatch), "#2"); +// assert.sameValue(regex.lastIndex, 6, "#3"); +// +// // continue matching with a different string +// match = regex.exec(' test5test6'); +// expectedMatch = [ +// 'test6', +// 'e', +// 'st6', +// '6' +// ]; +// expectedMatch.index = 6; +// expectedMatch.input = ' test5test6'; +// assert(deepEqual(match, expectedMatch), "#4"); +// assert.sameValue(regex.lastIndex, 11, "#5"); +// +// match = regex.exec(' test5test6'); +// assert.sameValue(match, null, "#6"); +// return regex; +// }); +// ` +// vm := New() +// _, _ = vm.RunProgram(testLib()) +// _, _ = vm.RunProgram(testLibX()) +// v, err := vm.RunString(SCRIPT) +// if err != nil { +// t.Fatal(err) +// } +// var f func(bool) (*Object, error) +// err = vm.ExportTo(v, &f) +// if err != nil { +// t.Fatal(err) +// } +// +// regex, err := f(false) +// if err != nil { +// t.Fatal(err) +// } +// if regex.self.(*regexpObject).pattern.regexp2Wrapper.cache != nil { +// t.Fatal("Cache is not nil (non-unicode)") +// } +// +// regex, err = f(true) +// if err != nil { +// t.Fatal(err) +// } +// if regex.self.(*regexpObject).pattern.regexp2Wrapper.cache != nil { +// t.Fatal("Cache is not nil (unicode)") +// } +// } func TestRegexpMatchAll(t *testing.T) { const SCRIPT = ` @@ -689,15 +650,6 @@ func TestRegexpLookbehindAssertion(t *testing.T) { testScriptWithTestLib(SCRIPT, _undefined, t) } -func TestRegexpInvalidUTF8(t *testing.T) { - vm := New() - // Note that normally vm.ToValue() would replace invalid UTF-8 sequences with RuneError - _, err := vm.New(vm.Get("RegExp"), asciiString([]byte{0xAD})) - if err == nil { - t.Fatal("Expected error") - } -} - // this should not cause data races when run with -race func TestRegexpConcurrentLiterals(t *testing.T) { prg := MustCompile("test.js", `var r = /(?`. @@ -169,29 +152,13 @@ var ( "test/language/literals/string/S7.8.4_A4.3_T2.js": true, "test/language/literals/string/S7.8.4_A4.3_T1.js": true, - // Regexp - "test/language/literals/regexp/invalid-range-negative-lookbehind.js": true, - "test/language/literals/regexp/invalid-range-lookbehind.js": true, - "test/language/literals/regexp/invalid-optional-negative-lookbehind.js": true, - "test/language/literals/regexp/invalid-optional-lookbehind.js": true, - - // unicode full case folding - "test/built-ins/RegExp/unicode_full_case_folding.js": true, - // FIXME bugs // Left-hand side as a CoverParenthesizedExpression "test/language/expressions/assignment/fn-name-lhs-cover.js": true, - // Character \ missing from character class [\c] - "test/annexB/built-ins/RegExp/RegExp-invalid-control-escape-character-class.js": true, - "test/annexB/built-ins/RegExp/RegExp-control-escape-russian-letter.js": true, - // Skip due to regexp named groups "test/built-ins/String/prototype/replaceAll/searchValue-replacer-RegExp-call.js": true, - - "test/built-ins/RegExp/nullable-quantifier.js": true, - "test/built-ins/RegExp/lookahead-quantifier-match-groups.js": true, } featuresBlackList = []string{ @@ -202,7 +169,6 @@ var ( "regexp-duplicate-named-groups", "regexp-unicode-property-escapes", "regexp-match-indices", - "regexp-modifiers", "RegExp.escape", "legacy-regexp", "tail-call-optimization", From 200456586801c7298cec730eb70d1e4d44524b7a Mon Sep 17 00:00:00 2001 From: auvred Date: Sat, 13 Sep 2025 11:11:23 +0300 Subject: [PATCH 2/4] get rid of unicodeSets for now --- builtin_regexp.go | 68 +++++++++++------------------------------------ regexp.go | 2 +- 2 files changed, 16 insertions(+), 54 deletions(-) diff --git a/builtin_regexp.go b/builtin_regexp.go index 18b19acc3..3245a164d 100644 --- a/builtin_regexp.go +++ b/builtin_regexp.go @@ -96,7 +96,7 @@ func escapeInvalidUtf16(s String) string { func compileRegexp(patternStr String, flags string) (p *regexpPattern, err error) { patternUtf16 := patternStr.toUnicode() - var global, ignoreCase, multiline, dotAll, sticky, unicode, unicodeSets bool + var global, ignoreCase, multiline, dotAll, sticky, unicode bool reFlags := regonaut.FlagAnnexB @@ -146,12 +146,6 @@ func compileRegexp(patternStr String, flags string) (p *regexpPattern, err error } reFlags |= regonaut.FlagUnicode unicode = true - case 'v': - if unicodeSets { - invalidFlags() - } - reFlags |= regonaut.FlagUnicodeSets - unicodeSets = true default: invalidFlags() return @@ -166,15 +160,14 @@ func compileRegexp(patternStr String, flags string) (p *regexpPattern, err error } p = ®expPattern{ - src: patternUtf16, - re: re, - global: global, - ignoreCase: ignoreCase, - multiline: multiline, - dotAll: dotAll, - sticky: sticky, - unicode: unicode, - unicodeSets: unicodeSets, + src: patternUtf16, + re: re, + global: global, + ignoreCase: ignoreCase, + multiline: multiline, + dotAll: dotAll, + sticky: sticky, + unicode: unicode, } return } @@ -337,9 +330,6 @@ func (r *Runtime) regexpproto_toString(call FunctionCall) Value { if this.pattern.unicode { sb.WriteRune('u') } - if this.pattern.unicodeSets { - sb.WriteRune('v') - } if this.pattern.sticky { sb.WriteRune('y') } @@ -486,20 +476,6 @@ func (r *Runtime) regexpproto_getUnicode(call FunctionCall) Value { } } -func (r *Runtime) regexpproto_getUnicodeSets(call FunctionCall) Value { - if this, ok := r.toObject(call.This).self.(*regexpObject); ok { - if this.pattern.unicodeSets { - return valueTrue - } else { - return valueFalse - } - } else if call.This == r.global.RegExpPrototype { - return _undefined - } else { - panic(r.NewTypeError("Method RegExp.prototype.unicodeSets getter called on incompatible receiver %s", r.objectproto_toString(FunctionCall{This: call.This}))) - } -} - func (r *Runtime) regexpproto_getSticky(call FunctionCall) Value { if this, ok := r.toObject(call.This).self.(*regexpObject); ok { if this.pattern.sticky { @@ -515,7 +491,7 @@ func (r *Runtime) regexpproto_getSticky(call FunctionCall) Value { } func (r *Runtime) regexpproto_getFlags(call FunctionCall) Value { - var global, ignoreCase, multiline, dotAll, sticky, unicode, unicodeSets bool + var global, ignoreCase, multiline, dotAll, sticky, unicode bool thisObj := r.toObject(call.This) size := 0 @@ -555,12 +531,6 @@ func (r *Runtime) regexpproto_getFlags(call FunctionCall) Value { size++ } } - if v := thisObj.self.getStr("unicodeSets", nil); v != nil { - unicodeSets = v.ToBoolean() - if unicodeSets { - size++ - } - } var sb strings.Builder sb.Grow(size) @@ -579,9 +549,6 @@ func (r *Runtime) regexpproto_getFlags(call FunctionCall) Value { if unicode { sb.WriteByte('u') } - if unicodeSets { - sb.WriteByte('v') - } if sticky { sb.WriteByte('y') } @@ -689,7 +656,7 @@ func (r *Runtime) regexpproto_stdMatcher(call FunctionCall) Value { a = append(a, regexpGroupToValue(sUtf16, match.Groups[0])) if match.Groups[0].Start == match.Groups[0].End { thisIndex := toLength(rx.getStr("lastIndex", nil)) - rx.setOwnStr("lastIndex", valueInt(advanceStringIndex64(s, thisIndex, rx.pattern.unicode || rx.pattern.unicodeSets)), true) + rx.setOwnStr("lastIndex", valueInt(advanceStringIndex64(s, thisIndex, rx.pattern.unicode)), true) } } @@ -970,11 +937,11 @@ func (r *Runtime) regexpproto_stdSplitter(call FunctionCall) Value { search.setOwnStr("lastIndex", intToValue(int64(q)), true) z := search.execRegexp(pattern, s, true) if z == nil { - q = advanceStringIndex(s, q, search.pattern.unicode || search.pattern.unicodeSets) + q = advanceStringIndex(s, q, search.pattern.unicode) } else { e := toLength(search.getStr("lastIndex", nil)) if e == int64(p) { - q = advanceStringIndex(s, q, search.pattern.unicode || search.pattern.unicodeSets) + q = advanceStringIndex(s, q, search.pattern.unicode) } else { a = append(a, s.Substring(p, q)) if int64(len(a)) == lim { @@ -1161,7 +1128,7 @@ func (r *Runtime) regexpproto_stdReplacer(call FunctionCall) Value { if match.Groups[0].Start == match.Groups[0].End { thisIndex := toLength(rx.getStr("lastIndex", nil)) - rx.setOwnStr("lastIndex", valueInt(advanceStringIndex64(s, thisIndex, rx.pattern.unicode || rx.pattern.unicodeSets)), true) + rx.setOwnStr("lastIndex", valueInt(advanceStringIndex64(s, thisIndex, rx.pattern.unicode)), true) } } @@ -1252,11 +1219,6 @@ func (r *Runtime) getRegExpPrototype() *Object { getterFunc: r.newNativeFunc(r.regexpproto_getUnicode, "get unicode", 0), accessor: true, }, false) - o.setOwnStr("unicodeSets", &valueProperty{ - configurable: true, - getterFunc: r.newNativeFunc(r.regexpproto_getUnicodeSets, "get unicodeSets", 0), - accessor: true, - }, false) o.setOwnStr("sticky", &valueProperty{ configurable: true, getterFunc: r.newNativeFunc(r.regexpproto_getSticky, "get sticky", 0), @@ -1273,7 +1235,7 @@ func (r *Runtime) getRegExpPrototype() *Object { o._putSym(SymSearch, valueProp(r.newNativeFunc(r.regexpproto_stdSearch, "[Symbol.search]", 1), true, false, true)) o._putSym(SymSplit, valueProp(r.newNativeFunc(r.regexpproto_stdSplitter, "[Symbol.split]", 2), true, false, true)) o._putSym(SymReplace, valueProp(r.newNativeFunc(r.regexpproto_stdReplacer, "[Symbol.replace]", 2), true, false, true)) - o.guard("exec", "global", "multiline", "ignoreCase", "unicode", "unicodeSets", "sticky") + o.guard("exec", "global", "multiline", "ignoreCase", "unicode", "sticky") } return ret } diff --git a/regexp.go b/regexp.go index 6f0ef13f0..a124fd63c 100644 --- a/regexp.go +++ b/regexp.go @@ -9,7 +9,7 @@ import ( type regexpPattern struct { src unicodeString - global, ignoreCase, multiline, dotAll, sticky, unicode, unicodeSets bool + global, ignoreCase, multiline, dotAll, sticky, unicode bool re *regonaut.RegExpUtf16 } From 36c76bdbec233955601b461eaa3748afda2b0f07 Mon Sep 17 00:00:00 2001 From: auvred Date: Sat, 13 Sep 2025 11:26:22 +0300 Subject: [PATCH 3/4] cleanup --- builtin_regexp.go | 11 +++--- regexp.go | 8 +--- regexp_test.go | 93 ----------------------------------------------- vm.go | 2 +- 4 files changed, 7 insertions(+), 107 deletions(-) diff --git a/builtin_regexp.go b/builtin_regexp.go index 3245a164d..f51be46a5 100644 --- a/builtin_regexp.go +++ b/builtin_regexp.go @@ -2,11 +2,10 @@ package goja import ( "fmt" + "github.com/auvred/regonaut" "strings" "unicode/utf16" "unicode/utf8" - - "github.com/auvred/regonaut" ) func (r *Runtime) newRegexpObject(proto *Object) *regexpObject { @@ -702,8 +701,8 @@ func (r *Runtime) regexpproto_stdMatcherAll(call FunctionCall) Value { matcher := r.toConstructor(c)([]Value{call.This, flags}, nil) matcher.self.setOwnStr("lastIndex", valueInt(toLength(thisObj.self.getStr("lastIndex", nil))), true) flagsStr := flags.String() - global := strings.ContainsRune(flagsStr, 'g') - fullUnicode := strings.ContainsRune(flagsStr, 'u') || strings.ContainsRune(flagsStr, 'v') + global := strings.Contains(flagsStr, "g") + fullUnicode := strings.Contains(flagsStr, "u") return r.createRegExpStringIterator(matcher, s, global, fullUnicode) } @@ -902,7 +901,7 @@ func (r *Runtime) regexpproto_stdSplitter(call FunctionCall) Value { splitter = r.toConstructor(c)([]Value{rxObj, flags}, nil) search = r.checkStdRegexp(splitter) if search == nil { - return r.regexpproto_stdSplitterGeneric(splitter, s, limitValue, strings.ContainsRune(flagsStr, 'u') || strings.ContainsRune(flagsStr, 'v')) + return r.regexpproto_stdSplitterGeneric(splitter, s, limitValue, strings.Contains(flagsStr, "u")) } } @@ -973,7 +972,7 @@ func (r *Runtime) regexpproto_stdReplacerGeneric(rxObj *Object, s, replaceStr St var results []Value flags := nilSafe(rxObj.self.getStr("flags", nil)).String() isGlobal := strings.ContainsRune(flags, 'g') - isUnicode := strings.ContainsRune(flags, 'u') || strings.ContainsRune(flags, 'v') + isUnicode := strings.ContainsRune(flags, 'u') if isGlobal { results = r.getGlobalRegexpMatches(rxObj, s, isUnicode) } else { diff --git a/regexp.go b/regexp.go index a124fd63c..20ad0dc29 100644 --- a/regexp.go +++ b/regexp.go @@ -5,7 +5,7 @@ import ( "github.com/dop251/goja/unistring" ) -// Not goroutine-safe. Use regexpPattern.clone() +// Safe for concurrent use by multiple goroutines. type regexpPattern struct { src unicodeString @@ -14,12 +14,6 @@ type regexpPattern struct { re *regonaut.RegExpUtf16 } -// TODO: regonaut's RegExp is safe for concurrent use -// clone creates a copy of the regexpPattern which can be used concurrently. -func (p *regexpPattern) clone() *regexpPattern { - return p -} - type regexpObject struct { baseObject pattern *regexpPattern diff --git a/regexp_test.go b/regexp_test.go index b19b2f361..9e778babe 100644 --- a/regexp_test.go +++ b/regexp_test.go @@ -351,99 +351,6 @@ func TestRegexpEscapeSource(t *testing.T) { testScript(SCRIPT, asciiString(`href="(.+?)(\/.*\/\S+?)\/"`), t) } -// func TestRegexpConsecutiveMatchCache(t *testing.T) { -// const SCRIPT = ` -// (function test(unicode) { -// var regex = new RegExp('t(e)(st(\\d?))', unicode?'gu':'g'); -// var string = 'test1test2'; -// var match; -// var matches = []; -// while (match = regex.exec(string)) { -// matches.push(match); -// } -// var expectedMatches = [ -// [ -// 'test1', -// 'e', -// 'st1', -// '1' -// ], -// [ -// 'test2', -// 'e', -// 'st2', -// '2' -// ] -// ]; -// expectedMatches[0].index = 0; -// expectedMatches[0].input = 'test1test2'; -// expectedMatches[1].index = 5; -// expectedMatches[1].input = 'test1test2'; -// -// assert(deepEqual(matches, expectedMatches), "#1"); -// -// // try the same regexp with a different string -// regex.lastIndex = 0; -// match = regex.exec(' test5'); -// var expectedMatch = [ -// 'test5', -// 'e', -// 'st5', -// '5' -// ]; -// expectedMatch.index = 1; -// expectedMatch.input = ' test5'; -// assert(deepEqual(match, expectedMatch), "#2"); -// assert.sameValue(regex.lastIndex, 6, "#3"); -// -// // continue matching with a different string -// match = regex.exec(' test5test6'); -// expectedMatch = [ -// 'test6', -// 'e', -// 'st6', -// '6' -// ]; -// expectedMatch.index = 6; -// expectedMatch.input = ' test5test6'; -// assert(deepEqual(match, expectedMatch), "#4"); -// assert.sameValue(regex.lastIndex, 11, "#5"); -// -// match = regex.exec(' test5test6'); -// assert.sameValue(match, null, "#6"); -// return regex; -// }); -// ` -// vm := New() -// _, _ = vm.RunProgram(testLib()) -// _, _ = vm.RunProgram(testLibX()) -// v, err := vm.RunString(SCRIPT) -// if err != nil { -// t.Fatal(err) -// } -// var f func(bool) (*Object, error) -// err = vm.ExportTo(v, &f) -// if err != nil { -// t.Fatal(err) -// } -// -// regex, err := f(false) -// if err != nil { -// t.Fatal(err) -// } -// if regex.self.(*regexpObject).pattern.regexp2Wrapper.cache != nil { -// t.Fatal("Cache is not nil (non-unicode)") -// } -// -// regex, err = f(true) -// if err != nil { -// t.Fatal(err) -// } -// if regex.self.(*regexpObject).pattern.regexp2Wrapper.cache != nil { -// t.Fatal("Cache is not nil (unicode)") -// } -// } - func TestRegexpMatchAll(t *testing.T) { const SCRIPT = ` (function test(unicode) { diff --git a/vm.go b/vm.go index 0782930c2..b770fbec5 100644 --- a/vm.go +++ b/vm.go @@ -2818,7 +2818,7 @@ type newRegexp struct { } func (n *newRegexp) exec(vm *vm) { - vm.push(vm.r.newRegExpp(n.pattern.clone(), n.src, vm.r.getRegExpPrototype()).val) + vm.push(vm.r.newRegExpp(n.pattern, n.src, vm.r.getRegExpPrototype()).val) vm.pc++ } From dc87376df642d65302d86eed6af0b51e3be7144e Mon Sep 17 00:00:00 2001 From: auvred Date: Sat, 13 Sep 2025 11:35:20 +0300 Subject: [PATCH 4/4] enable language/literals/regexp/u- tests --- tc39_test.go | 3 --- 1 file changed, 3 deletions(-) diff --git a/tc39_test.go b/tc39_test.go index 490f84721..bcef8125d 100644 --- a/tc39_test.go +++ b/tc39_test.go @@ -260,9 +260,6 @@ func init() { "test/language/eval-code/direct/async-gen-", - // restricted unicode regexp syntax - "test/language/literals/regexp/u-", - // legacy octal escape in strings in strict mode "test/language/literals/string/legacy-octal-", "test/language/literals/string/legacy-non-octal-",