From 0ba6dd94dec24257be4f528c70d3c62d0c9eb48a Mon Sep 17 00:00:00 2001 From: Stanislav Modrak Date: Fri, 20 Mar 2026 13:24:01 +0000 Subject: [PATCH 1/3] refactor: scan regex patterns in a single pass --- lib/tokenizer.ts | 326 +++++++++++++++++++++++++++++++++++++---------- 1 file changed, 256 insertions(+), 70 deletions(-) diff --git a/lib/tokenizer.ts b/lib/tokenizer.ts index 34a522a..dddcd81 100644 --- a/lib/tokenizer.ts +++ b/lib/tokenizer.ts @@ -1,8 +1,14 @@ -import * as util from './util'; -import { Group, types, Root, Token, Reference, Char } from './types'; +import { Group, types, Root, Token, Reference, Char, Position, SetTokens, Set as SetToken } from './types'; import * as sets from './sets'; type ReferenceQueue = { reference: (Reference | Char), stack: Token[], index: number }[]; +type DecodedEscape = { value: number, nextIndex: number } | null; +type ParsedNumber = { value: number, nextIndex: number } | null; +type EscapedToken = Char | Position | Reference | SetToken; +type EscapedTokenResult = { token: EscapedToken, nextIndex: number }; +type ClassToken = Char | SetToken; +type ClassTokenResult = { token: ClassToken, nextIndex: number }; +type RepetitionBounds = { min: number, max: number, nextIndex: number } | null; /** * Valid opening characters for capture group names. @@ -14,7 +20,239 @@ const captureGroupFirstChar = /^[a-zA-Z_$]$/i; */ const captureGroupChars = /^[a-zA-Z0-9_$]$/i; -const digit = /\d/; +const CTRL = '@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^ ?'; + +const isDigitCode = (charCode: number) => charCode >= 48 && charCode <= 57; + +const isUpperHexCode = (charCode: number) => + isDigitCode(charCode) || (charCode >= 65 && charCode <= 70); + +const hexValue = (charCode: number) => charCode <= 57 ? charCode - 48 : charCode - 55; + +const readNumber = (str: string, index: number): ParsedNumber => { + if (!isDigitCode(str.charCodeAt(index))) { + return null; + } + + let value = 0; + let nextIndex = index; + while (nextIndex < str.length && isDigitCode(str.charCodeAt(nextIndex))) { + value = value * 10 + str.charCodeAt(nextIndex) - 48; + nextIndex++; + } + + return { value, nextIndex }; +}; + +const readEscapedChar = (str: string, slashIndex: number): DecodedEscape => { + const escapeType = str[slashIndex + 1]; + + switch (escapeType) { + case 'u': { + if (slashIndex + 5 >= str.length) { + return null; + } + + let value = 0; + for (let index = slashIndex + 2; index <= slashIndex + 5; index++) { + const charCode = str.charCodeAt(index); + if (!isUpperHexCode(charCode)) { + return null; + } + value = value * 16 + hexValue(charCode); + } + return { value, nextIndex: slashIndex + 6 }; + } + + case 'x': { + if (slashIndex + 3 >= str.length) { + return null; + } + + let value = 0; + for (let index = slashIndex + 2; index <= slashIndex + 3; index++) { + const charCode = str.charCodeAt(index); + if (!isUpperHexCode(charCode)) { + return null; + } + value = value * 16 + hexValue(charCode); + } + return { value, nextIndex: slashIndex + 4 }; + } + + case 'c': { + const ctrlChar = str[slashIndex + 2]; + if (!ctrlChar) { + return null; + } + + const value = CTRL.indexOf(ctrlChar); + return value === -1 ? null : { value, nextIndex: slashIndex + 3 }; + } + + case '0': + return { value: 0, nextIndex: slashIndex + 2 }; + + case 't': + return { value: 9, nextIndex: slashIndex + 2 }; + + case 'n': + return { value: 10, nextIndex: slashIndex + 2 }; + + case 'v': + return { value: 11, nextIndex: slashIndex + 2 }; + + case 'f': + return { value: 12, nextIndex: slashIndex + 2 }; + + case 'r': + return { value: 13, nextIndex: slashIndex + 2 }; + + default: + return null; + } +}; + +const readSet = (escapedType: string): SetToken | null => { + switch (escapedType) { + case 'w': + return sets.words(); + case 'W': + return sets.notWords(); + case 'd': + return sets.ints(); + case 'D': + return sets.notInts(); + case 's': + return sets.whitespace(); + case 'S': + return sets.notWhitespace(); + default: + return null; + } +}; + +const readEscapedToken = (str: string, slashIndex: number, inClass: boolean): EscapedTokenResult => { + const escapedType = str[slashIndex + 1]; + const setToken = readSet(escapedType); + if (setToken) { + return { token: setToken, nextIndex: slashIndex + 2 }; + } + + if (inClass) { + // Character classes treat \b as backspace instead of a word boundary. + if (escapedType === 'b') { + return { token: { type: types.CHAR, value: 8 }, nextIndex: slashIndex + 2 }; + } + } else { + if (escapedType === 'b' || escapedType === 'B') { + return { + token: { type: types.POSITION, value: escapedType }, + nextIndex: slashIndex + 2, + }; + } + + // Outside character classes, decimal escapes are parsed as references first + // and only normalized to chars later if there are not enough capture groups. + if (escapedType !== '0') { + const reference = readNumber(str, slashIndex + 1); + if (reference) { + return { + token: { type: types.REFERENCE, value: reference.value }, + nextIndex: reference.nextIndex, + }; + } + } + } + + const decoded = readEscapedChar(str, slashIndex); + return { + token: { + type: types.CHAR, + value: decoded ? decoded.value : str.charCodeAt(slashIndex + 1), + }, + nextIndex: decoded ? decoded.nextIndex : slashIndex + 2, + }; +}; + +const readClassToken = (str: string, index: number): ClassTokenResult => { + if (str[index] === '\\') { + return readEscapedToken(str, index, true) as ClassTokenResult; + } + + return { + token: { type: types.CHAR, value: str.charCodeAt(index) }, + nextIndex: index + 1, + }; +}; + +const tokenizeClassAt = (str: string, index: number, regexpStr: string): [SetTokens, number] => { + const tokens: SetTokens = []; + let i = index; + + while (i < str.length) { + if (str[i] === ']') { + return [tokens, i + 1]; + } + + if (str[i] === '\\' && i + 1 >= str.length) { + break; + } + + const first = readClassToken(str, i); + const nextChar = str[first.nextIndex]; + if ( + first.token.type === types.CHAR && + nextChar === '-' && + first.nextIndex + 1 < str.length && + str[first.nextIndex + 1] !== ']' + ) { + // Only char-to-char pairs become ranges; predefined sets keep the dash literal. + const second = readClassToken(str, first.nextIndex + 1); + if (second.token.type === types.CHAR) { + tokens.push({ + type: types.RANGE, + from: first.token.value, + to: second.token.value, + }); + i = second.nextIndex; + continue; + } + } + + tokens.push(first.token); + i = first.nextIndex; + } + + throw new SyntaxError(`Invalid regular expression: /${regexpStr}/: Unterminated character class`); +}; + +const readRepetition = (str: string, index: number): RepetitionBounds => { + const min = readNumber(str, index); + if (!min) { + return null; + } + + if (str[min.nextIndex] === '}') { + return { min: min.value, max: min.value, nextIndex: min.nextIndex + 1 }; + } + + if (str[min.nextIndex] !== ',') { + return null; + } + + // `{n,}` is open-ended, while `{n,m}` requires a second parsed integer. + if (str[min.nextIndex + 1] === '}') { + return { min: min.value, max: Infinity, nextIndex: min.nextIndex + 2 }; + } + + const max = readNumber(str, min.nextIndex + 1); + if (!max || str[max.nextIndex] !== '}') { + return null; + } + + return { min: min.value, max: max.value, nextIndex: max.nextIndex + 1 }; +}; /** * Tokenizes a regular expression (that is currently a string) @@ -42,14 +280,13 @@ export const tokenizer = (regexpStr: string): Root => { ); }; - // Decode a few escaped characters. - let str = util.strToChars(regexpStr); + const str = regexpStr; // Iterate through each character in string. while (i < str.length) { switch (c = str[i++]) { // Handle escaped characters, inclues a few sets. - case '\\': + case '\\': { if (i === str.length) { throw new SyntaxError( `Invalid regular expression: /${ @@ -57,62 +294,15 @@ export const tokenizer = (regexpStr: string): Root => { }/: \\ at end of pattern`, ); } - switch (c = str[i++]) { - case 'b': - last.push({ type: types.POSITION, value: 'b' }); - break; - - case 'B': - last.push({ type: types.POSITION, value: 'B' }); - break; - - case 'w': - last.push(sets.words()); - break; - - case 'W': - last.push(sets.notWords()); - break; - - case 'd': - last.push(sets.ints()); - break; - - case 'D': - last.push(sets.notInts()); - break; - - case 's': - last.push(sets.whitespace()); - break; - - case 'S': - last.push(sets.notWhitespace()); - break; - - default: - // Check if c is integer. - // In which case it's a reference. - if (digit.test(c)) { - let digits = c; - - while (i < str.length && digit.test(str[i])) { - digits += str[i++]; - } - - let value = parseInt(digits, 10); - const reference: Reference = { type: types.REFERENCE, value }; - - last.push(reference); - referenceQueue.push({ reference, stack: last, index: last.length - 1 }); - - // Escaped character. - } else { - last.push({ type: types.CHAR, value: c.charCodeAt(0) }); - } + const escaped = readEscapedToken(str, i - 1, false); + last.push(escaped.token); + if (escaped.token.type === types.REFERENCE) { + referenceQueue.push({ reference: escaped.token, stack: last, index: last.length - 1 }); } + i = escaped.nextIndex; break; + } // Positionals. @@ -137,10 +327,8 @@ export const tokenizer = (regexpStr: string): Root => { } // Get all the characters in class. - let classTokens = util.tokenizeClass(str.slice(i), regexpStr); - - // Increase index by length of class. - i += classTokens[1]; + let classTokens = tokenizeClassAt(str, i, regexpStr); + i = classTokens[1]; last.push({ type: types.SET, set: classTokens[0], @@ -291,19 +479,17 @@ export const tokenizer = (regexpStr: string): Root => { // This design is chosen because there could be more than // one repetition symbols in a regex i.e. `a?+{2,3}`. case '{': { - let rs = /^(\d+)(,(\d+)?)?\}/.exec(str.slice(i)), min, max; - if (rs !== null) { + let repetition = readRepetition(str, i); + if (repetition !== null) { if (last.length === 0) { repeatErr(i); } - min = parseInt(rs[1], 10); - max = rs[2] ? rs[3] ? parseInt(rs[3], 10) : Infinity : min; - i += rs[0].length; + i = repetition.nextIndex; last.push({ type: types.REPETITION, - min, - max, + min: repetition.min, + max: repetition.max, value: last.pop(), }); } else { From f40fbe279e2da55e27db5a8f61fd1dc397d66db0 Mon Sep 17 00:00:00 2001 From: Stanislav Modrak Date: Fri, 20 Mar 2026 13:31:02 +0000 Subject: [PATCH 2/3] test: add benchmark suite --- README.md | 15 ++++ benchmark/index.js | 216 +++++++++++++++++++++++++++++++++++++++++++++ package.json | 1 + 3 files changed, 232 insertions(+) create mode 100644 benchmark/index.js diff --git a/README.md b/README.md index 140c229..b7c72e1 100644 --- a/README.md +++ b/README.md @@ -208,6 +208,21 @@ The following latest JavaScript additions are not supported yet: } ``` +# Benchmarking + +Run the benchmark suite against the compiled build with: + +```sh +npm run bench +``` + +The benchmark covers representative tokenizer, reconstruct, and roundtrip workloads. +You can narrow it to one suite or run longer samples: + +```sh +npm run bench -- --suite tokenizer --min-ms 750 +``` + `/[abc]/` ```js diff --git a/benchmark/index.js b/benchmark/index.js new file mode 100644 index 0000000..7a0f470 --- /dev/null +++ b/benchmark/index.js @@ -0,0 +1,216 @@ +#!/usr/bin/env node +'use strict'; + +const { performance } = require('perf_hooks'); +const tokenizer = require('../dist'); + +const reconstruct = tokenizer.reconstruct; + +const DEFAULT_MIN_MS = 400; +const DEFAULT_WARMUP_RUNS = 5000; + +const CASES = [ + { + name: 'literal', + pattern: 'walnuts', + }, + { + name: 'alternation', + pattern: '(?:foo|bar|baz){1,3}(qux|quux)', + }, + { + name: 'named-group', + pattern: '(?\\d{4})-(?\\d{2})-(?\\d{2})', + }, + { + name: 'path-like', + pattern: '^(?:\\/(?:[A-Za-z0-9._~-]|%[0-9A-Fa-f]{2}){1,64}){1,24}\\/?$', + }, + { + name: 'email-like', + pattern: '^(?:[A-Za-z0-9_+.-]{1,64})@(?:[A-Za-z0-9-]{1,63}\\.){1,8}[A-Za-z]{2,24}$', + }, + { + name: 'class-heavy', + pattern: '^(?:[A-Za-z0-9_./-]{3,32}|\\[(?:\\d{1,3}\\.){3}\\d{1,3}\\])(?:,(?:[A-Za-z0-9_./-]{3,32}|\\[(?:\\d{1,3}\\.){3}\\d{1,3}\\])){0,50}$', + }, + { + name: 'dense-sets', + pattern: '^(?:[\\w.-]{1,16}:[^\\s\\]]{1,32};?){1,40}$', + }, + { + name: 'backrefs', + pattern: '<(\\w+)>(?:[^<]|<(?!\\/\\1>))*<\\/\\1>', + }, +]; + +const SUITE_NAMES = ['tokenizer', 'reconstruct', 'roundtrip']; + +const parseArgs = (argv) => { + const options = { + minMs: DEFAULT_MIN_MS, + warmupRuns: DEFAULT_WARMUP_RUNS, + suite: 'all', + }; + + for (let i = 0; i < argv.length; i++) { + const arg = argv[i]; + switch (arg) { + case '--min-ms': + options.minMs = parsePositiveInt(argv[++i], '--min-ms'); + break; + + case '--warmup-runs': + options.warmupRuns = parsePositiveInt(argv[++i], '--warmup-runs'); + break; + + case '--suite': + options.suite = argv[++i]; + if (options.suite !== 'all' && !SUITE_NAMES.includes(options.suite)) { + throw new Error(`Unknown suite '${options.suite}'`); + } + break; + + case '--help': + printHelp(); + process.exit(0); + break; + + default: + throw new Error(`Unknown argument '${arg}'`); + } + } + + return options; +}; + +const parsePositiveInt = (value, flagName) => { + const parsed = Number.parseInt(value, 10); + if (!Number.isFinite(parsed) || parsed <= 0) { + throw new Error(`${flagName} must be a positive integer`); + } + return parsed; +}; + +const printHelp = () => { + console.log('Usage: node benchmark/index.js [--suite ] [--min-ms ] [--warmup-runs ]'); + console.log(''); + console.log('Suites: all, tokenizer, reconstruct, roundtrip'); +}; + +const benchmark = (fn, input, minMs, warmupRuns) => { + for (let i = 0; i < warmupRuns; i++) { + fn(input); + } + + let iterations = 0; + let elapsedMs = 0; + const start = performance.now(); + + // Sample elapsed time periodically so the timer itself does not dominate fast cases. + do { + fn(input); + iterations++; + if ((iterations & 0x3ff) === 0) { + elapsedMs = performance.now() - start; + } + } while (elapsedMs < minMs); + + const totalMs = performance.now() - start; + return { + iterations, + ms: totalMs, + opsPerSec: iterations / (totalMs / 1000), + }; +}; + +const formatNumber = (value, fractionDigits) => + value.toLocaleString('en-US', { + minimumFractionDigits: fractionDigits, + maximumFractionDigits: fractionDigits, + }); + +const truncate = (pattern, maxLength) => + pattern.length <= maxLength ? pattern : `${pattern.slice(0, maxLength - 3)}...`; + +const printSuite = (name, rows) => { + console.log(`${name}:`); + console.log('case'.padEnd(16) + 'len'.padStart(6) + 'ops/s'.padStart(15) + 'iterations'.padStart(14) + ' sample'); + + for (const row of rows) { + console.log( + row.name.padEnd(16) + + String(row.length).padStart(6) + + formatNumber(row.opsPerSec, 2).padStart(15) + + formatNumber(row.iterations, 0).padStart(14) + + ` ${truncate(row.label, 72)}`, + ); + } + + const averageOpsPerSec = rows.reduce((sum, row) => sum + row.opsPerSec, 0) / rows.length; + console.log(`average ops/s: ${formatNumber(averageOpsPerSec, 2)}`); + console.log(''); +}; + +const main = () => { + const options = parseArgs(process.argv.slice(2)); + + // Reconstruct benchmarks reuse one token tree per pattern so they measure the + // serializer itself instead of folding tokenizer time back into the result. + const tokenCases = CASES.map(({ name, pattern }) => ({ + name, + label: pattern, + length: pattern.length, + input: tokenizer(pattern), + })); + + const suites = { + tokenizer: { + cases: CASES.map(({ name, pattern }) => ({ + name, + label: pattern, + length: pattern.length, + input: pattern, + })), + fn: tokenizer, + }, + reconstruct: { + cases: tokenCases, + fn: reconstruct, + }, + roundtrip: { + cases: CASES.map(({ name, pattern }) => ({ + name, + label: pattern, + length: pattern.length, + input: pattern, + })), + fn: (pattern) => reconstruct(tokenizer(pattern)), + }, + }; + + const suiteNames = options.suite === 'all' ? SUITE_NAMES : [options.suite]; + + for (const suiteName of suiteNames) { + const suite = suites[suiteName]; + const rows = suite.cases.map((testCase) => { + const result = benchmark(suite.fn, testCase.input, options.minMs, options.warmupRuns); + return { + name: testCase.name, + label: testCase.label, + length: testCase.length, + opsPerSec: result.opsPerSec, + iterations: result.iterations, + }; + }); + + printSuite(suiteName, rows); + } +}; + +try { + main(); +} catch (error) { + console.error(error.message); + process.exit(1); +} diff --git a/package.json b/package.json index 8c46562..aea7a7d 100644 --- a/package.json +++ b/package.json @@ -21,6 +21,7 @@ "scripts": { "test": "nyc --extension .ts --reporter=lcov --reporter=text-summary vows -- --spec test/*-test.js", "build": "tsc", + "bench": "npm run build && node benchmark/index.js", "prepare": "tsc", "lint": "eslint ./lib ./test", "lint:fix": "eslint --fix ./lib ./test" From fe4865bb82f366a5f962f1820dc3c187875daa87 Mon Sep 17 00:00:00 2001 From: Stanislav Modrak Date: Fri, 20 Mar 2026 13:58:31 +0000 Subject: [PATCH 3/3] simplify --- lib/tokenizer.ts | 321 ++++++++++++++++++----------------------------- 1 file changed, 122 insertions(+), 199 deletions(-) diff --git a/lib/tokenizer.ts b/lib/tokenizer.ts index dddcd81..099f3c2 100644 --- a/lib/tokenizer.ts +++ b/lib/tokenizer.ts @@ -2,13 +2,11 @@ import { Group, types, Root, Token, Reference, Char, Position, SetTokens, Set as import * as sets from './sets'; type ReferenceQueue = { reference: (Reference | Char), stack: Token[], index: number }[]; -type DecodedEscape = { value: number, nextIndex: number } | null; -type ParsedNumber = { value: number, nextIndex: number } | null; +type ReadResult = { result: T, nextIndex: number }; +type MaybeReadResult = ReadResult | null; type EscapedToken = Char | Position | Reference | SetToken; -type EscapedTokenResult = { token: EscapedToken, nextIndex: number }; type ClassToken = Char | SetToken; -type ClassTokenResult = { token: ClassToken, nextIndex: number }; -type RepetitionBounds = { min: number, max: number, nextIndex: number } | null; +type RepetitionBounds = { min: number, max: number }; /** * Valid opening characters for capture group names. @@ -29,7 +27,10 @@ const isUpperHexCode = (charCode: number) => const hexValue = (charCode: number) => charCode <= 57 ? charCode - 48 : charCode - 55; -const readNumber = (str: string, index: number): ParsedNumber => { +const syntaxError = (regexpStr: string, message: string) => + new SyntaxError(`Invalid regular expression: /${regexpStr}/: ${message}`); + +const readNumber = (str: string, index: number): MaybeReadResult => { if (!isDigitCode(str.charCodeAt(index))) { return null; } @@ -41,72 +42,56 @@ const readNumber = (str: string, index: number): ParsedNumber => { nextIndex++; } - return { value, nextIndex }; + return { result: value, nextIndex }; }; -const readEscapedChar = (str: string, slashIndex: number): DecodedEscape => { - const escapeType = str[slashIndex + 1]; - - switch (escapeType) { - case 'u': { - if (slashIndex + 5 >= str.length) { - return null; - } +const readHex = (str: string, index: number, length: number): MaybeReadResult => { + if (index + length > str.length) { + return null; + } - let value = 0; - for (let index = slashIndex + 2; index <= slashIndex + 5; index++) { - const charCode = str.charCodeAt(index); - if (!isUpperHexCode(charCode)) { - return null; - } - value = value * 16 + hexValue(charCode); - } - return { value, nextIndex: slashIndex + 6 }; + let value = 0; + for (let i = index; i < index + length; i++) { + const charCode = str.charCodeAt(i); + if (!isUpperHexCode(charCode)) { + return null; } + value = value * 16 + hexValue(charCode); + } - case 'x': { - if (slashIndex + 3 >= str.length) { - return null; - } + return { result: value, nextIndex: index + length }; +}; - let value = 0; - for (let index = slashIndex + 2; index <= slashIndex + 3; index++) { - const charCode = str.charCodeAt(index); - if (!isUpperHexCode(charCode)) { - return null; - } - value = value * 16 + hexValue(charCode); - } - return { value, nextIndex: slashIndex + 4 }; - } +const readEscapedChar = (str: string, slashIndex: number): MaybeReadResult => { + switch (str[slashIndex + 1]) { + case 'u': + return readHex(str, slashIndex + 2, 4); - case 'c': { - const ctrlChar = str[slashIndex + 2]; - if (!ctrlChar) { - return null; - } + case 'x': + return readHex(str, slashIndex + 2, 2); - const value = CTRL.indexOf(ctrlChar); - return value === -1 ? null : { value, nextIndex: slashIndex + 3 }; + case 'c': { + const value = CTRL.indexOf(str[slashIndex + 2]); + return value === -1 ? null : { result: value, nextIndex: slashIndex + 3 }; } case '0': - return { value: 0, nextIndex: slashIndex + 2 }; + return { result: 0, nextIndex: slashIndex + 2 }; case 't': - return { value: 9, nextIndex: slashIndex + 2 }; + return { result: 9, nextIndex: slashIndex + 2 }; case 'n': - return { value: 10, nextIndex: slashIndex + 2 }; + return { result: 10, nextIndex: slashIndex + 2 }; case 'v': - return { value: 11, nextIndex: slashIndex + 2 }; + return { result: 11, nextIndex: slashIndex + 2 }; case 'f': - return { value: 12, nextIndex: slashIndex + 2 }; + return { result: 12, nextIndex: slashIndex + 2 }; case 'r': - return { value: 13, nextIndex: slashIndex + 2 }; + return { result: 13, nextIndex: slashIndex + 2 }; default: return null; @@ -132,22 +117,24 @@ const readSet = (escapedType: string): SetToken | null => { } }; -const readEscapedToken = (str: string, slashIndex: number, inClass: boolean): EscapedTokenResult => { +function readEscapedToken(str: string, slashIndex: number, inClass: true): ReadResult; +function readEscapedToken(str: string, slashIndex: number, inClass: false): ReadResult; +function readEscapedToken(str: string, slashIndex: number, inClass: boolean): ReadResult { const escapedType = str[slashIndex + 1]; const setToken = readSet(escapedType); if (setToken) { - return { token: setToken, nextIndex: slashIndex + 2 }; + return { result: setToken, nextIndex: slashIndex + 2 }; } if (inClass) { // Character classes treat \b as backspace instead of a word boundary. if (escapedType === 'b') { - return { token: { type: types.CHAR, value: 8 }, nextIndex: slashIndex + 2 }; + return { result: { type: types.CHAR, value: 8 }, nextIndex: slashIndex + 2 }; } } else { if (escapedType === 'b' || escapedType === 'B') { return { - token: { type: types.POSITION, value: escapedType }, + result: { type: types.POSITION, value: escapedType }, nextIndex: slashIndex + 2, }; } @@ -158,7 +145,7 @@ const readEscapedToken = (str: string, slashIndex: number, inClass: boolean): Es const reference = readNumber(str, slashIndex + 1); if (reference) { return { - token: { type: types.REFERENCE, value: reference.value }, + result: { type: types.REFERENCE, value: reference.result }, nextIndex: reference.nextIndex, }; } @@ -167,21 +154,21 @@ const readEscapedToken = (str: string, slashIndex: number, inClass: boolean): Es const decoded = readEscapedChar(str, slashIndex); return { - token: { + result: { type: types.CHAR, - value: decoded ? decoded.value : str.charCodeAt(slashIndex + 1), + value: decoded ? decoded.result : str.charCodeAt(slashIndex + 1), }, nextIndex: decoded ? decoded.nextIndex : slashIndex + 2, }; -}; +} -const readClassToken = (str: string, index: number): ClassTokenResult => { +const readClassToken = (str: string, index: number): ReadResult => { if (str[index] === '\\') { - return readEscapedToken(str, index, true) as ClassTokenResult; + return readEscapedToken(str, index, true); } return { - token: { type: types.CHAR, value: str.charCodeAt(index) }, + result: { type: types.CHAR, value: str.charCodeAt(index) }, nextIndex: index + 1, }; }; @@ -200,41 +187,40 @@ const tokenizeClassAt = (str: string, index: number, regexpStr: string): [SetTok } const first = readClassToken(str, i); - const nextChar = str[first.nextIndex]; if ( - first.token.type === types.CHAR && - nextChar === '-' && + first.result.type === types.CHAR && + str[first.nextIndex] === '-' && first.nextIndex + 1 < str.length && str[first.nextIndex + 1] !== ']' ) { // Only char-to-char pairs become ranges; predefined sets keep the dash literal. const second = readClassToken(str, first.nextIndex + 1); - if (second.token.type === types.CHAR) { + if (second.result.type === types.CHAR) { tokens.push({ type: types.RANGE, - from: first.token.value, - to: second.token.value, + from: first.result.value, + to: second.result.value, }); i = second.nextIndex; continue; } } - tokens.push(first.token); + tokens.push(first.result); i = first.nextIndex; } - throw new SyntaxError(`Invalid regular expression: /${regexpStr}/: Unterminated character class`); + throw syntaxError(regexpStr, 'Unterminated character class'); }; -const readRepetition = (str: string, index: number): RepetitionBounds => { +const readRepetition = (str: string, index: number): MaybeReadResult => { const min = readNumber(str, index); if (!min) { return null; } if (str[min.nextIndex] === '}') { - return { min: min.value, max: min.value, nextIndex: min.nextIndex + 1 }; + return { result: { min: min.result, max: min.result }, nextIndex: min.nextIndex + 1 }; } if (str[min.nextIndex] !== ',') { @@ -243,7 +229,7 @@ const readRepetition = (str: string, index: number): RepetitionBounds => { // `{n,}` is open-ended, while `{n,m}` requires a second parsed integer. if (str[min.nextIndex + 1] === '}') { - return { min: min.value, max: Infinity, nextIndex: min.nextIndex + 2 }; + return { result: { min: min.result, max: Infinity }, nextIndex: min.nextIndex + 2 }; } const max = readNumber(str, min.nextIndex + 1); @@ -251,7 +237,30 @@ const readRepetition = (str: string, index: number): RepetitionBounds => { return null; } - return { min: min.value, max: max.value, nextIndex: max.nextIndex + 1 }; + return { result: { min: min.result, max: max.result }, nextIndex: max.nextIndex + 1 }; +}; + +const readGroupName = (str: string, index: number, regexpStr: string): ReadResult => { + if (!captureGroupFirstChar.test(str[index])) { + throw syntaxError( + regexpStr, + `Invalid capture group name, character '${str[index]}' after '<' at column ${index + 1}`, + ); + } + + let name = str[index++]; + while (index < str.length && captureGroupChars.test(str[index])) { + name += str[index++]; + } + + if (str[index] !== '>') { + throw syntaxError( + regexpStr, + `Unclosed capture group name, expected '>', found '${str[index]}' at column ${index + 1}`, + ); + } + + return { result: name, nextIndex: index + 1 }; }; /** @@ -272,12 +281,21 @@ export const tokenizer = (regexpStr: string): Root => { let referenceQueue: ReferenceQueue = []; let groupCount = 0; - const repeatErr = (col: number) => { - throw new SyntaxError( - `Invalid regular expression: /${ - regexpStr - }/: Nothing to repeat at column ${col - 1}`, - ); + const repeatErr = (col: number): never => { + throw syntaxError(regexpStr, `Nothing to repeat at column ${col - 1}`); + }; + + const pushRepetition = (min: number, max: number, col: number) => { + if (last.length === 0) { + repeatErr(col); + } + + last.push({ + type: types.REPETITION, + min, + max, + value: last.pop()!, + }); }; const str = regexpStr; @@ -288,16 +306,12 @@ export const tokenizer = (regexpStr: string): Root => { // Handle escaped characters, inclues a few sets. case '\\': { if (i === str.length) { - throw new SyntaxError( - `Invalid regular expression: /${ - regexpStr - }/: \\ at end of pattern`, - ); + throw syntaxError(regexpStr, '\\ at end of pattern'); } const escaped = readEscapedToken(str, i - 1, false); - last.push(escaped.token); - if (escaped.token.type === types.REFERENCE) { - referenceQueue.push({ reference: escaped.token, stack: last, index: last.length - 1 }); + last.push(escaped.result); + if (escaped.result.type === types.REFERENCE) { + referenceQueue.push({ reference: escaped.result, stack: last, index: last.length - 1 }); } i = escaped.nextIndex; @@ -307,27 +321,19 @@ export const tokenizer = (regexpStr: string): Root => { // Positionals. case '^': - last.push({ type: types.POSITION, value: '^' }); - break; - case '$': - last.push({ type: types.POSITION, value: '$' }); + last.push({ type: types.POSITION, value: c }); break; // Handle custom sets. case '[': { // Check if this class is 'anti' i.e. [^abc]. - let not; - if (str[i] === '^') { - not = true; - i++; - } else { - not = false; - } + const not = str[i] === '^'; + if (not) i++; // Get all the characters in class. - let classTokens = tokenizeClassAt(str, i, regexpStr); + const classTokens = tokenizeClassAt(str, i, regexpStr); i = classTokens[1]; last.push({ type: types.SET, @@ -369,54 +375,13 @@ export const tokenizer = (regexpStr: string): Root => { group.notFollowedBy = true; group.remember = false; } else if (c === '<') { - let name = ''; - - if (captureGroupFirstChar.test(str[i])) { - name += str[i]; - i++; - } else { - throw new SyntaxError( - `Invalid regular expression: /${ - regexpStr - }/: Invalid capture group name, character '${str[i]}'` + - ` after '<' at column ${i + 1}`, - ); - } - - while (i < str.length && captureGroupChars.test(str[i])) { - name += str[i]; - i++; - } - - if (!name) { - throw new SyntaxError( - `Invalid regular expression: /${ - regexpStr - }/: Invalid capture group name, character '${str[i]}'` + - ` after '<' at column ${i + 1}`, - ); - } - - if (str[i] !== '>') { - throw new SyntaxError( - `Invalid regular expression: /${ - regexpStr - }/: Unclosed capture group name, expected '>', found` + - ` '${str[i]}' at column ${i + 1}`, - ); - } - - group.name = name; - i++; + const name = readGroupName(str, i, regexpStr); + group.name = name.result; + i = name.nextIndex; } else if (c === ':') { group.remember = false; } else { - throw new SyntaxError( - `Invalid regular expression: /${ - regexpStr - }/: Invalid group, character '${c}'` + - ` after '?' at column ${i - 1}`, - ); + throw syntaxError(regexpStr, `Invalid group, character '${c}' after '?' at column ${i - 1}`); } } else { groupCount += 1; @@ -439,11 +404,7 @@ export const tokenizer = (regexpStr: string): Root => { // Pop group out of stack. case ')': if (groupStack.length === 0) { - throw new SyntaxError( - `Invalid regular expression: /${ - regexpStr - }/: Unmatched ) at column ${i - 1}`, - ); + throw syntaxError(regexpStr, `Unmatched ) at column ${i - 1}`); } lastGroup = groupStack.pop(); @@ -465,7 +426,7 @@ export const tokenizer = (regexpStr: string): Root => { delete lastGroup.stack; } // Create a new stack and add to options for rest of clause. - let stack: Token[] = []; + const stack: Token[] = []; lastGroup.options.push(stack); last = stack; @@ -479,65 +440,31 @@ export const tokenizer = (regexpStr: string): Root => { // This design is chosen because there could be more than // one repetition symbols in a regex i.e. `a?+{2,3}`. case '{': { - let repetition = readRepetition(str, i); - if (repetition !== null) { - if (last.length === 0) { - repeatErr(i); - } - i = repetition.nextIndex; - - last.push({ - type: types.REPETITION, - min: repetition.min, - max: repetition.max, - value: last.pop(), - }); - } else { + const repetition = readRepetition(str, i); + if (!repetition) { last.push({ type: types.CHAR, value: 123, }); + break; } + pushRepetition(repetition.result.min, repetition.result.max, i); + i = repetition.nextIndex; + break; } case '?': - if (last.length === 0) { - repeatErr(i); - } - last.push({ - type: types.REPETITION, - min: 0, - max: 1, - value: last.pop(), - }); + pushRepetition(0, 1, i); break; case '+': - if (last.length === 0) { - repeatErr(i); - } - last.push({ - type: types.REPETITION, - min: 1, - max: Infinity, - value: last.pop(), - }); - + pushRepetition(1, Infinity, i); break; case '*': - if (last.length === 0) { - repeatErr(i); - } - last.push({ - type: types.REPETITION, - min: 0, - max: Infinity, - value: last.pop(), - }); - + pushRepetition(0, Infinity, i); break; @@ -552,11 +479,7 @@ export const tokenizer = (regexpStr: string): Root => { // Check if any groups have not been closed. if (groupStack.length !== 0) { - throw new SyntaxError( - `Invalid regular expression: /${ - regexpStr - }/: Unterminated group`, - ); + throw syntaxError(regexpStr, 'Unterminated group'); } updateReferences(referenceQueue, groupCount);