Skip to content

Commit

Permalink
Merge pull request #131 from mvorisek/fix_perf2
Browse files Browse the repository at this point in the history
Improve tokenizer regex matching
  • Loading branch information
greg0ire authored Jun 22, 2024
2 parents e53096c + fda3d6e commit 9fcd0ac
Showing 1 changed file with 47 additions and 57 deletions.
104 changes: 47 additions & 57 deletions src/Tokenizer.php
Original file line number Diff line number Diff line change
Expand Up @@ -720,11 +720,13 @@ final class Tokenizer

// Regular expressions for tokenizing

private readonly string $regexBoundaries;
private readonly string $regexReserved;
private readonly string $regexReservedNewline;
private readonly string $regexReservedToplevel;
private readonly string $regexFunction;
private readonly string $nextTokenRegexNumber;
private readonly string $nextTokenRegexBoundaryCharacter;
private readonly string $nextTokenRegexReservedToplevel;
private readonly string $nextTokenRegexReservedNewline;
private readonly string $nextTokenRegexReserved;
private readonly string $nextTokenRegexFunction;
private readonly string $nextTokenRegexNonReserved;

/**
* Punctuation that can be used as a boundary between other tokens
Expand Down Expand Up @@ -769,25 +771,30 @@ public function __construct()
return array_keys($valuesMap);
};

// Set up regular expressions
$this->regexBoundaries = '(' . implode(
'|',
$this->quoteRegex($this->boundaries),
) . ')';
$this->regexReserved = '(' . implode(
'|',
$this->quoteRegex($sortByLengthFx($this->reserved)),
) . ')';
$this->regexReservedToplevel = str_replace(' ', '\s+', '(' . implode(
'|',
$this->quoteRegex($sortByLengthFx($this->reservedToplevel)),
) . ')');
$this->regexReservedNewline = str_replace(' ', '\s+', '(' . implode(
'|',
$this->quoteRegex($sortByLengthFx($this->reservedNewline)),
) . ')');
$buildRegexFromListFx = static function ($values) use ($sortByLengthFx) {
return '(?>' . implode(
'|',
array_map(
static fn ($v) => preg_quote($v, '/'),
$sortByLengthFx($values),
),
) . ')';
};

$this->regexFunction = '(' . implode('|', $this->quoteRegex($sortByLengthFx($this->functions))) . ')';
// Set up regular expressions
$regexBoundaries = $buildRegexFromListFx($this->boundaries);
$regexReserved = $buildRegexFromListFx($this->reserved);
$regexReservedToplevel = str_replace(' ', '\s+', $buildRegexFromListFx($this->reservedToplevel));
$regexReservedNewline = str_replace(' ', '\s+', $buildRegexFromListFx($this->reservedNewline));
$regexFunction = $buildRegexFromListFx($this->functions);

$this->nextTokenRegexNumber = '/\G(?:\d+(?:\.\d+)?|0x[\da-fA-F]+|0b[01]+)(?=$|\s|"\'`|' . $regexBoundaries . ')/';
$this->nextTokenRegexBoundaryCharacter = '/\G' . $regexBoundaries . '/';
$this->nextTokenRegexReservedToplevel = '/\G' . $regexReservedToplevel . '(?=$|\s|' . $regexBoundaries . ')/';
$this->nextTokenRegexReservedNewline = '/\G' . $regexReservedNewline . '(?=$|\s|' . $regexBoundaries . ')/';
$this->nextTokenRegexReserved = '/\G' . $regexReserved . '(?=$|\s|' . $regexBoundaries . ')/';
$this->nextTokenRegexFunction = '/\G' . $regexFunction . '(?=\s*\()/';
$this->nextTokenRegexNonReserved = '/\G.*?(?=$|\s|["\'`]|' . $regexBoundaries . ')/';
}

/**
Expand Down Expand Up @@ -829,7 +836,6 @@ public function tokenize(string $string): Cursor
*/
private function createNextToken(string $string, string $upper, int $offset, Token|null $previous = null): Token
{
$matches = [];
// Whitespace
if (preg_match('/\G\s+/', $string, $matches, 0, $offset)) {
return new Token(Token::TOKEN_TYPE_WHITESPACE, $matches[0]);
Expand Down Expand Up @@ -883,9 +889,9 @@ private function createNextToken(string $string, string $upper, int $offset, Tok
$value = $firstChar . $this->getNextQuotedString($string, $offset + 1);
} else {
// Non-quoted variable name
preg_match('/\G(' . $firstChar . '[\w.$]+)/', $string, $matches, 0, $offset);
preg_match('/\G[@:][\w.$]+/', $string, $matches, 0, $offset);
if ($matches) {
$value = $matches[1];
$value = $matches[0];
}
}

Expand All @@ -897,19 +903,19 @@ private function createNextToken(string $string, string $upper, int $offset, Tok
// Number (decimal, binary, or hex)
if (
preg_match(
'/\G(\d+(\.\d+)?|0x[\da-fA-F]+|0b[01]+)($|\s|"\'`|' . $this->regexBoundaries . ')/',
$this->nextTokenRegexNumber,
$string,
$matches,
0,
$offset,
)
) {
return new Token(Token::TOKEN_TYPE_NUMBER, $matches[1]);
return new Token(Token::TOKEN_TYPE_NUMBER, $matches[0]);
}

// Boundary Character (punctuation and symbols)
if (preg_match('/\G(' . $this->regexBoundaries . ')/', $string, $matches, 0, $offset)) {
return new Token(Token::TOKEN_TYPE_BOUNDARY, $matches[1]);
if (preg_match($this->nextTokenRegexBoundaryCharacter, $string, $matches, 0, $offset)) {
return new Token(Token::TOKEN_TYPE_BOUNDARY, $matches[0]);
}

// A reserved word cannot be preceded by a '.'
Expand All @@ -918,7 +924,7 @@ private function createNextToken(string $string, string $upper, int $offset, Tok
// Top Level Reserved Word
if (
preg_match(
'/\G(' . $this->regexReservedToplevel . ')($|\s|' . $this->regexBoundaries . ')/',
$this->nextTokenRegexReservedToplevel,
$upper,
$matches,
0,
Expand All @@ -927,14 +933,14 @@ private function createNextToken(string $string, string $upper, int $offset, Tok
) {
return new Token(
Token::TOKEN_TYPE_RESERVED_TOPLEVEL,
substr($string, $offset, strlen($matches[1])),
substr($string, $offset, strlen($matches[0])),
);
}

// Newline Reserved Word
if (
preg_match(
'/\G(' . $this->regexReservedNewline . ')($|\s|' . $this->regexBoundaries . ')/',
$this->nextTokenRegexReservedNewline,
$upper,
$matches,
0,
Expand All @@ -943,14 +949,14 @@ private function createNextToken(string $string, string $upper, int $offset, Tok
) {
return new Token(
Token::TOKEN_TYPE_RESERVED_NEWLINE,
substr($string, $offset, strlen($matches[1])),
substr($string, $offset, strlen($matches[0])),
);
}

// Other Reserved Word
if (
preg_match(
'/\G(' . $this->regexReserved . ')($|\s|' . $this->regexBoundaries . ')/',
$this->nextTokenRegexReserved,
$upper,
$matches,
0,
Expand All @@ -959,40 +965,24 @@ private function createNextToken(string $string, string $upper, int $offset, Tok
) {
return new Token(
Token::TOKEN_TYPE_RESERVED,
substr($string, $offset, strlen($matches[1])),
substr($string, $offset, strlen($matches[0])),
);
}
}

// A function must be succeeded by '('
// this makes it so "count(" is considered a function, but "count" alone is not
// function
if (preg_match('/\G(' . $this->regexFunction . '[(]|\s|[)])/', $upper, $matches, 0, $offset)) {
// this makes it so "count(" is considered a function, but "count" alone is not
if (preg_match($this->nextTokenRegexFunction, $upper, $matches, 0, $offset)) {
return new Token(
Token::TOKEN_TYPE_RESERVED,
substr($string, $offset, strlen($matches[1]) - 1),
substr($string, $offset, strlen($matches[0])),
);
}

// Non reserved word
preg_match('/\G(.*?)($|\s|["\'`]|' . $this->regexBoundaries . ')/', $string, $matches, 0, $offset);

return new Token(Token::TOKEN_TYPE_WORD, $matches[1]);
}
preg_match($this->nextTokenRegexNonReserved, $string, $matches, 0, $offset);

/**
* Helper function for building regular expressions for reserved words and boundary characters
*
* @param string[] $strings The strings to be quoted
*
* @return string[] The quoted strings
*/
private function quoteRegex(array $strings): array
{
return array_map(
static fn (string $string): string => preg_quote($string, '/'),
$strings,
);
return new Token(Token::TOKEN_TYPE_WORD, $matches[0]);
}

private function getNextQuotedString(string $string, int $offset): string
Expand Down

0 comments on commit 9fcd0ac

Please sign in to comment.