diff --git a/src/Tokenizer.php b/src/Tokenizer.php index 74a980e..087996b 100644 --- a/src/Tokenizer.php +++ b/src/Tokenizer.php @@ -4,8 +4,12 @@ namespace Doctrine\SqlFormatter; +use function array_key_last; use function array_map; +use function array_pop; +use function assert; use function count; +use function implode; use function is_int; use function preg_match; use function preg_quote; @@ -13,7 +17,6 @@ use function str_replace; use function str_starts_with; use function strlen; -use function strpos; use function strtoupper; use function substr; use function usort; @@ -719,15 +722,8 @@ final class Tokenizer 'YEARWEEK', ]; - // Regular expressions for tokenizing - - private readonly string $nextTokenRegexNumber; - private readonly string $nextTokenRegexBoundaryCharacter; - private readonly string $nextTokenRegexReservedToplevel; - private readonly string $nextTokenRegexReservedNewline; - private readonly string $nextTokenRegexReserved; - private readonly string $nextTokenRegexFunction; - private readonly string $nextTokenRegexNonReserved; + /** Regular expression for tokenizing. */ + private readonly string $tokenizeRegex; /** * Punctuation that can be used as a boundary between other tokens @@ -758,25 +754,11 @@ final class Tokenizer ]; /** - * Stuff that only needs to be done once. Builds regular expressions and - * sorts the reserved words. + * Stuff that only needs to be done once. Builds tokenizing regular expression. */ public function __construct() { - // Set up regular expressions - $regexBoundaries = $this->makeRegexFromList($this->boundaries); - $regexReserved = $this->makeRegexFromList($this->reserved); - $regexReservedToplevel = str_replace(' ', '\s+', $this->makeRegexFromList($this->reservedToplevel)); - $regexReservedNewline = str_replace(' ', '\s+', $this->makeRegexFromList($this->reservedNewline)); - $regexFunction = $this->makeRegexFromList($this->functions); - - $this->nextTokenRegexNumber = '/\G(?:\d+(?:\.\d+)?|0x[\da-fA-F]+|0b[01]+)(?=$|\s|"\'`|' . $regexBoundaries . ')/'; - $this->nextTokenRegexBoundaryCharacter = '/\G' . $regexBoundaries . '/'; - $this->nextTokenRegexReservedToplevel = '/\G' . $regexReservedToplevel . '(?=$|\s|' . $regexBoundaries . ')/'; - $this->nextTokenRegexReservedNewline = '/\G' . $regexReservedNewline . '(?=$|\s|' . $regexBoundaries . ')/'; - $this->nextTokenRegexReserved = '/\G' . $regexReserved . '(?=$|\s|' . $regexBoundaries . ')/'; - $this->nextTokenRegexFunction = '/\G' . $regexFunction . '(?=\s*\()/'; - $this->nextTokenRegexNonReserved = '/\G.*?(?=$|\s|["\'`]|' . $regexBoundaries . ')/'; + $this->tokenizeRegex = $this->makeTokenizeRegex($this->makeTokenizeRegexes()); } /** @@ -848,222 +830,101 @@ private function makeRegexFromList(array $values, bool $sorted = false): string return $regex . ')'; } - /** - * Takes a SQL string and breaks it into tokens. - * Each token is an associative array with type and value. - * - * @param string $string The SQL string - */ - public function tokenize(string $string): Cursor + /** @return array */ + private function makeTokenizeRegexes(): array { - $tokens = []; + // Set up regular expressions + $regexBoundaries = $this->makeRegexFromList($this->boundaries); + $regexReserved = $this->makeRegexFromList($this->reserved); + $regexReservedToplevel = str_replace(' ', '\s+', $this->makeRegexFromList($this->reservedToplevel)); + $regexReservedNewline = str_replace(' ', '\s+', $this->makeRegexFromList($this->reservedNewline)); + $regexFunction = $this->makeRegexFromList($this->functions); - $upper = strtoupper($string); - $offset = 0; - $token = null; + return [ + Token::TOKEN_TYPE_WHITESPACE => '\s+', + Token::TOKEN_TYPE_COMMENT => '(?:--|#)[^\n]*+', + Token::TOKEN_TYPE_BLOCK_COMMENT => '/\*(?:[^*]+|\*(?!/))*+(?:\*|$)(?:/|$)', + // 1. backtick quoted string using `` to escape + // 2. square bracket quoted string (SQL Server) using ]] to escape + Token::TOKEN_TYPE_BACKTICK_QUOTE => <<<'EOD' + (?>(?x) + `(?:[^`]+|`(?:`|$))*+(?:`|$) + |\[(?:[^\]]+|\](?:\]|$))*+(?:\]|$) + ) + EOD, + // 3. double quoted string using "" or \" to escape + // 4. single quoted string using '' or \' to escape + Token::TOKEN_TYPE_QUOTE => <<<'EOD' + (?>(?sx) + '(?:[^'\\]+|\\(?:.|$)|'(?:'|$))*+(?:'|$) + |"(?:[^"\\]+|\\(?:.|$)|"(?:"|$))*+(?:"|$) + ) + EOD, + // User-defined variable, possibly with quoted name + Token::TOKEN_TYPE_VARIABLE => '[@:](?:[\w.$]++|(?&t_' . Token::TOKEN_TYPE_BACKTICK_QUOTE . ')|(?&t_' . Token::TOKEN_TYPE_QUOTE . '))', + // decimal, binary, or hex + Token::TOKEN_TYPE_NUMBER => '(?:\d+(?:\.\d+)?|0x[\da-fA-F]+|0b[01]+)(?=$|\s|"\'`|' . $regexBoundaries . ')', + // punctuation and symbols + Token::TOKEN_TYPE_BOUNDARY => $regexBoundaries, + // A reserved word cannot be preceded by a '.' + // this makes it so in "mytable.from", "from" is not considered a reserved word + Token::TOKEN_TYPE_RESERVED_TOPLEVEL => '(? '(? '(? '.*?(?=$|\s|["\'`]|' . $regexBoundaries . ')', + ]; + } - // Keep processing the string until it is empty - while ($offset < strlen($string)) { - // Get the next token and the token type - $token = $this->createNextToken($string, $upper, $offset, $token); - $offset += strlen($token->value()); + /** @param array $regexes */ + private function makeTokenizeRegex(array $regexes): string + { + $parts = []; - $tokens[] = $token; + foreach ($regexes as $type => $regex) { + $parts[] = '(?' . $regex . ')'; } - return new Cursor($tokens); + return '~\G(?:' . implode('|', $parts) . ')~'; } /** - * Return the next token and token type in a SQL string. - * Quoted strings, comments, reserved words, whitespace, and punctuation - * are all their own tokens. - * - * @param string $string The SQL string - * @param string $upper The SQL string in upper case - * @param Token|null $previous The result of the previous createNextToken() call + * Takes a SQL string and breaks it into tokens. + * Each token is an associative array with type and value. * - * @return Token An associative array containing the type and value of the token. + * @param string $string The SQL string */ - private function createNextToken(string $string, string $upper, int $offset, Token|null $previous = null): Token + public function tokenize(string $string): Cursor { - // Whitespace - if (preg_match('/\G\s+/', $string, $matches, 0, $offset)) { - return new Token(Token::TOKEN_TYPE_WHITESPACE, $matches[0]); - } + $tokenizeRegex = $this->tokenizeRegex; + $upper = strtoupper($string); - $firstChar = $string[$offset]; - $secondChar = $string[$offset + 1] ?? ''; - - // Comment - if ( - $firstChar === '#' || - (($firstChar === '-' && $secondChar === '-') || - ($firstChar === '/' && $secondChar === '*')) - ) { - // Comment until end of line - if ($firstChar === '-' || $firstChar === '#') { - $last = strpos($string, "\n", $offset); - $type = Token::TOKEN_TYPE_COMMENT; - } else { // Comment until closing comment tag - $pos = strpos($string, '*/', $offset + 2); - $last = $pos !== false - ? $pos + 2 - : false; - $type = Token::TOKEN_TYPE_BLOCK_COMMENT; - } - - if ($last === false) { - $last = strlen($string); - } - - return new Token($type, substr($string, $offset, $last - $offset)); - } - - // Quoted String - if ($firstChar === '"' || $firstChar === '\'' || $firstChar === '`' || $firstChar === '[') { - return new Token( - ($firstChar === '`' || $firstChar === '[' - ? Token::TOKEN_TYPE_BACKTICK_QUOTE - : Token::TOKEN_TYPE_QUOTE), - $this->getNextQuotedString($string, $offset), - ); - } - - // User-defined Variable - if (($firstChar === '@' || $firstChar === ':') && $secondChar !== '') { - $value = null; - $type = Token::TOKEN_TYPE_VARIABLE; - - // If the variable name is quoted - if ($secondChar === '"' || $secondChar === '\'' || $secondChar === '`') { - $value = $firstChar . $this->getNextQuotedString($string, $offset + 1); - } else { - // Non-quoted variable name - preg_match('/\G[@:][\w.$]+/', $string, $matches, 0, $offset); - if ($matches) { - $value = $matches[0]; - } - } - - if ($value !== null) { - return new Token($type, $value); - } - } - - // Number (decimal, binary, or hex) - if ( - preg_match( - $this->nextTokenRegexNumber, - $string, - $matches, - 0, - $offset, - ) - ) { - return new Token(Token::TOKEN_TYPE_NUMBER, $matches[0]); - } - - // Boundary Character (punctuation and symbols) - if (preg_match($this->nextTokenRegexBoundaryCharacter, $string, $matches, 0, $offset)) { - return new Token(Token::TOKEN_TYPE_BOUNDARY, $matches[0]); - } - - // A reserved word cannot be preceded by a '.' - // this makes it so in "mytable.from", "from" is not considered a reserved word - if ($previous === null || $previous->value() !== '.') { - // Top Level Reserved Word - if ( - preg_match( - $this->nextTokenRegexReservedToplevel, - $upper, - $matches, - 0, - $offset, - ) - ) { - return new Token( - Token::TOKEN_TYPE_RESERVED_TOPLEVEL, - substr($string, $offset, strlen($matches[0])), - ); - } + $tokens = []; + $offset = 0; - // Newline Reserved Word - if ( - preg_match( - $this->nextTokenRegexReservedNewline, - $upper, - $matches, - 0, - $offset, - ) - ) { - return new Token( - Token::TOKEN_TYPE_RESERVED_NEWLINE, - substr($string, $offset, strlen($matches[0])), - ); - } + while ($offset < strlen($string)) { + // Get the next token and the token type + preg_match($tokenizeRegex, $upper, $matches, 0, $offset); + assert(($matches[0] ?? '') !== ''); - // Other Reserved Word - if ( - preg_match( - $this->nextTokenRegexReserved, - $upper, - $matches, - 0, - $offset, - ) - ) { - return new Token( - Token::TOKEN_TYPE_RESERVED, - substr($string, $offset, strlen($matches[0])), - ); + while (is_int($lastMatchesKey = array_key_last($matches))) { + array_pop($matches); } - } - // A function must be succeeded by '(' - // this makes it so "count(" is considered a function, but "count" alone is not function - if (preg_match($this->nextTokenRegexFunction, $upper, $matches, 0, $offset)) { - return new Token( - Token::TOKEN_TYPE_RESERVED, - substr($string, $offset, strlen($matches[0])), - ); - } + assert(str_starts_with($lastMatchesKey, 't_')); - // Non reserved word - preg_match($this->nextTokenRegexNonReserved, $string, $matches, 0, $offset); + /** @var Token::TOKEN_TYPE_* $tokenType */ + $tokenType = (int) substr($lastMatchesKey, 2); - return new Token(Token::TOKEN_TYPE_WORD, $matches[0]); - } + $token = new Token($tokenType, substr($string, $offset, strlen($matches[0]))); - private function getNextQuotedString(string $string, int $offset): string - { - $ret = ''; + $offset += strlen($token->value()); - // This checks for the following patterns: - // 1. backtick quoted string using `` to escape - // 2. square bracket quoted string (SQL Server) using ]] to escape - // 3. double quoted string using "" or \" to escape - // 4. single quoted string using '' or \' to escape - if ( - preg_match( - <<<'EOD' - ~\G(?>(?sx) - (?:`[^`]*(?:$|`))+ - |(?:\[[^\]]*($|\]))(?:\][^\]]*(?:$|\]))* - |(?:"[^"\\]*(?:\\.[^"\\]*)*(?:"|$))+ - |(?:'[^'\\]*(?:\\.[^'\\]*)*(?:'|$))+ - )~ - EOD, - $string, - $matches, - 0, - $offset, - ) - ) { - $ret = $matches[0]; + $tokens[] = $token; } - return $ret; + return new Cursor($tokens); } }