From 41606cdd64a5ededd55ac4f0939ba73049f980ee Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Michael=20Vo=C5=99=C3=AD=C5=A1ek?=
Date: Wed, 12 Jun 2024 10:10:19 +0200
Subject: [PATCH] Build optimized regex from string list

-20% improved runtime
---
 src/Tokenizer.php | 126 +++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 97 insertions(+), 29 deletions(-)

diff --git a/src/Tokenizer.php b/src/Tokenizer.php
index ee2ed86..08c4cb9 100644
--- a/src/Tokenizer.php
+++ b/src/Tokenizer.php
@@ -4,19 +4,21 @@
 
 namespace Doctrine\SqlFormatter;
 
-use function array_combine;
-use function array_keys;
 use function array_map;
-use function arsort;
-use function assert;
-use function implode;
+use function array_unique;
+use function count;
+use function is_int;
 use function preg_match;
 use function preg_quote;
+use function reset;
 use function str_replace;
+use function str_starts_with;
+use function strcmp;
 use function strlen;
 use function strpos;
 use function strtoupper;
 use function substr;
+use function usort;
 
 /** @internal */
 final class Tokenizer
@@ -762,31 +764,12 @@ final class Tokenizer
      */
     public function __construct()
     {
-        // Sort list from longest word to shortest, 3x faster than usort
-        $sortByLengthFx = static function ($values) {
-            $valuesMap = array_combine($values, array_map(strlen(...), $values));
-            assert($valuesMap !== false);
-            arsort($valuesMap);
-
-            return array_keys($valuesMap);
-        };
-
-        $buildRegexFromListFx = static function ($values) use ($sortByLengthFx) {
-            return '(?>' . implode(
-                '|',
-                array_map(
-                    static fn ($v) => preg_quote($v, '/'),
-                    $sortByLengthFx($values),
-                ),
-            ) . ')';
-        };
-
         // Set up regular expressions
-        $regexBoundaries       = $buildRegexFromListFx($this->boundaries);
-        $regexReserved         = $buildRegexFromListFx($this->reserved);
-        $regexReservedToplevel = str_replace(' ', '\s+', $buildRegexFromListFx($this->reservedToplevel));
-        $regexReservedNewline  = str_replace(' ', '\s+', $buildRegexFromListFx($this->reservedNewline));
-        $regexFunction         = $buildRegexFromListFx($this->functions);
+        $regexBoundaries       = $this->makeRegexFromList($this->boundaries);
+        $regexReserved         = $this->makeRegexFromList($this->reserved);
+        $regexReservedToplevel = str_replace(' ', '\s+', $this->makeRegexFromList($this->reservedToplevel));
+        $regexReservedNewline  = str_replace(' ', '\s+', $this->makeRegexFromList($this->reservedNewline));
+        $regexFunction         = $this->makeRegexFromList($this->functions);
 
         $this->nextTokenRegexNumber            = '/\G(?:\d+(?:\.\d+)?|0x[\da-fA-F]+|0b[01]+)(?=$|\s|"\'`|' . $regexBoundaries . ')/';
         $this->nextTokenRegexBoundaryCharacter = '/\G' . $regexBoundaries . '/';
@@ -797,6 +780,91 @@ public function __construct()
         $this->nextTokenRegexNonReserved       = '/\G.*?(?=$|\s|["\'`]|' . $regexBoundaries . ')/';
     }
 
+    /**
+     * Make regex from a list of values matching longest value first.
+     *
+     * Values sharing a leading byte are merged into one branch (the shared
+     * prefix followed by a nested group of the remaining suffixes), so the
+     * branches of a group start with different bytes and each is tried at
+     * most once, see https://github.com/PCRE2Project/pcre2/issues/411 .
+     *
+     * @param list<string> $values
+     * @param bool         $sorted true when $values is already unique and in the
+     *                             order produced by this method (used by the
+     *                             recursive calls for the suffix lists)
+     *
+     * @return string an atomic group "(?>...)" without delimiters; the values
+     *                are quoted for use with the "/" delimiter
+     */
+    private function makeRegexFromList(array $values, bool $sorted = false): string
+    {
+        if (! $sorted) {
+            // Equal values would be reduced to several empty suffixes, which can
+            // not be split any further and would make the recursion endless.
+            $values = array_unique($values);
+
+            // Order bytewise, but place a value before each of its own prefixes
+            // so that the longest alternative is matched first. strcmp() is used
+            // because <=> compares numeric strings by their numeric value.
+            usort($values, static function (string $a, string $b): int {
+                return str_starts_with($a, $b) || str_starts_with($b, $a)
+                    ? strlen($b) <=> strlen($a)
+                    : strcmp($a, $b);
+            });
+        }
+
+        /** @var array<int|string, list<string>> $valuesBySharedPrefix */
+        $valuesBySharedPrefix = [];
+        $items                = [];
+        $prefix               = null;
+
+        // Group neighbouring values by their first byte and remember the longest
+        // prefix shared by the whole group.
+        foreach ($values as $v) {
+            if ($prefix !== null && ! str_starts_with($v, substr($prefix, 0, 1))) {
+                $valuesBySharedPrefix[$prefix] = $items;
+                $items                         = [];
+                $prefix                        = null;
+            }
+
+            $items[] = $v;
+
+            if ($prefix === null) {
+                $prefix = $v;
+            } else {
+                // Shorten the prefix until it is a prefix of the new value too.
+                while (! str_starts_with($v, $prefix)) {
+                    $prefix = substr($prefix, 0, -1);
+                }
+            }
+        }
+
+        if ($items !== []) {
+            $valuesBySharedPrefix[$prefix] = $items;
+        }
+
+        $regex = '(?>';
+
+        foreach ($valuesBySharedPrefix as $prefix => $items) {
+            if ($regex !== '(?>') {
+                $regex .= '|';
+            }
+
+            // PHP converts numeric-looking array keys such as "1" to int.
+            if (is_int($prefix)) {
+                $prefix = (string) $prefix;
+            }
+
+            $regex .= preg_quote($prefix, '/');
+
+            $regex .= count($items) === 1
+                ? preg_quote(substr(reset($items), strlen($prefix)), '/')
+                : $this->makeRegexFromList(array_map(static fn ($v) => substr($v, strlen($prefix)), $items), true);
+        }
+
+        return $regex . ')';
+    }
+
     /**
      * Takes a SQL string and breaks it into tokens.
      * Each token is an associative array with type and value.