From 19976f47aee05b991ed1c171d9cd4cf9a66bd9d3 Mon Sep 17 00:00:00 2001 From: Roman Parpalak Date: Sat, 23 Mar 2024 16:37:43 +0200 Subject: [PATCH] Fixed highlighting of found words with hyphen. --- src/S2/Rose/Entity/ResultItem.php | 39 +------- src/S2/Rose/Snippet/SnippetBuilder.php | 42 +------- src/S2/Rose/Snippet/WordsByStemsExtractor.php | 98 +++++++++++++++++++ tests/unit/Rose/SnippetsTest.php | 14 ++- 4 files changed, 115 insertions(+), 78 deletions(-) create mode 100755 src/S2/Rose/Snippet/WordsByStemsExtractor.php diff --git a/src/S2/Rose/Entity/ResultItem.php b/src/S2/Rose/Entity/ResultItem.php index 6be17c2..297106f 100644 --- a/src/S2/Rose/Entity/ResultItem.php +++ b/src/S2/Rose/Entity/ResultItem.php @@ -10,7 +10,7 @@ use S2\Rose\Entity\Metadata\SnippetSource; use S2\Rose\Exception\InvalidArgumentException; use S2\Rose\Exception\RuntimeException; -use S2\Rose\Stemmer\IrregularWordsStemmerInterface; +use S2\Rose\Snippet\WordsByStemsExtractor; use S2\Rose\Stemmer\StemmerInterface; class ResultItem @@ -165,43 +165,10 @@ public function getHighlightedTitle(StemmerInterface $stemmer): string throw new InvalidArgumentException('Highlight template must contain "%s" substring for sprintf() function.'); } - $stems = $this->foundWords; - $stemsForRegex = $stems; - if ($stemmer instanceof IrregularWordsStemmerInterface) { - $stems = array_merge($stems, $stemmer->irregularWordsFromStems($this->foundWords)); - - $regexRules = $stemmer->getRegexTransformationRules(); - $stemsForRegex = array_map(static fn(string $stem): string => preg_replace( - array_keys($regexRules), - array_values($regexRules), - $stem - ), $stems); - } - - $joinedStems = implode('|', $stemsForRegex); + $extractor = new WordsByStemsExtractor($stemmer, $this->foundWords); - // Check the text for the query words - // TODO: Make sure the modifier S works correct on cyrillic - preg_match_all( - '#(?<=[^\\p{L}]|^)(' . $joinedStems . ')\\p{L}*#Ssui', - $this->title, - $matches, - PREG_OFFSET_CAPTURE - ); + [$foundWords,] = $extractor->extract($this->title); - $foundWords = []; - foreach ($matches[0] as $i => $wordInfo) { - $word = $wordInfo[0]; - $stemEqualsWord = ($wordInfo[0] === $matches[1][$i][0]); - $stemmedWord = $stemmer->stemWord($word); - - // Ignore entry if the word stem differs from needed ones - if (!$stemEqualsWord && !\in_array($stemmedWord, $this->foundWords, true)) { - continue; - } - - $foundWords[$word] = 1; - } $snippetLine = new SnippetLine( $this->title, diff --git a/src/S2/Rose/Snippet/SnippetBuilder.php b/src/S2/Rose/Snippet/SnippetBuilder.php index 5a7fbd7..1d61f84 100644 --- a/src/S2/Rose/Snippet/SnippetBuilder.php +++ b/src/S2/Rose/Snippet/SnippetBuilder.php @@ -79,48 +79,10 @@ public function buildSnippet(array $foundPositionsByStems, string $highlightTemp return $snippet; } - $stemsForRegex = $stems; - if ($this->stemmer instanceof IrregularWordsStemmerInterface) { - $stems = array_merge($stems, $this->stemmer->irregularWordsFromStems($stems)); - - $regexRules = $this->stemmer->getRegexTransformationRules(); - $regexRules['#\\.#'] = '\\.'; // escaping dot in the following preg_match_all() call - $stemsForRegex = array_map(static fn(string $stem): string => preg_replace( - array_keys($regexRules), - array_values($regexRules), - $stem - ), $stems); - } - - $joinedStems = implode('|', $stemsForRegex); + $extractor = new WordsByStemsExtractor($this->stemmer, $stems); foreach ($snippetSources as $snippetSource) { - // Check the text for the query words - // NOTE: Make sure the modifier S works correct on cyrillic - // TODO: After implementing formatting this regex became a set of crutches. - // One has to break the snippets into words, clear formatting, convert words to stems - // and detect what stems has been found. Then highlight the original text based on words source offset. - preg_match_all( - '#(?<=[^\\p{L}]|^|\\\\[' . StringHelper::FORMATTING_SYMBOLS . '])(' . $joinedStems . ')\\p{L}*#Ssui', - $snippetSource->getText(), - $matches, - PREG_OFFSET_CAPTURE - ); - - $foundWords = $foundStems = []; - foreach ($matches[0] as $i => $wordInfo) { - $word = $wordInfo[0]; - $stemEqualsWord = ($wordInfo[0] === $matches[1][$i][0]); - $stemmedWord = $this->stemmer->stemWord($word); - - // Ignore entry if the word stem differs from needed ones - if (!$stemEqualsWord && !\in_array($stemmedWord, $stems, true)) { - continue; - } - - $foundWords[$word] = 1; - $foundStems[$stemmedWord] = 1; - } + [$foundWords, $foundStems] = $extractor->extract($snippetSource->getText()); if (\count($foundWords) === 0) { continue; diff --git a/src/S2/Rose/Snippet/WordsByStemsExtractor.php b/src/S2/Rose/Snippet/WordsByStemsExtractor.php new file mode 100755 index 0000000..d5728e4 --- /dev/null +++ b/src/S2/Rose/Snippet/WordsByStemsExtractor.php @@ -0,0 +1,98 @@ +stemmer = $stemmer; + $this->stems = $stems; + + $stemsForRegex = $stems; + if ($stemmer instanceof IrregularWordsStemmerInterface) { + $stems = array_merge($stems, $stemmer->irregularWordsFromStems($stems)); + + $regexRules = $stemmer->getRegexTransformationRules(); + $regexRules['#\\.#'] = '\\.'; // escaping dot in the following preg_match_all() call + $stemsForRegex = array_map(static fn(string $stem): string => preg_replace( + array_keys($regexRules), + array_values($regexRules), + $stem + ), $stems); + } + + $this->joinedStems = implode('|', $stemsForRegex); + } + + public function extract(string $text): array + { + // Check the text for the query words + // NOTE: Make sure the modifier S works correct on cyrillic + // TODO: After implementing formatting this regex became a set of crutches. + // One has to break the snippets into words, clear formatting, convert words to stems + // and detect what stems have been found. Then highlight the original text based on words source offset. + preg_match_all( + '#(?<=[^\\p{L}-]|^|\\\\[' . StringHelper::FORMATTING_SYMBOLS . '])(' . $this->joinedStems . ')[\\p{L}-]*#Ssui', + $text, + $matches, + PREG_OFFSET_CAPTURE + ); + + $foundWords = $foundStems = []; + foreach ($matches[0] as $i => $wordInfo) { + foreach ($this->getWords($wordInfo[0]) as $word) { + $stemEqualsWord = ($word === $matches[1][$i][0]); + $stemmedWord = $this->stemmer->stemWord($word); + + // Ignore entry if the word stem differs from needed ones + if (!$stemEqualsWord && !\in_array($stemmedWord, $this->stems, true)) { + continue; + } + + $foundWords[$word] = 1; + $foundStems[$stemmedWord] = 1; + } + } + + return [$foundWords, $foundStems]; + } + + /** + * If there is no hyphen in the word, use it as the found word. + * If the word contains a hyphen, besides checking the entire word, + * check each fragment for a match with the searched stem. + * + * @param string $text + * @return string[] + */ + private function getWords(string $text): array + { + if (strpos($text, '-') === false) { + return [$text]; + } + + return array_merge(explode('-', $text), [$text]); + } +} diff --git a/tests/unit/Rose/SnippetsTest.php b/tests/unit/Rose/SnippetsTest.php index 7a1a029..c701b8e 100644 --- a/tests/unit/Rose/SnippetsTest.php +++ b/tests/unit/Rose/SnippetsTest.php @@ -190,8 +190,17 @@ public function testSnippets(array $indexables) $resultSet = $this->finder->find(new Query('астатически дает')); $this->assertEquals( 'Ошибка астатически даёт более простую систему дифференциальных уравнений, если исключить небольшой угол тангажа. Если пренебречь малыми величинами, то видно, что механическая природа устойчиво требует большего внимания к анализу ошибок, которые даёт устойчивый маховик.', - $resultSet->getItems()[0]->getFormattedSnippet(), - 'Stemmer trims incorrectly подсистем to подсист. Check that this incorrect behaviour is handled without bugs.' + $resultSet->getItems()[0]->getFormattedSnippet() + ); + + $resultSet = $this->finder->find(new Query('Об одной из ошибок в веб-дизайне')); + $this->assertEquals( + 'Об одной из ошибок в веб-дизайне', + $resultSet->getItems()[0]->getHighlightedTitle($this->stemmer) + ); + $this->assertEquals( + 'Одна из часто указываемых ошибок в веб-дизайне:', + $resultSet->getItems()[0]->getFormattedSnippet() ); } @@ -243,6 +252,7 @@ public function indexableProvider()

Уравнение возмущенного движения поступательно характеризует подвижный объект. Прецессия гироскопа косвенно интегрирует нестационарный вектор угловой скорости, изменяя направление движения. Угловая скорость, обобщая изложенное, неподвижно не входит своими составляющими, что очевидно, в силы нормальных реакций связей, так же как и кожух. Динамическое уравнение Эйлера, в силу третьего закона Ньютона, вращательно связывает ньютонометр, не забывая о том, что интенсивность диссипативных сил, характеризующаяся величиной коэффициента D, должна лежать в определённых пределах. Еще 1 раз проверим, как gt работает защита против <script>alert();</script> xss-уязвимостей.

'), new Indexable('id_4', 'Мне не душно', 'Я просто не ощущаю уровень углекислого газа в воздухе. Меня не устраивает.'), + new Indexable('id_5', 'Об одной из ошибок в веб-дизайне', 'Одна из часто указываемых ошибок в веб-дизайне:'), ]; return [