Skip to content

Commit

Permalink
Fixed highlighting of found words with hyphen.
Browse files Browse the repository at this point in the history
  • Loading branch information
parpalak committed Mar 23, 2024
1 parent c4ee100 commit 19976f4
Show file tree
Hide file tree
Showing 4 changed files with 115 additions and 78 deletions.
39 changes: 3 additions & 36 deletions src/S2/Rose/Entity/ResultItem.php
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
use S2\Rose\Entity\Metadata\SnippetSource;
use S2\Rose\Exception\InvalidArgumentException;
use S2\Rose\Exception\RuntimeException;
use S2\Rose\Stemmer\IrregularWordsStemmerInterface;
use S2\Rose\Snippet\WordsByStemsExtractor;
use S2\Rose\Stemmer\StemmerInterface;

class ResultItem
Expand Down Expand Up @@ -165,43 +165,10 @@ public function getHighlightedTitle(StemmerInterface $stemmer): string
throw new InvalidArgumentException('Highlight template must contain "%s" substring for sprintf() function.');
}

$stems = $this->foundWords;
$stemsForRegex = $stems;
if ($stemmer instanceof IrregularWordsStemmerInterface) {
$stems = array_merge($stems, $stemmer->irregularWordsFromStems($this->foundWords));

$regexRules = $stemmer->getRegexTransformationRules();
$stemsForRegex = array_map(static fn(string $stem): string => preg_replace(
array_keys($regexRules),
array_values($regexRules),
$stem
), $stems);
}

$joinedStems = implode('|', $stemsForRegex);
$extractor = new WordsByStemsExtractor($stemmer, $this->foundWords);

// Check the text for the query words
// TODO: Make sure the modifier S works correct on cyrillic
preg_match_all(
'#(?<=[^\\p{L}]|^)(' . $joinedStems . ')\\p{L}*#Ssui',
$this->title,
$matches,
PREG_OFFSET_CAPTURE
);
[$foundWords,] = $extractor->extract($this->title);

$foundWords = [];
foreach ($matches[0] as $i => $wordInfo) {
$word = $wordInfo[0];
$stemEqualsWord = ($wordInfo[0] === $matches[1][$i][0]);
$stemmedWord = $stemmer->stemWord($word);

// Ignore entry if the word stem differs from needed ones
if (!$stemEqualsWord && !\in_array($stemmedWord, $this->foundWords, true)) {
continue;
}

$foundWords[$word] = 1;
}

$snippetLine = new SnippetLine(
$this->title,
Expand Down
42 changes: 2 additions & 40 deletions src/S2/Rose/Snippet/SnippetBuilder.php
Original file line number Diff line number Diff line change
Expand Up @@ -79,48 +79,10 @@ public function buildSnippet(array $foundPositionsByStems, string $highlightTemp
return $snippet;
}

$stemsForRegex = $stems;
if ($this->stemmer instanceof IrregularWordsStemmerInterface) {
$stems = array_merge($stems, $this->stemmer->irregularWordsFromStems($stems));

$regexRules = $this->stemmer->getRegexTransformationRules();
$regexRules['#\\.#'] = '\\.'; // escaping dot in the following preg_match_all() call
$stemsForRegex = array_map(static fn(string $stem): string => preg_replace(
array_keys($regexRules),
array_values($regexRules),
$stem
), $stems);
}

$joinedStems = implode('|', $stemsForRegex);
$extractor = new WordsByStemsExtractor($this->stemmer, $stems);

foreach ($snippetSources as $snippetSource) {
// Check the text for the query words
// NOTE: Make sure the modifier S works correct on cyrillic
// TODO: After implementing formatting this regex became a set of crutches.
// One has to break the snippets into words, clear formatting, convert words to stems
// and detect what stems has been found. Then highlight the original text based on words source offset.
preg_match_all(
'#(?<=[^\\p{L}]|^|\\\\[' . StringHelper::FORMATTING_SYMBOLS . '])(' . $joinedStems . ')\\p{L}*#Ssui',
$snippetSource->getText(),
$matches,
PREG_OFFSET_CAPTURE
);

$foundWords = $foundStems = [];
foreach ($matches[0] as $i => $wordInfo) {
$word = $wordInfo[0];
$stemEqualsWord = ($wordInfo[0] === $matches[1][$i][0]);
$stemmedWord = $this->stemmer->stemWord($word);

// Ignore entry if the word stem differs from needed ones
if (!$stemEqualsWord && !\in_array($stemmedWord, $stems, true)) {
continue;
}

$foundWords[$word] = 1;
$foundStems[$stemmedWord] = 1;
}
[$foundWords, $foundStems] = $extractor->extract($snippetSource->getText());

if (\count($foundWords) === 0) {
continue;
Expand Down
98 changes: 98 additions & 0 deletions src/S2/Rose/Snippet/WordsByStemsExtractor.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
<?php
/**
* @copyright 2024 Roman Parpalak
* @license MIT
* @package S2
*/

declare(strict_types=1);

namespace S2\Rose\Snippet;

use S2\Rose\Helper\StringHelper;
use S2\Rose\Stemmer\IrregularWordsStemmerInterface;
use S2\Rose\Stemmer\StemmerInterface;

/**
 * Looks up, in an arbitrary text, the words that correspond to a fixed set of stems.
 *
 * The stems are compiled once into a regex alternation in the constructor;
 * extract() can then be called for any number of texts.
 */
class WordsByStemsExtractor
{
    private StemmerInterface $stemmer;

    /**
     * Stems exactly as passed to the constructor; candidate words are accepted
     * when their stem is in this list.
     *
     * @var string[]
     */
    private array $stems;

    /**
     * Regex alternation ("a|b|c") built from the stems. An empty string means
     * there is nothing to search for.
     */
    private string $joinedStems;

    /**
     * @param StemmerInterface $stemmer Stemmer used to verify candidate words. If it also implements
     *                                  IrregularWordsStemmerInterface, the words returned by
     *                                  irregularWordsFromStems() are added to the alternation and
     *                                  its regex transformation rules are applied to every entry.
     * @param string[]         $stems   Stems to look for.
     */
    public function __construct(StemmerInterface $stemmer, array $stems)
    {
        $this->stemmer = $stemmer;
        $this->stems   = $stems;

        $regexRules = [];
        if ($stemmer instanceof IrregularWordsStemmerInterface) {
            $stems      = array_merge($stems, $stemmer->irregularWordsFromStems($stems));
            $regexRules = $stemmer->getRegexTransformationRules();
        }

        // Escape dots for the preg_match_all() call in extract(). This must happen for every
        // stemmer, not only for irregular-words ones: an unescaped dot matches any character
        // and produces false positives. The rule is appended last so it runs after the
        // stemmer's own transformation rules, as before.
        $regexRules['#\\.#'] = '\\.';

        $stemsForRegex = array_map(static fn(string $stem): string => preg_replace(
            array_keys($regexRules),
            array_values($regexRules),
            $stem
        ), $stems);

        // An empty alternative would match at every word boundary, so drop empty entries.
        $stemsForRegex = array_filter($stemsForRegex, static fn(string $stem): bool => $stem !== '');

        $this->joinedStems = implode('|', $stemsForRegex);
    }

    /**
     * Finds the words of $text that belong to the configured stems.
     *
     * @param string $text Text to scan.
     *
     * @return array Two-element list [$foundWords, $foundStems]. Both are maps whose keys are
     *               the found words / their stems and whose values are always 1. Both maps are
     *               empty when there are no stems to search for, when nothing matches, or when
     *               the regex engine reports a failure.
     */
    public function extract(string $text): array
    {
        if ($this->joinedStems === '') {
            // Nothing to search for. Without this guard the empty group "()" would match
            // at every word boundary and report '' as a found word.
            return [[], []];
        }

        // Check the text for the query words
        // NOTE: Make sure the modifier S works correct on cyrillic
        // TODO: After implementing formatting this regex became a set of crutches.
        // One has to break the snippets into words, clear formatting, convert words to stems
        // and detect what stems have been found. Then highlight the original text based on words source offset.
        $matchCount = preg_match_all(
            '#(?<=[^\\p{L}-]|^|\\\\[' . StringHelper::FORMATTING_SYMBOLS . '])(' . $this->joinedStems . ')[\\p{L}-]*#Ssui',
            $text,
            $matches,
            PREG_OFFSET_CAPTURE
        );

        if ($matchCount === false || $matchCount === 0) {
            // false means a PCRE failure (for instance invalid UTF-8 with the "u" modifier);
            // $matches is not reliable then, so it is treated as "nothing found".
            return [[], []];
        }

        $foundWords = $foundStems = [];
        foreach ($matches[0] as $i => $wordInfo) {
            foreach ($this->getWords($wordInfo[0]) as $word) {
                // True when the candidate is exactly the text captured by the stem group.
                $stemEqualsWord = ($word === $matches[1][$i][0]);
                $stemmedWord    = $this->stemmer->stemWord($word);

                // Ignore entry if the word stem differs from needed ones
                if (!$stemEqualsWord && !\in_array($stemmedWord, $this->stems, true)) {
                    continue;
                }

                $foundWords[$word]         = 1;
                $foundStems[$stemmedWord] = 1;
            }
        }

        return [$foundWords, $foundStems];
    }

    /**
     * If there is no hyphen in the word, use it as the found word.
     * If the word contains a hyphen, besides checking the entire word,
     * check each fragment for a match with the searched stem.
     *
     * Empty fragments (from a leading, trailing or doubled hyphen) are skipped
     * because they are not words.
     *
     * @param string $text
     * @return string[] Non-empty fragments first, then the whole hyphenated text.
     */
    private function getWords(string $text): array
    {
        if (strpos($text, '-') === false) {
            return [$text];
        }

        $fragments = array_filter(
            explode('-', $text),
            static fn(string $fragment): bool => $fragment !== ''
        );

        return array_merge(array_values($fragments), [$text]);
    }
}
14 changes: 12 additions & 2 deletions tests/unit/Rose/SnippetsTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -190,8 +190,17 @@ public function testSnippets(array $indexables)
$resultSet = $this->finder->find(new Query('астатически дает'));
$this->assertEquals(
'Ошибка <span class="highlight"><i>астатически</i> даёт</span> более простую систему дифференциальных уравнений, если исключить небольшой угол тангажа. Если пренебречь малыми величинами, то видно, что механическая природа устойчиво требует большего внимания к анализу ошибок, которые <span class="highlight">даёт</span> устойчивый маховик.',
$resultSet->getItems()[0]->getFormattedSnippet(),
'Stemmer trims incorrectly подсистем to подсист. Check that this incorrect behaviour is handled without bugs.'
$resultSet->getItems()[0]->getFormattedSnippet()
);

$resultSet = $this->finder->find(new Query('Об одной из ошибок в веб-дизайне'));
$this->assertEquals(
'<span class="highlight">Об одной из ошибок в веб-дизайне</span>',
$resultSet->getItems()[0]->getHighlightedTitle($this->stemmer)
);
$this->assertEquals(
'<span class="highlight">Одна из</span> часто указываемых <span class="highlight">ошибок в веб-дизайне</span>:',
$resultSet->getItems()[0]->getFormattedSnippet()
);
}

Expand Down Expand Up @@ -243,6 +252,7 @@ public function indexableProvider()
<p>Уравнение возмущенного движения поступательно характеризует подвижный объект. Прецессия гироскопа косвенно интегрирует нестационарный вектор угловой скорости, изменяя направление движения. Угловая скорость, обобщая изложенное, неподвижно не входит своими составляющими, что очевидно, в силы нормальных реакций связей, так же как и кожух. Динамическое уравнение Эйлера, в силу третьего закона Ньютона, вращательно связывает ньютонометр, не забывая о том, что интенсивность диссипативных сил, характеризующаяся величиной коэффициента D, должна лежать в определённых пределах. Еще 1 раз проверим, как gt работает защита против &lt;script&gt;alert();&lt;/script&gt; xss-уязвимостей.</p>'),
new Indexable('id_4', 'Мне не душно', 'Я просто не ощущаю уровень углекислого газа в воздухе. Меня не устраивает.'),
new Indexable('id_5', 'Об одной из ошибок в веб-дизайне', 'Одна из часто указываемых ошибок в веб-дизайне:'),
];

return [
Expand Down

0 comments on commit 19976f4

Please sign in to comment.