diff --git a/src/S2/Rose/Entity/HighlightIntervals.php b/src/S2/Rose/Entity/HighlightIntervals.php new file mode 100755 index 0000000..3cebe6b --- /dev/null +++ b/src/S2/Rose/Entity/HighlightIntervals.php @@ -0,0 +1,36 @@ +hasPreviousInterval) { + $this->highlightIntervals[] = [$start, $end]; + } else { + $this->highlightIntervals[\count($this->highlightIntervals) - 1][1] = $end; + } + + $this->hasPreviousInterval = true; + } + + public function skipInterval(): void + { + $this->hasPreviousInterval = false; + } + + public function toArray(): array + { + return $this->highlightIntervals; + } +} diff --git a/src/S2/Rose/Entity/ResultItem.php b/src/S2/Rose/Entity/ResultItem.php index 297106f..1b21745 100644 --- a/src/S2/Rose/Entity/ResultItem.php +++ b/src/S2/Rose/Entity/ResultItem.php @@ -1,16 +1,17 @@ -description; } - $snippet = $this->snippet->toString(0.3); + $snippet = $this->snippet->toString(); if ($snippet) { return $snippet; } @@ -127,7 +128,7 @@ public function getFormattedSnippet(): string return $this->description; } - $snippet = $this->snippet->toString(0.3, true); + $snippet = $this->snippet->toString(true); if ($snippet) { return $snippet; } @@ -137,8 +138,6 @@ public function getFormattedSnippet(): string /** * @param string[] $words - * - * @return $this */ public function setFoundWords(array $words): self { @@ -148,14 +147,7 @@ public function setFoundWords(array $words): self } /** - * TODO Refactor the highlight logic to a separate class. 
- * - * @param StemmerInterface $stemmer - * - * @return string - * * @throws RuntimeException - * @see \S2\Rose\Snippet\SnippetBuilder::buildSnippet for dublicated logic */ public function getHighlightedTitle(StemmerInterface $stemmer): string { @@ -165,16 +157,12 @@ public function getHighlightedTitle(StemmerInterface $stemmer): string throw new InvalidArgumentException('Highlight template must contain "%s" substring for sprintf() function.'); } - $extractor = new WordsByStemsExtractor($stemmer, $this->foundWords); - - [$foundWords,] = $extractor->extract($this->title); - - $snippetLine = new SnippetLine( $this->title, SnippetSource::FORMAT_PLAIN_TEXT, - array_keys($foundWords), - \count($foundWords) + $stemmer, + $this->foundWords, + 0 ); return $snippetLine->getHighlighted($template, false); diff --git a/src/S2/Rose/Entity/Snippet.php b/src/S2/Rose/Entity/Snippet.php index 2e1364f..f16cd19 100644 --- a/src/S2/Rose/Entity/Snippet.php +++ b/src/S2/Rose/Entity/Snippet.php @@ -1,13 +1,12 @@ -foundWordCount = $foundWordNum; $this->highlightTemplate = $highlightTemplate; $this->introductionSnippetLines = $introductionSnippetLines; } @@ -50,7 +45,6 @@ public function setLineSeparator(string $lineSeparator): self public function attachSnippetLine(int $minWordPosition, int $maxWordPosition, SnippetLine $snippetLine): self { $this->snippetLines[] = $snippetLine; - $this->snippetLineWeights[] = $snippetLine->getRelevance(); $this->snippetMinWordPositions[] = $minWordPosition; $this->snippetMaxWordPositions[] = $maxWordPosition; @@ -67,7 +61,7 @@ public function getTextIntroduction(bool $includeFormatting = false): string return implode(' ', $result); } - public function toString(float $acceptableRelevance = 0.6, bool $includeFormatting = false): ?string + public function toString(bool $includeFormatting = false): ?string { $stat = []; foreach ($this->snippetLines as $index => $snippetLine) { @@ -99,17 +93,11 @@ public function toString(float $acceptableRelevance = 0.6, 
bool $includeFormatti $resultSnippetLines[$idx] = $this->snippetLines[$idx]; } - if ($this->calcLinesRelevance($resultSnippetLines) < $acceptableRelevance) { - return null; - } - return $this->implodeLines($resultSnippetLines, $includeFormatting); } /** * @param array|SnippetLine[] $snippetLines - * - * @return string */ private function implodeLines(array $snippetLines, bool $includeFormatting): string { @@ -149,25 +137,4 @@ private function implodeLines(array $snippetLines, bool $includeFormatting): str return $result; } - - /** - * @param array|SnippetLine[] $snippetLines - * - * @return float|int - */ - private function calcLinesRelevance(array $snippetLines) - { - if (!($this->foundWordCount > 0)) { - return 0; - } - - $foundWords = []; - foreach ($snippetLines as $snippetLine) { - foreach ($snippetLine->getFoundWords() as $word) { - $foundWords[$word] = 1; - } - } - - return \count($foundWords) * 1.0 / $this->foundWordCount; - } } diff --git a/src/S2/Rose/Entity/SnippetLine.php b/src/S2/Rose/Entity/SnippetLine.php index 581071b..a611535 100644 --- a/src/S2/Rose/Entity/SnippetLine.php +++ b/src/S2/Rose/Entity/SnippetLine.php @@ -11,6 +11,7 @@ use S2\Rose\Entity\Metadata\SnippetSource; use S2\Rose\Exception\RuntimeException; use S2\Rose\Helper\StringHelper; +use S2\Rose\Stemmer\StemmerInterface; class SnippetLine { @@ -19,11 +20,15 @@ class SnippetLine /** * @var string[] */ - protected array $foundWords; + protected array $stemsFoundSomewhere; protected string $line; - protected float $relevance = 0; + protected int $formatId; + + protected StemmerInterface $stemmer; + + protected float $relevance; protected ?string $lineWithoutMaskedFragments = null; @@ -31,40 +36,55 @@ class SnippetLine * @var string[] */ protected array $maskedFragments = []; - private int $formatId; /** * @var string[] */ private array $maskRegexArray = []; - public function __construct(string $line, int $formatId, array $foundWords, float $relevance) + private ?HighlightIntervals 
$highlightIntervals = null; + + private array $foundStems = []; + + public function __construct(string $line, int $formatId, StemmerInterface $stemmer, array $stemsFoundSomewhere, float $relevance) { - $this->line = $line; - $this->foundWords = $foundWords; - $this->relevance = $relevance; - $this->formatId = $formatId; + $this->line = $line; + $this->formatId = $formatId; + $this->stemmer = $stemmer; + $this->stemsFoundSomewhere = $stemsFoundSomewhere; + $this->relevance = $relevance; } public static function createFromSnippetSourceWithoutFoundWords(SnippetSource $snippetSource): self { - return new static($snippetSource->getText(), $snippetSource->getFormatId(), [], 0.0); + return new static( + $snippetSource->getText(), + $snippetSource->getFormatId(), + new class implements StemmerInterface { + public function stemWord(string $word, bool $normalize = true): string + { + return $word; + } + }, + [], + 0 + ); } - /** - * @return float - */ - public function getRelevance() + public function getRelevance(): float { return $this->relevance; } /** * @return string[] + * @deprecated Not used anymore. TODO delete if not needed */ - public function getFoundWords(): array + public function getFoundStems(): array { - return $this->foundWords; + $this->parse(); + + return $this->foundStems; } public function getLine(): string @@ -86,25 +106,37 @@ public function getHighlighted(string $highlightTemplate, bool $includeFormattin throw new RuntimeException('Highlight template must contain "%s" substring for sprintf() function.'); } - if (\count($this->foundWords) === 0) { - $result = $this->line; - } else { - $line = $this->getLineWithoutMaskedFragments(); - - // TODO: After implementing formatting this regex became a set of crutches. - // One has to break the snippets into words, clear formatting, convert words to stems - // and detect what stems has been found. Then highlight the original text based on words source offset. 
- $wordPattern = implode('|', array_map(static fn(string $word) => preg_quote($word, '#'), $this->foundWords)); - $wordPatternWithFormatting = '(?:\\\\[' . StringHelper::FORMATTING_SYMBOLS . '])*(?:' . $wordPattern . ')(?:\\\\[' . strtoupper(StringHelper::FORMATTING_SYMBOLS) . '])*'; - $replacedLine = preg_replace_callback( - '#(?:\\s|^|\p{P})\\K' . $wordPatternWithFormatting . '(?:\\s+(?:' . $wordPatternWithFormatting . '))*\\b#su', - static fn($matches) => sprintf($highlightTemplate, $matches[0]), - $line - ); + $this->parse(); - $result = $this->restoreMaskedFragments($replacedLine); + $line = $this->getLineWithoutMaskedFragments(); + + $replacedLine = ''; + $processedPosition = 0; + foreach ($this->highlightIntervals->toArray() as [$start, $end]) { + $replacedLine .= substr($line, $processedPosition, $start - $processedPosition); + $lineToReplace = substr($line, $start, $end - $start); + + [$openFormatting, $closeFormatting] = StringHelper::getUnbalancedInternalFormatting($lineToReplace); + + // Open formatting goes to the end + $outsidePostfix = implode('', array_map(static fn(string $char) => '\\' . $char, $openFormatting)); + $insidePostfix = implode('', array_map(static fn(string $char) => '\\' . strtoupper($char), array_reverse($openFormatting))); + + // Close formatting goes to the start + $outsidePrefix = implode('', array_map(static fn(string $char) => '\\' . $char, $closeFormatting)); + $insidePrefix = implode('', array_map(static fn(string $char) => '\\' . strtolower($char), array_reverse($closeFormatting))); + + $replacedLine .= $outsidePrefix . sprintf( + $highlightTemplate, $insidePrefix . $lineToReplace . $insidePostfix + ) . 
$outsidePostfix; + + $processedPosition = $end; } + $replacedLine .= substr($line, $processedPosition); + + $result = $this->restoreMaskedFragments($replacedLine); + if ($this->formatId === SnippetSource::FORMAT_INTERNAL) { if ($includeFormatting) { $result = StringHelper::convertInternalFormattingToHtml($result); @@ -121,6 +153,87 @@ public function setMaskRegexArray(array $regexes): void $this->maskRegexArray = $regexes; } + protected function parse(): void + { + if ($this->highlightIntervals !== null) { + // Already parsed + return; + } + + $this->highlightIntervals = new HighlightIntervals(); + + $line = $this->getLineWithoutMaskedFragments(); + + if (\count($this->stemsFoundSomewhere) === 0) { + return; + } + + if ($this->formatId === SnippetSource::FORMAT_INTERNAL) { + $regex = '/(?x) + [\\d\\p{L}^_]*(?:(?:\\\\[' . StringHelper::FORMATTING_SYMBOLS . '])+[\\d\\p{L}^_]*)* # matches as many word and formatting characters as possible + (*SKIP) # do not cross this line on backtracking + \\K # restart pattern matching to the end of the word. + (?: # delimiter regex which includes: + [^\\\\\\d\\p{L}^_\\-.,] # non-word character + |[\\-.,]+(?![\\d\\p{L}\\-.,]) # [,-.] followed by a non-word character + |\\\\(?:[' . StringHelper::FORMATTING_SYMBOLS . '](?![\\d\\p{L}\\-.,])|\\\\) # formatting sequence followed by a non-word character or escaped backslash + )+/iu'; + } else { + $regex = '/(?x) + [\\d\\p{L}^_]* # matches as many word and formatting characters as possible + (*SKIP) # do not cross this line on backtracking + \\K # restart pattern matching to the end of the word. + (?: # delimiter regex which includes: + [^\\d\\p{L}^_\\-.,] # non-word character + |[\\-.,]+(?![\\d\\p{L}\\-.,]) # [,-.] 
followed by a non-word character + )+/iu'; + } + $wordArray = preg_split($regex, $line, -1, \PREG_SPLIT_OFFSET_CAPTURE); + + $flippedStems = array_flip($this->stemsFoundSomewhere); + foreach ($wordArray as [$rawWord, $offset]) { + $word = $this->formatId === SnippetSource::FORMAT_INTERNAL ? StringHelper::clearInternalFormatting($rawWord) : $rawWord; + $word = str_replace(self::STORE_MARKER, '', $word); + + if ($word === '') { + // No need to call $intervals->skipInterval() since regex may work several times on a single delimiter + continue; + } + + $stem = null; + if (isset($flippedStems[$word]) || isset($flippedStems[$stem = $this->stemmer->stemWord($word)])) { + $this->highlightIntervals->addInterval($offset, $offset + \strlen($rawWord)); + $this->foundStems[] = $stem ?? $word; + } else { + // Word is not found. Check if it is like a hyphenated compound word, e.g. 'test-drive' or 'long-term' + if (false !== strpbrk($stem, StringHelper::WORD_COMPONENT_DELIMITERS)) { + // Here is more simple regex since formatting sequences may be present. + // The downside is appearance of empty words, but they are filtered out later. + $subWordArray = preg_split('#[\-.,]+#u', $rawWord, -1, \PREG_SPLIT_OFFSET_CAPTURE); + foreach ($subWordArray as [$rawSubWord, $subOffset]) { + $subWord = $this->formatId === SnippetSource::FORMAT_INTERNAL ? StringHelper::clearInternalFormatting($rawSubWord) : $rawSubWord; + $subWord = str_replace(self::STORE_MARKER, '', $subWord); + + if ($rawSubWord === '') { + continue; + } + + $subStem = null; + if (isset($flippedStems[$subWord]) || isset($flippedStems[$subStem = $this->stemmer->stemWord($subWord)])) { + $this->highlightIntervals->addInterval($offset + $subOffset, $offset + $subOffset + \strlen($rawSubWord)); + $this->foundStems[] = $subStem ?? 
$subWord; + } else { + $this->highlightIntervals->skipInterval(); + } + } + } else { + // Not a compound word + $this->highlightIntervals->skipInterval(); + } + } + } + } + protected function getLineWithoutMaskedFragments(): string { if ($this->lineWithoutMaskedFragments !== null) { diff --git a/src/S2/Rose/Helper/StringHelper.php b/src/S2/Rose/Helper/StringHelper.php index 06b65a7..488997b 100644 --- a/src/S2/Rose/Helper/StringHelper.php +++ b/src/S2/Rose/Helper/StringHelper.php @@ -1,9 +1,11 @@ - $num) { if ($num > 0) { - $text = str_repeat('\\' . $possibleTag, $num) . $text; + $text = str_repeat('\\' . $possibleTag, $num) . $text; $tagsFromPrevSentence[$possibleTag] = 0; } } @@ -105,6 +110,11 @@ public static function clearInternalFormatting(string $text): string ]); } + /** + * @Note: This approach with counting formatting symbols gives wrong results for the same nested tags. + * For example, for '\i 1 \b 2 \i 3' it returns '\i 1 \b 2 \i 3 \B\I\I', however '\i 1 \b 2 \i 3\I\B\I' is expected. + * It's ok since nesting of formatting tags like ab do not make a lot of sense. + */ public static function fixUnbalancedInternalFormatting(string $text, array &$tagsNum): string { preg_match_all('#\\\\(?:\\\\(*SKIP)\\\\)*\K[' . self::FORMATTING_SYMBOLS . ']#i', $text, $matches); @@ -129,4 +139,37 @@ public static function fixUnbalancedInternalFormatting(string $text, array &$tag return $result; } + + /** + * @return array{0: array, 1: array} + */ + public static function getUnbalancedInternalFormatting(string $text): array + { + preg_match_all('#\\\\(?:\\\\(*SKIP)\\\\)*\K[' . self::FORMATTING_SYMBOLS . 
']#i', $text, $matches); + + $openStack = []; + $closeStack = []; + + foreach ($matches[0] as $match) { + $lowerMatch = strtolower($match); + if ($match === $lowerMatch) { + $openStack[] = $match; + continue; + } + + $found = false; + for ($i = \count($openStack); $i--;) { + if ($openStack[$i] === $lowerMatch) { + array_splice($openStack, $i, 1); + $found = true; + break; + } + } + if (!$found) { + $closeStack[] = $match; + } + } + + return [$openStack, $closeStack]; + } } diff --git a/src/S2/Rose/Indexer.php b/src/S2/Rose/Indexer.php index 374ae64..2f0da28 100644 --- a/src/S2/Rose/Indexer.php +++ b/src/S2/Rose/Indexer.php @@ -206,8 +206,8 @@ private function getStemsWithComponents(array $words): array $stemmedWord = $this->stemmer->stemWord($word, false); // If the word contains punctuation marks like hyphen, add a variant without it - if (false !== strpbrk($stemmedWord, '-.,')) { - foreach (preg_split('#[\p{L}\d]\K[\-.,]+|[\-.,]+(?=[\p{L}\d])#u', $word) as $k => $subWord) { + if (false !== strpbrk($stemmedWord, StringHelper::WORD_COMPONENT_DELIMITERS)) { + foreach (preg_split('#(?<=[\p{L}\d])[\-.,]+|[\-.,]++(?=[\p{L}\d])#u', $word) as $k => $subWord) { if ($subWord !== '' && $subWord !== $word) { $componentsOfCompoundWords[(string)($i + 0.001 * ($k + 1))] = $this->stemmer->stemWord($subWord, false); } diff --git a/src/S2/Rose/Snippet/SnippetBuilder.php b/src/S2/Rose/Snippet/SnippetBuilder.php index e647247..f40b4b0 100644 --- a/src/S2/Rose/Snippet/SnippetBuilder.php +++ b/src/S2/Rose/Snippet/SnippetBuilder.php @@ -65,8 +65,9 @@ public function attachSnippets(ResultSet $result, SnippetResult $snippetResult): public function buildSnippet(array $foundPositionsByStems, string $highlightTemplate, array $relevanceByStems, SnippetSource ...$snippetSources): Snippet { // Stems of the words found in the $id chapter - $stems = []; - $foundWordNum = 0; + $stems = []; + $foundWordNum = 0; + $snippetRelevance = []; foreach ($foundPositionsByStems as $stem => $positions) { if 
(empty($positions)) { // Not a fulltext search result (e.g. title from single keywords) @@ -74,6 +75,11 @@ public function buildSnippet(array $foundPositionsByStems, string $highlightTemp } $stems[] = $stem; $foundWordNum++; + foreach ($snippetSources as $snippetIndex => $snippetSource) { + if ($snippetSource->coversOneOfPositions($positions)) { + $snippetRelevance[$snippetIndex] = ($snippetRelevance[$snippetIndex] ?? 0) + ($relevanceByStems[$stem] ?? 0); + } + } } $introSnippetLines = array_map( @@ -81,7 +87,7 @@ public function buildSnippet(array $foundPositionsByStems, string $highlightTemp \array_slice($snippetSources, 0, 2) ); - $snippet = new Snippet($foundWordNum, $highlightTemplate, ...$introSnippetLines); + $snippet = new Snippet($highlightTemplate, ...$introSnippetLines); if ($this->snippetLineSeparator !== null) { $snippet->setLineSeparator($this->snippetLineSeparator); @@ -91,24 +97,18 @@ public function buildSnippet(array $foundPositionsByStems, string $highlightTemp return $snippet; } - $extractor = new WordsByStemsExtractor($this->stemmer, $stems); - - foreach ($snippetSources as $snippetSource) { - [$foundWords, $foundStems] = $extractor->extract($snippetSource->getText()); - - if (\count($foundWords) === 0) { + foreach ($snippetSources as $snippetIndex => $snippetSource) { + if (!isset($snippetRelevance[$snippetIndex])) { continue; } $snippetLine = new SnippetLine( $snippetSource->getText(), $snippetSource->getFormatId(), - array_keys($foundWords), - array_sum(array_map(static function ($stem) use ($relevanceByStems) { - return $relevanceByStems[$stem] ?? 0; - }, array_keys($foundStems))) + $this->stemmer, + $stems, + $snippetRelevance[$snippetIndex] ?? 
0 ); - $snippetLine->setMaskRegexArray($this->highlightMaskRegexArray); $snippet->attachSnippetLine($snippetSource->getMinPosition(), $snippetSource->getMaxPosition(), $snippetLine); diff --git a/src/S2/Rose/Snippet/WordsByStemsExtractor.php b/src/S2/Rose/Snippet/WordsByStemsExtractor.php deleted file mode 100755 index 16d6dda..0000000 --- a/src/S2/Rose/Snippet/WordsByStemsExtractor.php +++ /dev/null @@ -1,99 +0,0 @@ -stemmer = $stemmer; - $this->stems = $stems; - - $stemsForRegex = $stems; - if ($stemmer instanceof IrregularWordsStemmerInterface) { - $stems = array_merge($stems, $stemmer->irregularWordsFromStems($stems)); - - $regexRules = $stemmer->getRegexTransformationRules(); - $regexRules['#\\.#'] = '\\.'; // escaping dot in the following preg_match_all() call - $stemsForRegex = array_map(static fn(string $stem): string => preg_replace( - array_keys($regexRules), - array_values($regexRules), - $stem - ), $stems); - } - - $this->joinedStems = implode('|', $stemsForRegex); - } - - public function extract(string $text): array - { - // Check the text for the query words - // NOTE: Make sure the modifier S works correct on cyrillic - // TODO: After implementing formatting this regex became a set of crutches. - // One has to break the snippets into words, clear formatting, convert words to stems - // and detect what stems have been found. Then highlight the original text based on words source offset. - preg_match_all( - '#(?<=[^\\p{L}-]|^|\\\\[' . StringHelper::FORMATTING_SYMBOLS . '])(' . $this->joinedStems . 
')[\\p{L}-]*#Ssui', - $text, - $matches, - PREG_OFFSET_CAPTURE - ); - - $foundWords = $foundStems = []; - foreach ($matches[0] as $i => $wordInfo) { - foreach ($this->getWords($wordInfo[0]) as $word) { - $stemEqualsWord = ($word === $matches[1][$i][0]); - $stemmedWord = $this->stemmer->stemWord($word); - - // Ignore entry if the word stem differs from needed ones - if (!$stemEqualsWord && !\in_array($stemmedWord, $this->stems, true)) { - continue; - } - - $foundWords[$word] = 1; - $foundStems[$stemmedWord] = 1; - } - } - - return [$foundWords, $foundStems]; - } - - /** - * If there is no hyphen in the word, use it as the found word. - * If the word contains a hyphen, besides checking the entire word, - * check each fragment for a match with the searched stem. - * - * @param string $text - * - * @return string[] - */ - private function getWords(string $text): array - { - if (strpos($text, '-') === false) { - return [$text]; - } - - return array_merge(array_filter(explode('-', $text), static fn(string $word) => $word !== ''), [$text]); - } -} diff --git a/src/S2/Rose/Stemmer/AbstractStemmer.php b/src/S2/Rose/Stemmer/AbstractStemmer.php index 2201030..350f736 100644 --- a/src/S2/Rose/Stemmer/AbstractStemmer.php +++ b/src/S2/Rose/Stemmer/AbstractStemmer.php @@ -1,12 +1,14 @@ -nextStemmer = $nextStemmer; } - - /** - * {@inheritdoc} - */ - public function irregularWordsFromStems(array $stems): array - { - $flippedStems = array_flip($stems); - - $words = array_keys(array_filter($this->getIrregularWords(), static function ($irregularStem) use ($flippedStems) { - return isset($flippedStems[$irregularStem]); - })); - - if ($this->nextStemmer instanceof IrregularWordsStemmerInterface) { - $words = array_merge($words, $this->nextStemmer->irregularWordsFromStems($stems)); - } - - return $words; - } - - /** - * @return array|string[] - */ - abstract protected function getIrregularWords(): array; } diff --git a/src/S2/Rose/Stemmer/IrregularWordsStemmerInterface.php 
b/src/S2/Rose/Stemmer/IrregularWordsStemmerInterface.php deleted file mode 100644 index 497f0f9..0000000 --- a/src/S2/Rose/Stemmer/IrregularWordsStemmerInterface.php +++ /dev/null @@ -1,37 +0,0 @@ - '[iy]'] - * that replaces the last entry of 'i' into entry of either 'i' or 'y'. - * - * Possible false positive matches are not mistakes since found matches are checked - * through the stemmer. - * - * @return mixed - */ - public function getRegexTransformationRules(): array; -} diff --git a/src/S2/Rose/Stemmer/PorterStemmerEnglish.php b/src/S2/Rose/Stemmer/PorterStemmerEnglish.php index e44521f..d69b69b 100644 --- a/src/S2/Rose/Stemmer/PorterStemmerEnglish.php +++ b/src/S2/Rose/Stemmer/PorterStemmerEnglish.php @@ -675,24 +675,4 @@ protected static function step5(string $word): string return $word; } - - /** - * {@inheritdoc} - */ - protected function getIrregularWords(): array - { - return self::$irregularWords; - } - - /** - * {@inheritdoc} - */ - public function getRegexTransformationRules(): array - { - return array_merge([ - '#i$#i' => '[iy]', // legaci -> legacy - '#e$#i' => '', // live -> living, rate -> rating - '#bl$#i' => 'bi?l', // possibl -> possibility, but abl -> able - ], $this->nextStemmer !== null ? $this->nextStemmer->getRegexTransformationRules() : []); - } } diff --git a/src/S2/Rose/Stemmer/PorterStemmerRussian.php b/src/S2/Rose/Stemmer/PorterStemmerRussian.php index 580fcca..61b5ed3 100644 --- a/src/S2/Rose/Stemmer/PorterStemmerRussian.php +++ b/src/S2/Rose/Stemmer/PorterStemmerRussian.php @@ -346,9 +346,6 @@ public function stemWord(string $word, bool $normalize = true): string /** * TODO How to deal with postfixes like "кто-либо" -> "кого-либо"? * Ignoring postfix is not an option - there are a lot of trash results found. - * Transforming like `stem('кто') . '-либо'` requires some hack for reverse transform when highlighting. 
- * - * @see \S2\Rose\Stemmer\IrregularWordsStemmerInterface::irregularWordsFromStems */ // $word = preg_replace('/^(.*)-(то|либо|нибудь)$/Su', '-\\2-\\1', $word); @@ -414,22 +411,4 @@ protected static function s(&$s, $re, $to) return $orig !== $s; } - - /** - * {@inheritdoc} - */ - protected function getIrregularWords(): array - { - return self::$irregularWords; - } - - /** - * {@inheritdoc} - */ - public function getRegexTransformationRules(): array - { - return array_merge([ - '#е#i' => '[её]', - ], $this->nextStemmer !== null ? $this->nextStemmer->getRegexTransformationRules() : []); - } } diff --git a/tests/unit/Rose/Entity/ResultSetTest.php b/tests/unit/Rose/Entity/ResultSetTest.php index 2f8fd1d..7dde1a4 100644 --- a/tests/unit/Rose/Entity/ResultSetTest.php +++ b/tests/unit/Rose/Entity/ResultSetTest.php @@ -1,7 +1,7 @@ expectException(UnknownIdException::class); $resultSet = new ResultSet(); - $resultSet->attachSnippet(new ExternalId('not found'), new Snippet(0, '%s', new SnippetLine('', SnippetSource::FORMAT_PLAIN_TEXT, [], 0.0))); + $resultSet->attachSnippet(new ExternalId('not found'), new Snippet('%s', new SnippetLine('', SnippetSource::FORMAT_PLAIN_TEXT, new PorterStemmerEnglish(), [], 0.0))); } public function testNotFrozenGetFoundExternalIds() diff --git a/tests/unit/Rose/Entity/SnippetLineTest.php b/tests/unit/Rose/Entity/SnippetLineTest.php index 6d365a2..cda1c7b 100644 --- a/tests/unit/Rose/Entity/SnippetLineTest.php +++ b/tests/unit/Rose/Entity/SnippetLineTest.php @@ -1,6 +1,6 @@ assertEquals('Testing string to highlight some test values, Test is case-sensitive.', $snippetLine->getHighlighted('%s', false)); + $this->assertEquals( + 'Testing string to highlight some test values, Test is case-sensitive.', + $snippetLine->getHighlighted('%s', false) + ); } public function testCreateHighlighted2() @@ -35,11 +39,15 @@ public function testCreateHighlighted2() $snippetLine = new SnippetLine( 'Testing string to highlight some test values, Test is 
case-sensitive.', SnippetSource::FORMAT_PLAIN_TEXT, - ['Test'], + new PorterStemmerEnglish(), + ['Test'], // unknown stem, stems are normalized to lower case, however there is a match due to direct comparison 1 ); - $this->assertEquals('Testing string to highlight some test values, Test is case-sensitive.', $snippetLine->getHighlighted('%s', false)); + $this->assertEquals( + 'Testing string to highlight some test values, Test is case-sensitive.', + $snippetLine->getHighlighted('%s', false) + ); } public function testJoinHighlighted() @@ -47,11 +55,15 @@ public function testJoinHighlighted() $snippetLine = new SnippetLine( 'Testing string to highlight some test values, Test is case-sensitive.', SnippetSource::FORMAT_PLAIN_TEXT, + new PorterStemmerEnglish(), ['to', 'highlight'], 1 ); - $this->assertEquals('Testing string to highlight some test values, Test is case-sensitive.', $snippetLine->getHighlighted('%s',false)); + $this->assertEquals( + 'Testing string to highlight some test values, Test is case-sensitive.', + $snippetLine->getHighlighted('%s', false) + ); } public function testCreateHighlightedFail() @@ -59,6 +71,7 @@ public function testCreateHighlightedFail() $snippetLine = new SnippetLine( 'Testing string to highlight some test values, Test is case-sensitive.', SnippetSource::FORMAT_PLAIN_TEXT, + new PorterStemmerEnglish(), ['test', 'is'], 2 ); diff --git a/tests/unit/Rose/Entity/SnippetTest.php b/tests/unit/Rose/Entity/SnippetTest.php index 4fc533f..81460fe 100644 --- a/tests/unit/Rose/Entity/SnippetTest.php +++ b/tests/unit/Rose/Entity/SnippetTest.php @@ -1,6 +1,6 @@ %s', new SnippetLine('introduction', SnippetSource::FORMAT_PLAIN_TEXT, [], 0.0)); + $snippet = new Snippet( + '%s', + SnippetLine::createFromSnippetSourceWithoutFoundWords(new SnippetSource('introduction', SnippetSource::FORMAT_PLAIN_TEXT, 0, 0)) + ); $snippet - ->attachSnippetLine(0, 6, $snippetLine1) - ->attachSnippetLine(7, 9, $snippetLine2) + ->attachSnippetLine(1, 7, $snippetLine1) + 
->attachSnippetLine(8, 10, $snippetLine2) ; $this->assertEquals( - 'Testing string to highlight some test values. Test is case-sensitive.', + 'Testing string to highlight some test values. Test is case-sensitive.', $snippet->toString() ); } @@ -51,7 +58,7 @@ public function testSnippet2() 2, 13, 'Тут есть тонкость - нужно проверить, как происходит экранировка в сущностях вроде +.', - ['сущностях'], + ['сущност'], ], [ 14, @@ -61,10 +68,13 @@ public function testSnippet2() ], ]; - $snippet = new Snippet(2, '%s', new SnippetLine('introduction', SnippetSource::FORMAT_PLAIN_TEXT, [], 0.0)); + $snippet = new Snippet( + '%s', + SnippetLine::createFromSnippetSourceWithoutFoundWords(new SnippetSource('introduction', SnippetSource::FORMAT_PLAIN_TEXT, 0, 1)) + ); foreach ($data as $row) { - $snippet->attachSnippetLine($row[0], $row[1], new SnippetLine($row[2], SnippetSource::FORMAT_PLAIN_TEXT, $row[3], count($row[3]))); + $snippet->attachSnippetLine($row[0], $row[1], new SnippetLine($row[2], SnippetSource::FORMAT_PLAIN_TEXT, new PorterStemmerRussian(), $row[3], \count($row[3]))); } $this->assertEquals( @@ -75,51 +85,53 @@ public function testSnippet2() public function testSnippetsUnique() { - $snippet = new Snippet(1, '%s', new SnippetLine('introduction', SnippetSource::FORMAT_PLAIN_TEXT, [], 0.0)); + $stemmer = new PorterStemmerEnglish(); + $snippet = new Snippet('%s', new SnippetLine('introduction', SnippetSource::FORMAT_PLAIN_TEXT, $stemmer, [], 0.0)); $snippet - ->attachSnippetLine(0, 3, new SnippetLine('Try to test 1.', SnippetSource::FORMAT_PLAIN_TEXT, ['test'], 1)) - ->attachSnippetLine(4, 7, new SnippetLine('Try to test 1.', SnippetSource::FORMAT_PLAIN_TEXT, ['test'], 1)) - ->attachSnippetLine(8, 11, new SnippetLine('Try to test 1.', SnippetSource::FORMAT_PLAIN_TEXT, ['test'], 1)) - ->attachSnippetLine(12, 15, new SnippetLine('Try to test 1.', SnippetSource::FORMAT_PLAIN_TEXT, ['test'], 1)) - ->attachSnippetLine(16, 19, new SnippetLine('Try to test 2.', 
SnippetSource::FORMAT_PLAIN_TEXT, ['test'], 1)) - ->attachSnippetLine(20, 23, new SnippetLine('Try to test 2.', SnippetSource::FORMAT_PLAIN_TEXT, ['test'], 1)) - ->attachSnippetLine(24, 27, new SnippetLine('Try to test 2.', SnippetSource::FORMAT_PLAIN_TEXT, ['test'], 1)) + ->attachSnippetLine(0, 3, new SnippetLine('Try to test 1.', SnippetSource::FORMAT_PLAIN_TEXT, $stemmer, ['test'], 1)) + ->attachSnippetLine(4, 7, new SnippetLine('Try to test 1.', SnippetSource::FORMAT_PLAIN_TEXT, $stemmer, ['test'], 1)) + ->attachSnippetLine(8, 11, new SnippetLine('Try to test 1.', SnippetSource::FORMAT_PLAIN_TEXT, $stemmer, ['test'], 1)) + ->attachSnippetLine(12, 15, new SnippetLine('Try to test 1.', SnippetSource::FORMAT_PLAIN_TEXT, $stemmer, ['test'], 1)) + ->attachSnippetLine(16, 19, new SnippetLine('Try to test 2.', SnippetSource::FORMAT_PLAIN_TEXT, $stemmer, ['test'], 1)) + ->attachSnippetLine(20, 23, new SnippetLine('Try to test 2.', SnippetSource::FORMAT_PLAIN_TEXT, $stemmer, ['test'], 1)) + ->attachSnippetLine(24, 27, new SnippetLine('Try to test 2.', SnippetSource::FORMAT_PLAIN_TEXT, $stemmer, ['test'], 1)) ; $this->assertEquals( 'Try to test 1... 
Try to test 2.', - $snippet->toString(0.6) + $snippet->toString() ); - $snippet = new Snippet(1, '%s', new SnippetLine('introduction', SnippetSource::FORMAT_PLAIN_TEXT, [], 0.0)); + $snippet = new Snippet('%s', new SnippetLine('introduction', SnippetSource::FORMAT_PLAIN_TEXT, $stemmer, [], 0.0)); $snippet - ->attachSnippetLine(0 * 4, 0 * 4 + 3, new SnippetLine('Try to test 1.', SnippetSource::FORMAT_PLAIN_TEXT, ['test'], 1)) - ->attachSnippetLine(1 * 4, 1 * 4 + 3, new SnippetLine('Try to test 1.', SnippetSource::FORMAT_PLAIN_TEXT, ['test'], 1)) - ->attachSnippetLine(2 * 4, 2 * 4 + 3, new SnippetLine('Try to test 1.', SnippetSource::FORMAT_PLAIN_TEXT, ['test'], 1)) - ->attachSnippetLine(3 * 4, 3 * 4 + 3, new SnippetLine('Try to test 1.', SnippetSource::FORMAT_PLAIN_TEXT, ['test'], 1)) - ->attachSnippetLine(4 * 4, 4 * 4 + 3, new SnippetLine('Try to test 2.', SnippetSource::FORMAT_PLAIN_TEXT, ['test'], 1)) - ->attachSnippetLine(5 * 4, 5 * 4 + 3, new SnippetLine('Try to test 2.', SnippetSource::FORMAT_PLAIN_TEXT, ['test'], 1)) - ->attachSnippetLine(6 * 4, 6 * 4 + 3, new SnippetLine('Try to test 2.', SnippetSource::FORMAT_PLAIN_TEXT, ['test'], 1)) - ->attachSnippetLine(7 * 4, 7 * 4 + 3, new SnippetLine('Try to test 3.', SnippetSource::FORMAT_PLAIN_TEXT, ['test'], 1)) - ->attachSnippetLine(8 * 4, 8 * 4 + 3, new SnippetLine('Try to test 3.', SnippetSource::FORMAT_PLAIN_TEXT, ['test'], 1)) - ->attachSnippetLine(9 * 4, 9 * 4 + 3, new SnippetLine('Try to test 3.', SnippetSource::FORMAT_PLAIN_TEXT, ['test'], 1)) - ->attachSnippetLine(10 * 4, 10 * 4 + 3, new SnippetLine('Try to test 4.', SnippetSource::FORMAT_PLAIN_TEXT, ['test'], 1)) - ->attachSnippetLine(11 * 4, 11 * 4 + 3, new SnippetLine('Try to test 4.', SnippetSource::FORMAT_PLAIN_TEXT, ['test'], 2)) + ->attachSnippetLine(0 * 4, 0 * 4 + 3, new SnippetLine('Try to test 1.', SnippetSource::FORMAT_PLAIN_TEXT, $stemmer, ['test'], 1)) + ->attachSnippetLine(1 * 4, 1 * 4 + 3, new SnippetLine('Try to test 1.', 
SnippetSource::FORMAT_PLAIN_TEXT, $stemmer, ['test'], 1)) + ->attachSnippetLine(2 * 4, 2 * 4 + 3, new SnippetLine('Try to test 1.', SnippetSource::FORMAT_PLAIN_TEXT, $stemmer, ['test'], 1)) + ->attachSnippetLine(3 * 4, 3 * 4 + 3, new SnippetLine('Try to test 1.', SnippetSource::FORMAT_PLAIN_TEXT, $stemmer, ['test'], 1)) + ->attachSnippetLine(4 * 4, 4 * 4 + 3, new SnippetLine('Try to test 2.', SnippetSource::FORMAT_PLAIN_TEXT, $stemmer, ['test'], 1)) + ->attachSnippetLine(5 * 4, 5 * 4 + 3, new SnippetLine('Try to test 2.', SnippetSource::FORMAT_PLAIN_TEXT, $stemmer, ['test'], 1)) + ->attachSnippetLine(6 * 4, 6 * 4 + 3, new SnippetLine('Try to test 2.', SnippetSource::FORMAT_PLAIN_TEXT, $stemmer, ['test'], 1)) + ->attachSnippetLine(7 * 4, 7 * 4 + 3, new SnippetLine('Try to test 3.', SnippetSource::FORMAT_PLAIN_TEXT, $stemmer, ['test'], 1)) + ->attachSnippetLine(8 * 4, 8 * 4 + 3, new SnippetLine('Try to test 3.', SnippetSource::FORMAT_PLAIN_TEXT, $stemmer, ['test'], 1)) + ->attachSnippetLine(9 * 4, 9 * 4 + 3, new SnippetLine('Try to test 3.', SnippetSource::FORMAT_PLAIN_TEXT, $stemmer, ['test'], 1)) + ->attachSnippetLine(10 * 4, 10 * 4 + 3, new SnippetLine('Try to test 4.', SnippetSource::FORMAT_PLAIN_TEXT, $stemmer, ['test'], 1)) + ->attachSnippetLine(11 * 4, 11 * 4 + 3, new SnippetLine('Try to test 4.', SnippetSource::FORMAT_PLAIN_TEXT, $stemmer, ['test'], 2)) ; $this->assertEquals( 'Try to test 1... Try to test 2... 
Try to test 4.', - $snippet->toString(0.6) + $snippet->toString() ); } public function testEmptySnippet() { - $snippet = new Snippet(0, '%s', new SnippetLine('introduction', SnippetSource::FORMAT_PLAIN_TEXT, [], 0.0)); + $stemmer = new PorterStemmerEnglish(); + $snippet = new Snippet('%s', new SnippetLine('introduction', SnippetSource::FORMAT_PLAIN_TEXT, $stemmer, [], 0.0)); $snippet->toString(); - $snippet = new Snippet(0, '%s', new SnippetLine('introduction', SnippetSource::FORMAT_PLAIN_TEXT, [], 0.0)); - $snippet->attachSnippetLine(1, 1, new SnippetLine('line1', SnippetSource::FORMAT_PLAIN_TEXT, [], 0)); + $snippet = new Snippet('%s', new SnippetLine('introduction', SnippetSource::FORMAT_PLAIN_TEXT, $stemmer, [], 0.0)); + $snippet->attachSnippetLine(1, 1, new SnippetLine('line1', SnippetSource::FORMAT_PLAIN_TEXT, $stemmer, [], 0)); $snippet->toString(); } } diff --git a/tests/unit/Rose/Helper/StringHelperTest.php b/tests/unit/Rose/Helper/StringHelperTest.php index 2e4f168..af2a5ac 100644 --- a/tests/unit/Rose/Helper/StringHelperTest.php +++ b/tests/unit/Rose/Helper/StringHelperTest.php @@ -1,9 +1,11 @@ - -1, 'u' => 1, 'b' => 1], ], + [ + '\i123 \b456 \i789', + '\i123 \b456 \i789\B\I\I', // NOTE: This is not what one expects. 
Current implementation does not account for the same nested tags since they do not make sense + ['i' => 2, 'b' => 1], + ], + [ + '\I 123 \i', + '\I 123 \i', + ['i' => 0], + ], + ]; + } + + /** + * @dataProvider getUnbalancedInternalFormattingDataProvider + */ + public function testGetUnbalancedInternalFormatting(string $text, array $expected): void + { + $this->assertEquals($expected, StringHelper::getUnbalancedInternalFormatting($text)); + } + + public function getUnbalancedInternalFormattingDataProvider(): array + { + return [ + [ + '\\iThis is \\bformatted text\\I with \\Bspecial characters\\i.', + [['i'], []], + ], + [ + 'Normal text with escaped formatting symbols like \\\\draw or \\\\inline or \\\\\\\\uuu.', + [[], []], + ], + ['', [[], []]], + ['456789i', [[], []]], + [ + '456789\\I', + [[], ['I']], + ], + [ + '456789\\\\I', + [[], []], + ], + [ + '456789\\\\\\I', + [[], ['I']], + ], + [ + '456789\\\\\\\\I', + [[], []], + ], + [ + '456789\\\\\\\\\\I', + [[], ['I']], + ], + [ + '\\u456789', + [['u'], []], + ], + [ + '\\u\\D\\\\I\\b', + [['u', 'b'], ['D']], + ], + [ + '\i123 \b456 \i789', + [['i', 'b', 'i'], []], + ], + [ + '\I 123 \i', + [['i'], ['I']], + ], ]; } } diff --git a/tests/unit/Rose/IntegrationTest.php b/tests/unit/Rose/IntegrationTest.php index 73bce27..e733338 100644 --- a/tests/unit/Rose/IntegrationTest.php +++ b/tests/unit/Rose/IntegrationTest.php @@ -106,7 +106,7 @@ public function testFeatures( $this->assertEquals(2.5953804134970615, $items[0]->getRelevance()); $this->assertEquals(new \DateTime('2016-08-20 00:00:00+00:00'), $items[0]->getDate()); - $this->assertEquals('This is the second page to be indexed. Let\'s compose something new.', $items[0]->getSnippet(), 'No snippets due to keyword match, no description provided, first sentences are used.'); + $this->assertEquals('This is the second page to be indexed. 
Let's compose something new.', $items[0]->getSnippet(), 'No snippets due to keyword match, no description provided, first sentences are used.'); $resultSet2 = $finder->find((new Query('content'))->setLimit(2)); diff --git a/tests/unit/Rose/SnippetsTest.php b/tests/unit/Rose/SnippetsTest.php index da5f1f1..28f6f07 100644 --- a/tests/unit/Rose/SnippetsTest.php +++ b/tests/unit/Rose/SnippetsTest.php @@ -212,7 +212,31 @@ public function testSnippets(array $indexables) $resultSet = $this->finder->find(new Query('nu')); $this->assertEquals( - 'Абзац с формулой с буквой nu, которая не должна подсвечиваться в формуле $$E=h\nu$$.', + 'Абзац с формулой с буквой nu, которая не должна подсвечиваться в формуле $$E=h\nu$$. А это просто строка с формулой $$g{\mu\nu}$$.', + $resultSet->getItems()[0]->getFormattedSnippet() + ); + + $resultSet = $this->finder->find(new Query('mu')); + $this->assertEquals( + 'А это просто строка с формулой $$g{\mu\nu}$$.', + $resultSet->getItems()[0]->getFormattedSnippet() + ); + + $resultSet = $this->finder->find(new Query('экстремум')); + $this->assertEquals( + 'Экстремум функции, в первом приближении, восстанавливает абстрактный разрыв функции.', + $resultSet->getItems()[0]->getFormattedSnippet() + ); + + $resultSet = $this->finder->find(new Query('разрыв абстрактно')); + $this->assertEquals( + 'Экстремум функции, в первом приближении, восстанавливает абстрактный разрыв функции. · Отсюда естественно следует, что интеграл от функции, имеющий конечный разрыв обуславливает тригонометрический интеграл по поверхности, явно демонстрируя всю чушь вышесказанного. В соответствии с законом больших чисел, интеграл Пуассона стремительно обуславливает положительный разрыв функции.', + $resultSet->getItems()[0]->getFormattedSnippet() + ); + + $resultSet = $this->finder->find(new Query('восстанавливать приближение')); + $this->assertEquals( + 'Экстремум функции, в первом приближении, восстанавливает абстрактный разрыв функции. 
· Линейное программирование, в первом приближении, необходимо и достаточно. · Метод последовательных приближений, следовательно, реально создает график функции.', $resultSet->getItems()[0]->getFormattedSnippet() ); } @@ -225,7 +249,7 @@ public function indexableProvider() Натуралистическая парадигма, короче говоря, ограничивает экзистенциальный референдум. Политический процесс в современной России определяет гуманизм. Иначе говоря, политическая культура практически представляет собой механизм власти. Теория политических подсистем нетривиальна, что такое подсистема? Технология коммуникации обретает онтологический референдум, утверждает руководитель аппарата Правительства. Согласно теории Э.Тоффлера ("Шок будущего"), коллапс Советского Союза иллюстрирует твердый экзистенциальный континентально-европейский тип политической культуры. Марксизм вызывает современный референдум. В данном случае можно согласиться с Данилевским, считавшим, что информационно-технологическая революция сохраняет экзистенциальный референдум.'), - new Indexable('id_2', 'Анормальный предел последовательности: предпосылки и развитие', 'Функция выпуклая кверху вырождена. Функция многих переменных положительна. Экстремум функции, в первом приближении, восстанавливает абстрактный разрыв функции. Несмотря на сложности, аффинное преобразование реально отражает интеграл от функции, обращающейся в бесконечность вдоль линии. Теорема порождает интеграл от функции, обращающейся в бесконечность вдоль линии, откуда следует доказываемое равенство. + new Indexable('id_2', 'Анормальный предел последовательности: предпосылки и развитие', 'Функция выпуклая кверху вырождена. Функция многих переменных положительна. Экстремум функции, в первом приближении, восстанавливает абстрактный разрыв функции. Несмотря на сложности, аффинное преобразование реально отражает интеграл от функции, обращающейся в бесконечность вдоль линии. 
Теорема порождает интеграл от функции, обращающейся в бесконечность вдоль линии, откуда следует доказываемое равенство. Линейное программирование, в первом приближении, необходимо и достаточно. Отсюда естественно следует, что интеграл от функции, имеющий конечный разрыв обуславливает тригонометрический интеграл по поверхности, явно демонстрируя всю чушь вышесказанного. В соответствии с законом больших чисел, интеграл Пуассона стремительно обуславливает положительный разрыв функции. @@ -270,7 +294,7 @@ public function indexableProvider()

Ошибка астатически даёт более простую систему дифференциальных уравнений, если исключить небольшой угол тангажа. Если пренебречь малыми величинами, то видно, что механическая природа устойчиво требует большего внимания к анализу ошибок, которые даёт устойчивый маховик. Исходя из уравнения Эйлера, прибор вертикально позволяет пренебречь колебаниями корпуса, хотя этого в любом случае требует поплавковый ньютонометр.

-

Абзац с формулой с буквой nu, которая не должна подсвечиваться в формуле $$E=h\nu$$.

+

Абзац с формулой с буквой nu, которая не должна подсвечиваться в формуле $$E=h\nu$$. А это просто строка с формулой $$g{\mu\nu}$$.

Уравнение возмущенного движения поступательно характеризует подвижный объект. Прецессия гироскопа косвенно интегрирует нестационарный вектор угловой скорости, изменяя направление движения. Угловая скорость, обобщая изложенное, неподвижно не входит своими составляющими, что очевидно, в силы нормальных реакций связей, так же как и кожух. Динамическое уравнение Эйлера, в силу третьего закона Ньютона, вращательно связывает ньютонометр, не забывая о том, что интенсивность диссипативных сил, характеризующаяся величиной коэффициента D, должна лежать в определённых пределах. Еще 1 раз проверим, как gt работает защита против <script>alert();</script> xss-уязвимостей.

'), new Indexable('id_4', 'Мне не душно', 'Я просто не ощущаю уровень углекислого газа в воздухе. Меня не устраивает.'),