Skip to content

Commit

Permalink
Rewritten highlighting algorithm. IrregularWordsStemmerInterface is removed as it is not used anymore.
Browse files Browse the repository at this point in the history
  • Loading branch information
parpalak committed Aug 24, 2024
1 parent 251778d commit c93a642
Show file tree
Hide file tree
Showing 18 changed files with 434 additions and 363 deletions.
36 changes: 36 additions & 0 deletions src/S2/Rose/Entity/HighlightIntervals.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
<?php
/**
* @copyright 2024 Roman Parpalak
* @license https://opensource.org/license/mit MIT
*/

declare(strict_types=1);

namespace S2\Rose\Entity;

/**
 * Collects highlight intervals as [start, end] pairs.
 *
 * An interval added directly after another one (no skipInterval() call in
 * between) does not produce a new pair: only the end of the most recently
 * stored pair is moved to the new end value.
 */
class HighlightIntervals
{
    /**
     * Stored [start, end] pairs in the order they were opened.
     */
    protected array $highlightIntervals = [];

    /**
     * True when the latest stored pair should be extended by the next addInterval() call.
     */
    protected bool $hasPreviousInterval = false;

    /**
     * Registers an interval, or extends the latest stored one when the previous call
     * was also addInterval().
     *
     * @param int $start Start of the interval; ignored when the latest pair is extended.
     * @param int $end   End of the interval; becomes the end of the latest pair.
     */
    public function addInterval(int $start, int $end): void
    {
        if ($this->hasPreviousInterval) {
            // Continuation of a run: keep the stored start, move only the end.
            $lastIndex = \count($this->highlightIntervals) - 1;
            $this->highlightIntervals[$lastIndex][1] = $end;

            return;
        }

        // First interval of a new run: open a fresh pair and remember that it can be extended.
        $this->highlightIntervals[] = [$start, $end];
        $this->hasPreviousInterval = true;
    }

    /**
     * Ends the current run so that the next addInterval() call opens a new pair.
     */
    public function skipInterval(): void
    {
        $this->hasPreviousInterval = false;
    }

    /**
     * @return array List of the collected [start, end] pairs.
     */
    public function toArray(): array
    {
        return $this->highlightIntervals;
    }
}
30 changes: 9 additions & 21 deletions src/S2/Rose/Entity/ResultItem.php
Original file line number Diff line number Diff line change
@@ -1,16 +1,17 @@
<?php declare(strict_types=1);
<?php
/**
* @copyright 2016-2023 Roman Parpalak
* @copyright 2016-2024 Roman Parpalak
* @license MIT
*/

declare(strict_types=1);

namespace S2\Rose\Entity;

use S2\Rose\Entity\Metadata\ImgCollection;
use S2\Rose\Entity\Metadata\SnippetSource;
use S2\Rose\Exception\InvalidArgumentException;
use S2\Rose\Exception\RuntimeException;
use S2\Rose\Snippet\WordsByStemsExtractor;
use S2\Rose\Stemmer\StemmerInterface;

class ResultItem
Expand Down Expand Up @@ -113,7 +114,7 @@ public function getSnippet(): string
return $this->description;
}

$snippet = $this->snippet->toString(0.3);
$snippet = $this->snippet->toString();
if ($snippet) {
return $snippet;
}
Expand All @@ -127,7 +128,7 @@ public function getFormattedSnippet(): string
return $this->description;
}

$snippet = $this->snippet->toString(0.3, true);
$snippet = $this->snippet->toString(true);
if ($snippet) {
return $snippet;
}
Expand All @@ -137,8 +138,6 @@ public function getFormattedSnippet(): string

/**
* @param string[] $words
*
* @return $this
*/
public function setFoundWords(array $words): self
{
Expand All @@ -148,14 +147,7 @@ public function setFoundWords(array $words): self
}

/**
* TODO Refactor the highlight logic to a separate class.
*
* @param StemmerInterface $stemmer
*
* @return string
*
* @throws RuntimeException
* @see \S2\Rose\Snippet\SnippetBuilder::buildSnippet for duplicated logic
*/
public function getHighlightedTitle(StemmerInterface $stemmer): string
{
Expand All @@ -165,16 +157,12 @@ public function getHighlightedTitle(StemmerInterface $stemmer): string
throw new InvalidArgumentException('Highlight template must contain "%s" substring for sprintf() function.');
}

$extractor = new WordsByStemsExtractor($stemmer, $this->foundWords);

[$foundWords,] = $extractor->extract($this->title);


$snippetLine = new SnippetLine(
$this->title,
SnippetSource::FORMAT_PLAIN_TEXT,
array_keys($foundWords),
\count($foundWords)
$stemmer,
$this->foundWords,
0
);

return $snippetLine->getHighlighted($template, false);
Expand Down
49 changes: 8 additions & 41 deletions src/S2/Rose/Entity/Snippet.php
Original file line number Diff line number Diff line change
@@ -1,13 +1,12 @@
<?php declare(strict_types=1);
<?php
/**
* @copyright 2016-2023 Roman Parpalak
* @copyright 2016-2024 Roman Parpalak
* @license MIT
*/

namespace S2\Rose\Entity;
declare(strict_types=1);

use S2\Rose\Entity\Metadata\SnippetSource;
use S2\Rose\Helper\StringHelper;
namespace S2\Rose\Entity;

class Snippet
{
Expand All @@ -19,23 +18,19 @@ class Snippet
* @var SnippetLine[]
*/
protected array $snippetLines = [];
protected array $snippetLineWeights = [];

/**
* @var SnippetLine[]
*/
protected array $introductionSnippetLines;
protected string $textIntroduction = '';

protected int $foundWordCount = 0;

protected string $highlightTemplate;
private array $snippetMinWordPositions = [];
private array $snippetMaxWordPositions = [];
protected array $snippetMinWordPositions = [];
protected array $snippetMaxWordPositions = [];

public function __construct(int $foundWordNum, string $highlightTemplate, SnippetLine ...$introductionSnippetLines)
public function __construct(string $highlightTemplate, SnippetLine ...$introductionSnippetLines)
{
$this->foundWordCount = $foundWordNum;
$this->highlightTemplate = $highlightTemplate;
$this->introductionSnippetLines = $introductionSnippetLines;
}
Expand All @@ -50,7 +45,6 @@ public function setLineSeparator(string $lineSeparator): self
public function attachSnippetLine(int $minWordPosition, int $maxWordPosition, SnippetLine $snippetLine): self
{
$this->snippetLines[] = $snippetLine;
$this->snippetLineWeights[] = $snippetLine->getRelevance();
$this->snippetMinWordPositions[] = $minWordPosition;
$this->snippetMaxWordPositions[] = $maxWordPosition;

Expand All @@ -67,7 +61,7 @@ public function getTextIntroduction(bool $includeFormatting = false): string
return implode(' ', $result);
}

public function toString(float $acceptableRelevance = 0.6, bool $includeFormatting = false): ?string
public function toString(bool $includeFormatting = false): ?string
{
$stat = [];
foreach ($this->snippetLines as $index => $snippetLine) {
Expand Down Expand Up @@ -99,17 +93,11 @@ public function toString(float $acceptableRelevance = 0.6, bool $includeFormatti
$resultSnippetLines[$idx] = $this->snippetLines[$idx];
}

if ($this->calcLinesRelevance($resultSnippetLines) < $acceptableRelevance) {
return null;
}

return $this->implodeLines($resultSnippetLines, $includeFormatting);
}

/**
* @param array|SnippetLine[] $snippetLines
*
* @return string
*/
private function implodeLines(array $snippetLines, bool $includeFormatting): string
{
Expand Down Expand Up @@ -149,25 +137,4 @@ private function implodeLines(array $snippetLines, bool $includeFormatting): str

return $result;
}

/**
 * Computes the ratio of distinct found words in the given lines to $this->foundWordCount.
 *
 * @param array|SnippetLine[] $snippetLines Lines whose getFoundWords() results are combined.
 *
 * @return float|int Integer 0 when $this->foundWordCount is not positive, a float ratio otherwise.
 */
private function calcLinesRelevance(array $snippetLines)
{
    if ($this->foundWordCount <= 0) {
        return 0;
    }

    // Words are used as array keys so that repeated words are counted only once.
    $distinctWords = [];
    foreach ($snippetLines as $line) {
        foreach ($line->getFoundWords() as $foundWord) {
            $distinctWords[$foundWord] = true;
        }
    }

    // Multiplying by 1.0 keeps the result a float even when the division is exact.
    $distinctCount = \count($distinctWords);

    return $distinctCount * 1.0 / $this->foundWordCount;
}
}
Loading

0 comments on commit c93a642

Please sign in to comment.