Skip to content

Commit

Permalink
WIP
Browse files Browse the repository at this point in the history
  • Loading branch information
parpalak committed Nov 14, 2023
1 parent 67c95db commit b4a1da8
Show file tree
Hide file tree
Showing 20 changed files with 243 additions and 822 deletions.
89 changes: 36 additions & 53 deletions src/S2/Rose/Entity/FulltextResult.php
Original file line number Diff line number Diff line change
Expand Up @@ -11,27 +11,11 @@

class FulltextResult
{
/**
* @var int
*/
protected $tocSize = 0;

/**
* @var FulltextQuery
*/
protected $query;

/**
* @var FulltextIndexContent
*/
protected $fulltextIndexContent;
protected int $tocSize = 0;
protected FulltextQuery $query;
protected FulltextIndexContent $fulltextIndexContent;

/**
* @param FulltextQuery $query
* @param FulltextIndexContent $fulltextIndexContent
* @param int $tocSize
*/
public function __construct(FulltextQuery $query, FulltextIndexContent $fulltextIndexContent, $tocSize = 0)
public function __construct(FulltextQuery $query, FulltextIndexContent $fulltextIndexContent, int $tocSize = 0)
{
$this->query = $query;
$this->fulltextIndexContent = $fulltextIndexContent;
Expand All @@ -40,13 +24,8 @@ public function __construct(FulltextQuery $query, FulltextIndexContent $fulltext

/**
* https://i.upmath.me/svg/%5Cbegin%7Btikzpicture%7D%5Bscale%3D1.0544%5D%5Csmall%0A%5Cbegin%7Baxis%7D%5Baxis%20line%20style%3Dgray%2C%0A%09samples%3D100%2C%0A%09xmin%3D-1.2%2C%20xmax%3D1.2%2C%0A%09ymin%3D0%2C%20ymax%3D1.1%2C%0A%09restrict%20y%20to%20domain%3D-0.1%3A1%2C%0A%09ytick%3D%7B1%7D%2C%0A%09xtick%3D%7B-1%2C1%7D%2C%0A%09axis%20equal%2C%0A%09axis%20x%20line%3Dcenter%2C%0A%09axis%20y%20line%3Dcenter%2C%0A%09xlabel%3D%24x%24%2Cylabel%3D%24y%24%5D%0A%5Caddplot%5Bred%2Cdomain%3D-2%3A1%2Csemithick%5D%7Bexp(-(x%2F0.38)%5E2)%7D%3B%0A%5Caddplot%5Bred%5D%20coordinates%20%7B(0.8%2C0.6)%7D%20node%7B%24y%3De%5E%7B-%5Cleft(x%2F0.38%5Cright)%5E2%7D%24%7D%3B%0A%5Cpath%20(axis%20cs%3A0%2C0)%20node%20%5Banchor%3Dnorth%20west%2Cyshift%3D-0.07cm%5D%20%7B0%7D%3B%0A%5Cend%7Baxis%7D%0A%5Cend%7Btikzpicture%7D
*
* @param int $tocSize
* @param int $foundTocEntriesNum
*
* @return float
*/
public static function frequencyReduction($tocSize, $foundTocEntriesNum)
public static function frequencyReduction(int $tocSize, int $foundTocEntriesNum): float
{
if ($tocSize < 5) {
return 1;
Expand All @@ -57,12 +36,8 @@ public static function frequencyReduction($tocSize, $foundTocEntriesNum)

/**
* Weight ratio for repeating words in the indexed item.
*
* @param int $repeatNum
*
* @return float
*/
protected static function repeatWeightRatio($repeatNum)
protected static function repeatWeightRatio(int $repeatNum): float
{
    // A single occurrence gives the ratio 1; every further occurrence adds 0.5.
    $ratio = 1.0 + 0.5 * ($repeatNum - 1);

    // The ratio is capped at 4 (reached at 7 occurrences).
    return $ratio > 4.0 ? 4.0 : $ratio;
}
Expand All @@ -71,12 +46,8 @@ protected static function repeatWeightRatio($repeatNum)
* Weight ratio for entry size (prefer some middle size)
*
* https://i.upmath.me/g/%5Cbegin%7Btikzpicture%7D%5Bscale%3D1.0544%5D%5Csmall%0A%5Cbegin%7Baxis%7D%5Baxis%20line%20style%3Dgray%2C%0A%09samples%3D100%2C%0A%09ymin%3D0%2C%20ymax%3D5%2C%0A%09xmin%3D0%2C%20xmax%3D1100%2C%0A%09ytick%3D%7B1%2C2%7D%2C%0A%09xtick%3D%7B50%2C200%2C500%2C1000%7D%2C%0A%09axis%20x%20line%3Dcenter%2C%0A%09axis%20y%20line%3Dcenter%2C%0A%09xlabel%3D%24x%24%2Cylabel%3D%24y%24%5D%0A%5Caddplot%5Bred%2Cdomain%3D0%3A1000%2Csemithick%5D%7B1%2F(1%2Bexp((sqrt(x)-18)%5E2%2F60))%2B1%7D%3B%0A%5Caddplot%5Bblue%2Cdomain%3D0%3A1000%2Csemithick%5D%7B1%7D%3B%0A%5Caddplot%5Bred%5D%20coordinates%20%7B(600%2C3)%7D%20node%7B%24y%3D1%2F(1%2Bexp((sqrt(x)-18)%5E2%2F60))%2B1%24%7D%3B%0A%5Cend%7Baxis%7D%0A%5Cend%7Btikzpicture%7D
*
* @param int $totalWordsNum
*
* @return float
*/
protected static function entrySizeWeightRatio($totalWordsNum)
protected static function entrySizeWeightRatio(int $totalWordsNum): float
{
    // Entries shorter than 10 words always get the neutral ratio.
    if ($totalWordsNum < 10) {
        return 1.0;
    }

    // The bonus is largest (ratio 1.5) when sqrt($totalWordsNum) equals 18, i.e. at 324 words,
    // and fades towards 1 as the size moves away from that point in either direction.
    $offset = sqrt($totalWordsNum) - 18;

    return 1.0 + 1.0 / (1.0 + exp($offset ** 2 / 60.0));
}
Expand All @@ -89,42 +60,54 @@ protected static function entrySizeWeightRatio($totalWordsNum)
*
* @return float
*/
protected static function neighbourWeight($distance)
protected static function neighbourWeight(float $distance): float
{
    // The weight is 30 at zero distance and halves at a distance of 7;
    // the sign of the distance does not matter because it is squared.
    $scaledDistance = $distance / 7.0;

    return 30.0 / (1 + $scaledDistance ** 2);
}

/**
* @param ResultSet $resultSet
*
* @throws ImmutableException
*/
public function fillResultSet(ResultSet $resultSet)
public function fillResultSet(ResultSet $resultSet): void
{
// $queryWordCount = $this->query->getCount();

$wordReductionRatios = [];
foreach ($this->fulltextIndexContent->toArray() as $word => $items) {
$reductionRatio = self::frequencyReduction($this->tocSize, count($items));
foreach ($this->fulltextIndexContent->toArray() as $word => $indexedItems) {
$reductionRatio = self::frequencyReduction($this->tocSize, \count($indexedItems));
$wordReductionRatios[$word] = $reductionRatio;

foreach ($items as $positions) {
$weights = [
'abundance_reduction' => $reductionRatio,
'repeat_multiply' => self::repeatWeightRatio(count($positions['pos'])),
'entry_size' => self::entrySizeWeightRatio($positions['wordCount']),
];
$resultSet->addWordWeight($word, $positions['extId'], $weights, $positions['pos']);
foreach ($indexedItems as $positions) {
$externalId = $positions['extId'];
if (\count($positions['pos']) > 0) {
$weights = [
'abundance_reduction' => $reductionRatio,
'repeat_multiply' => self::repeatWeightRatio(\count($positions['pos'])),
'entry_size' => self::entrySizeWeightRatio($positions['wordCount']),
];
$resultSet->addWordWeight($word, $externalId, $weights, $positions['pos']);
}
if (\count($positions['kpos']) > 0) {
$resultSet->addWordWeight($word, $externalId, [
'keyword' => 15,
'abundance_reduction' => $reductionRatio,
]);
}
if (\count($positions['tpos']) > 0) {
$resultSet->addWordWeight($word, $externalId, [
'title' => 25,
// TODO seems like this was not used before
// 'abundance_reduction' => $reductionRatio,
]);
}
}
}

$referenceContainer = $this->query->toWordPositionContainer();

$this->fulltextIndexContent->iterateWordPositions(
$this->fulltextIndexContent->iterateContentWordPositions(
static function (ExternalId $id, WordPositionContainer $container) use ($referenceContainer, $wordReductionRatios, $resultSet) {
$pairsDistance = $container->compareWith($referenceContainer);
foreach ($pairsDistance as $pairDistance) {
list($word1, $word2, $distance) = $pairDistance;
[$word1, $word2, $distance] = $pairDistance;
$weight = self::neighbourWeight($distance);
if (isset($wordReductionRatios[$word1])) {
$weight *= $wordReductionRatios[$word1];
Expand Down
12 changes: 0 additions & 12 deletions src/S2/Rose/Exception/UnknownKeywordTypeException.php

This file was deleted.

64 changes: 2 additions & 62 deletions src/S2/Rose/Finder.php
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
<?php
/**
* Fulltext and keyword search
* Fulltext search
*
* @copyright 2010-2023 Roman Parpalak
* @license MIT
Expand All @@ -17,17 +17,13 @@
use S2\Rose\Exception\ImmutableException;
use S2\Rose\Exception\LogicException;
use S2\Rose\Exception\UnknownIdException;
use S2\Rose\Exception\UnknownKeywordTypeException;
use S2\Rose\Snippet\SnippetBuilder;
use S2\Rose\Stemmer\StemmerInterface;
use S2\Rose\Storage\Dto\SnippetQuery;
use S2\Rose\Storage\StorageReadInterface;

class Finder
{
public const TYPE_TITLE = 1;
public const TYPE_KEYWORD = 2;

protected StorageReadInterface $storage;
protected StemmerInterface $stemmer;
protected ?string $highlightTemplate = null;
Expand Down Expand Up @@ -63,19 +59,10 @@ public function find(Query $query, bool $isDebug = false): ResultSet
$resultSet->setHighlightTemplate($this->highlightTemplate);
}

$rawWords = $query->valueToArray();
$cleanedQuery = implode(' ', $rawWords);
$rawWords = $query->valueToArray();
$resultSet->addProfilePoint('Input cleanup');

if (\count($rawWords) > 1) {
$this->findSpacedKeywords($cleanedQuery, $query->getInstanceId(), $resultSet);
$resultSet->addProfilePoint('Keywords with space');
}

if (\count($rawWords) > 0) {
$this->findSimpleKeywords($rawWords, $query->getInstanceId(), $resultSet);
$resultSet->addProfilePoint('Simple keywords');

$this->findFulltext($rawWords, $query->getInstanceId(), $resultSet);
$resultSet->addProfilePoint('Fulltext search');
}
Expand Down Expand Up @@ -108,23 +95,6 @@ public static function fulltextRateExcludeNum(int $tocSize): int
return max($tocSize * 0.5, 20);
}

/**
 * Maps a keyword index entry type to the weight contributed by a match of that type.
 *
 * @return array<string, int> ['title' => 25] for TYPE_TITLE, ['keyword' => 15] for TYPE_KEYWORD
 * @throws UnknownKeywordTypeException when $type is neither TYPE_TITLE nor TYPE_KEYWORD
 */
protected static function getKeywordWeight(int $type): array
{
    switch ($type) {
        case self::TYPE_TITLE:
            return ['title' => 25];

        case self::TYPE_KEYWORD:
            return ['keyword' => 15];

        default:
            throw new UnknownKeywordTypeException(sprintf('Unknown type "%s"', $type));
    }
}

/**
* @throws ImmutableException
*/
Expand All @@ -141,36 +111,6 @@ protected function findFulltext(array $words, ?int $instanceId, ResultSet $resul
$fulltextResult->fillResultSet($resultSet);
}

/**
 * Looks the given words and their stems up in the single keyword index
 * and adds the weight of every match to the result set.
 *
 * @param string[] $words
 */
protected function findSimpleKeywords(array $words, ?int $instanceId, ResultSet $result): void
{
    // The lookup list holds the raw words followed by the stem of each word.
    $lookupWords = $words;
    foreach ($words as $rawWord) {
        $lookupWords[] = $this->stemmer->stemWord($rawWord);
    }

    $keywordIndex = $this->storage->getSingleKeywordIndexByWords($lookupWords, $instanceId);
    foreach ($keywordIndex as $word => $content) {
        $addWeights = static function (ExternalId $externalId, $type, $tocSize, $foundTocEntriesNum) use ($word, $result) {
            $weights = self::getKeywordWeight($type);

            // The frequency reduction is applied only when both numbers are available.
            $hasTocStats = $tocSize !== null && $foundTocEntriesNum !== null;
            if ($hasTocStats) {
                $weights['abundance_reduction'] = FulltextResult::frequencyReduction($tocSize, $foundTocEntriesNum);
            }

            $result->addWordWeight($word, $externalId, $weights);
        };

        $content->iterate($addWeights);
    }
}

/**
 * Looks the whole string up in the multiple keyword index and adds
 * the weight of every match to the result set under that string.
 */
protected function findSpacedKeywords(string $string, ?int $instanceId, ResultSet $result): void
{
    $addWeight = static function (ExternalId $externalId, $type) use ($string, $result) {
        $result->addWordWeight($string, $externalId, self::getKeywordWeight($type));
    };

    $this->storage
        ->getMultipleKeywordIndexByString($string, $instanceId)
        ->iterate($addWeight);
}

public function buildSnippets(array $relevanceByExternalIds, ResultSet $resultSet): void
{
$snippetQuery = new SnippetQuery(ExternalIdCollection::fromStringArray(array_keys($relevanceByExternalIds)));
Expand Down
94 changes: 46 additions & 48 deletions src/S2/Rose/Indexer.php
Original file line number Diff line number Diff line change
Expand Up @@ -76,65 +76,26 @@ protected static function arrayFromStr(string $contents): array
return $words;
}

/**
 * Stores a keyword of the given type for the external id.
 *
 * Empty words are ignored. A word containing a space goes to the multiple keyword index,
 * any other word goes to the single keyword index.
 */
protected function addKeywordToIndex(string $word, ExternalId $externalId, int $type): void
{
    if ($word === '') {
        return;
    }

    // Both spellings of the letter are stored as the same character.
    $normalizedWord = str_replace('ё', 'е', $word);

    if (strpos($normalizedWord, ' ') === false) {
        $this->storage->addToSingleKeywordIndex($normalizedWord, $externalId, $type);

        return;
    }

    $this->storage->addToMultipleKeywordIndex($normalizedWord, $externalId, $type);
}

protected function addToIndex(ExternalId $externalId, string $title, ContentWithMetadata $content, string $keywords): void
{
// Processing title
foreach (self::arrayFromStr($title) as $titleWord) {
$this->addKeywordToIndex($this->stemmer->stemWord(trim($titleWord)), $externalId, Finder::TYPE_TITLE);
}

// Processing keywords
foreach (explode(',', $keywords) as $item) {
$this->addKeywordToIndex($this->stemmer->stemWord(trim($item)), $externalId, Finder::TYPE_KEYWORD);
}

// Fulltext index

$sentenceCollection = $content->getSentenceMap()->toSentenceCollection();
$words = $sentenceCollection->getWordsArray();
$words = array_merge($words, self::arrayFromStr(str_replace(', ', ' ', $keywords)));

$subWords = [];

foreach ($words as $i => &$word) {
if ($this->storage->isExcluded($word)) {
foreach ($words as $i => $word) {
if ($this->storage->isExcludedWord($word)) {
unset($words[$i]);
continue;
}

$stemmedWord = $this->stemmer->stemWord($word, false);

// If the word contains punctuation marks like hyphen, add a variant without it
if (false !== strpbrk($stemmedWord, '-.,')) {
foreach (preg_split('#[\-.,]#', $word) as $k => $subWord) {
if ($subWord) {
$subWords[(string)($i + 0.001 * ($k + 1))] = $this->stemmer->stemWord($subWord, false);
}
}
}

$word = $stemmedWord;
}
unset($word);

$this->storage->addMetadata($externalId, \count($words), $content->getImageCollection());
$this->storage->addSnippets($externalId, ...$sentenceCollection->getSnippetSources());
$this->storage->addToFulltext(array_merge($words, $subWords), $externalId);
$this->storage->addToFulltextIndex(
$this->getStemsWithComponents(self::arrayFromStr($title)),
$this->getStemsWithComponents(self::arrayFromStr($keywords)), // TODO consider different semantics of space and comma?
$this->getStemsWithComponents($words),
$externalId
);
}

public function removeById(string $id, ?int $instanceId): void
Expand Down Expand Up @@ -183,7 +144,7 @@ protected function doIndex(Indexable $indexable): void

$this->storage->addEntryToToc($indexable->toTocEntry(), $externalId);

if (!$oldTocEntry || $oldTocEntry->getHash() !== $indexable->calcHash()) {
if ($oldTocEntry === null || $oldTocEntry->getHash() !== $indexable->calcHash()) {
$this->storage->removeFromIndex($externalId);

$extractionResult = $this->extractor->extract($indexable->getContent());
Expand Down Expand Up @@ -219,4 +180,41 @@ protected function doIndex(Indexable $indexable): void
throw $e;
}
}

/**
 * Replaces words with stems. Also, this method detects compound words and adds the component stems to the result.
 *
 * The keys of the input are the positions of the words and they are preserved in the result.
 * Every component of a compound word gets a string key built from the position of the compound word
 * plus a fractional part (0.001 for the first fragment, 0.002 for the second one and so on),
 * so that several stems can be mapped to one position. For example, for the input
 *
 *     [10 => 'well-known', 11 => 'facts']
 *
 * an English stemmer is expected to give (components are appended after the words)
 *
 *     [10 => 'well-known', 11 => 'fact', '10.001' => 'well', '10.002' => 'known']
 *
 * @param array $words Words indexed by their positions.
 * @return array Stems indexed by the same positions, followed by the stems of compound word components.
 */
private function getStemsWithComponents(array $words): array
{
    $stems                     = [];
    $componentsOfCompoundWords = [];
    foreach ($words as $i => $word) {
        $stemmedWord = $this->stemmer->stemWord($word, false);
        $stems[$i]   = $stemmedWord;

        // Only words whose stem contains punctuation marks like hyphen get additional variants without it
        if (false === strpbrk($stemmedWord, '-.,')) {
            continue;
        }

        foreach (preg_split('#[\-.,]#', $word) as $k => $subWord) {
            // Skip empty fragments only (e.g. produced by a leading or doubled separator).
            // A plain truthiness check would also drop the fragment "0".
            if ($subWord !== '') {
                $componentsOfCompoundWords[(string)($i + 0.001 * ($k + 1))] = $this->stemmer->stemWord($subWord, false);
            }
        }
    }

    // The union operator keeps the integer position keys. array_merge() would renumber them from zero,
    // so the "position.fraction" keys of the components would no longer refer to their compound words
    // when the input has gaps in positions.
    return $stems + $componentsOfCompoundWords;
}
}
Loading

0 comments on commit b4a1da8

Please sign in to comment.