Skip to content

Commit

Permalink
Adopted refactoring in ArrayStorage and fixed tests.
Browse files Browse the repository at this point in the history
  • Loading branch information
parpalak committed Nov 15, 2023
1 parent f476ff1 commit 9a33080
Show file tree
Hide file tree
Showing 9 changed files with 187 additions and 219 deletions.
12 changes: 6 additions & 6 deletions src/S2/Rose/Indexer.php
Original file line number Diff line number Diff line change
Expand Up @@ -79,21 +79,21 @@ protected static function arrayFromStr(string $contents): array
protected function addToIndex(ExternalId $externalId, string $title, ContentWithMetadata $content, string $keywords): void
{
// Indexes a single document: builds the list of content words (with the
// keywords appended), removes excluded words, stores the word count and
// images as metadata, stores snippet sources, and finally hands the title,
// keyword and content stems to the storage's full-text index.
//
// NOTE(review): this hunk is rendered from a diff without +/- markers. The
// statements using $words appear to be the pre-change form of the
// $contentWords statements that directly follow them - confirm against the
// repository before treating both as live code.
$sentenceCollection = $content->getSentenceMap()->toSentenceCollection();
$words = $sentenceCollection->getWordsArray();
$words = array_merge($words, self::arrayFromStr(str_replace(', ', ' ', $keywords)));
$contentWords = $sentenceCollection->getWordsArray();
// ', ' in the keyword string is replaced by a single space before splitting,
// and the resulting keyword words are appended to the content words. As a
// consequence keywords are passed to the index twice below: once in the
// keyword argument and once as part of the content argument.
$contentWords = array_merge($contentWords, self::arrayFromStr(str_replace(', ', ' ', $keywords))); // TODO not to merge

// Remove every word the storage reports as excluded. unset() leaves gaps in
// the keys; the array is not reindexed afterwards.
foreach ($words as $i => $word) {
foreach ($contentWords as $i => $word) {
if ($this->storage->isExcludedWord($word)) {
unset($words[$i]);
unset($contentWords[$i]);
}
}

// The stored word count is taken after the excluded words were removed and
// includes the merged-in keyword words.
$this->storage->addMetadata($externalId, \count($words), $content->getImageCollection());
$this->storage->addMetadata($externalId, \count($contentWords), $content->getImageCollection());
$this->storage->addSnippets($externalId, ...$sentenceCollection->getSnippetSources());
// Argument order is title, keywords, content, external id. Note that here the
// keywords are split without the ', ' -> ' ' replacement used above.
$this->storage->addToFulltextIndex(
$this->getStemsWithComponents(self::arrayFromStr($title)),
$this->getStemsWithComponents(self::arrayFromStr($keywords)), // TODO consider different semantics of space and comma?
$this->getStemsWithComponents($words),
$this->getStemsWithComponents($contentWords),
$externalId
);
}
Expand Down
38 changes: 30 additions & 8 deletions src/S2/Rose/Storage/ArrayFulltextStorage.php
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@

class ArrayFulltextStorage implements FulltextProxyInterface
{
public const PREFIX_KEYWORD = 'K';
public const PREFIX_TITLE = 'T';

/**
* @var array|string[][]
*/
Expand Down Expand Up @@ -37,11 +40,17 @@ public function getByWord(string $word): array
$result = [];
foreach ($this->fulltextIndex[$word] as $id => $entries) {
if (\is_int($entries)) {
$result[$id][] = $entries;
$result[$id][self::TYPE_CONTENT][] = $entries;
} else {
$entries = explode('|', $entries);
foreach ($entries as $position) {
$result[$id][] = base_convert($position, 36, 10);
if ($position[0] === self::PREFIX_TITLE) {
$result[$id][self::TYPE_TITLE][] = base_convert(substr($position, 1), 36, 10);
} elseif ($position[0] === self::PREFIX_KEYWORD) {
$result[$id][self::TYPE_KEYWORD][] = base_convert(substr($position, 1), 36, 10);
} else {
$result[$id][self::TYPE_CONTENT][] = base_convert($position, 36, 10);
}
}
}
}
Expand All @@ -64,25 +73,38 @@ public function countByWord(string $word): int
/**
* {@inheritdoc}
*
* Records one occurrence of $word for the document $id in the in-memory index
* $this->fulltextIndex[$word][$id]. An empty word is ignored.
*
* Storage format of an index entry, as written by this method:
*  - a single content position is stored as a plain int (decimal);
*  - otherwise the entry is a string of base-36 positions joined with '|',
*    where title positions are prefixed with PREFIX_TITLE and keyword
*    positions with PREFIX_KEYWORD; content positions carry no prefix.
*
* Any $type other than TYPE_KEYWORD or TYPE_TITLE is stored without a prefix,
* i.e. in the same form as a content position.
*
* NOTE(review): this hunk is rendered from a diff without +/- markers, so both
* the three-argument and the four-argument signature (and both variants of
* some statements) are shown. The lines without $type / $positionStr appear to
* be the pre-change versions - confirm against the repository.
*/
public function addWord(string $word, int $id, int $position): void
public function addWord(string $word, int $id, int $type, int $position): void
{
$word = (string)$word;
if ($word === '') {
return;
}

if (isset($this->fulltextIndex[$word][$id])) {
// An entry already exists, so the new position is always appended in the
// base-36 string form, prefixed according to its type.
$positionStr = base_convert($position, 10, 36);
if ($type === self::TYPE_KEYWORD) {
$positionStr = self::PREFIX_KEYWORD . $positionStr;
} elseif ($type === self::TYPE_TITLE) {
$positionStr = self::PREFIX_TITLE . $positionStr;
}

$value = $this->fulltextIndex[$word][$id];
if (\is_int($value)) {
// There was the only one position, but it's no longer the case.
// Until now the entry held a single content position as an int.
// Convert it to base 36 so it can be joined with the new position.
$this->fulltextIndex[$word][$id] = base_convert($value, 10, 36) . '|' . base_convert($position, 10, 36);
$this->fulltextIndex[$word][$id] = base_convert($value, 10, 36) . '|' . $positionStr;
} else {
// The entry is already a '|'-separated string: append to it.
$this->fulltextIndex[$word][$id] = $value . '|' . base_convert($position, 10, 36);
$this->fulltextIndex[$word][$id] = $value . '|' . $positionStr;
}
} else {
// If there is the only one position in index, the position is stored as decimal number
// First occurrence for this word and document: a content position is stored
// as a decimal int, while a title or keyword position is stored right away
// as a prefixed base-36 string (so $position changes type from int to string).
if ($type === self::TYPE_KEYWORD) {
/** @noinspection CallableParameterUseCaseInTypeContextInspection */
$position = self::PREFIX_KEYWORD . base_convert($position, 10, 36);
} elseif ($type === self::TYPE_TITLE) {
/** @noinspection CallableParameterUseCaseInTypeContextInspection */
$position = self::PREFIX_TITLE . base_convert($position, 10, 36);
}
$this->fulltextIndex[$word][$id] = $position;
}
}
Expand Down
83 changes: 24 additions & 59 deletions src/S2/Rose/Storage/ArrayStorage.php
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
<?php
/**
* @copyright 2016-2020 Roman Parpalak
* @copyright 2016-2023 Roman Parpalak
* @license MIT
*/

Expand All @@ -20,40 +20,14 @@

abstract class ArrayStorage implements StorageReadInterface, StorageWriteInterface
{
/**
* @var array
*/
protected $excludedWords = [];

/**
* @var array
*/
protected $indexSingleKeywords = [];

/**
* @var array
*/
protected $indexBaseKeywords = [];

/**
* @var array
*/
protected $indexMultiKeywords = [];

/**
* @var array
*/
protected $metadata = [];
protected array $excludedWords = [];
protected array $metadata = [];

/**
* @var TocEntry[]
*/
protected $toc = [];

/**
* @var FulltextProxyInterface
*/
protected $fulltextProxy;
protected array $toc = [];
protected FulltextProxyInterface $fulltextProxy;

/**
* @var array|ExternalId
Expand All @@ -68,13 +42,19 @@ public function fulltextResultByWords(array $words, ?int $instanceId): FulltextI
$result = new FulltextIndexContent();
foreach ($words as $word) {
$data = $this->fulltextProxy->getByWord($word);
foreach ($data as $id => $positions) {
foreach ($data as $id => $positionsByType) {
$externalId = $this->externalIdFromInternalId($id);
if ($externalId === null) {
continue;
}
if ($instanceId === null || $externalId->getInstanceId() === $instanceId) {
$result->add($word, $externalId, $positions, isset($this->metadata[$id]) ? $this->metadata[$id]['wordCount'] : 0);
$result->add($word, new FulltextIndexPositionBag(
$externalId,
$positionsByType[FulltextProxyInterface::TYPE_TITLE] ?? [],
$positionsByType[FulltextProxyInterface::TYPE_KEYWORD] ?? [],
$positionsByType[FulltextProxyInterface::TYPE_CONTENT] ?? [],
isset($this->metadata[$id]) ? $this->metadata[$id]['wordCount'] : 0
));
}
}
}
Expand All @@ -84,6 +64,7 @@ public function fulltextResultByWords(array $words, ?int $instanceId): FulltextI

/**
* {@inheritdoc}
* @throws UnknownIdException
*/
public function getSnippets(SnippetQuery $snippetQuery): SnippetResult
{
Expand All @@ -108,12 +89,17 @@ public function getSnippets(SnippetQuery $snippetQuery): SnippetResult
* {@inheritdoc}
* @throws UnknownIdException
*/
public function addToFulltextIndex(array $contentWords, array $titleWords, array $keywords, ExternalId $externalId): void
public function addToFulltextIndex(array $titleWords, array $keywords, array $contentWords, ExternalId $externalId): void
{
// Registers every title, keyword and content word of one document in the
// full-text proxy. Each array is iterated as position => word; the key is
// cast to int and passed as the word position together with the type
// constant of the list it came from.
//
// NOTE(review): this hunk is rendered from a diff without +/- markers. The
// first signature (content, title, keywords) and the three-argument addWord()
// call appear to be the pre-change versions of the lines that follow them -
// confirm against the repository.
// TODO
// Resolve the external id to the internal id used as the index key.
$id = $this->internalIdFromExternalId($externalId);
foreach ($titleWords as $position => $word) {
$this->fulltextProxy->addWord($word, $id, FulltextProxyInterface::TYPE_TITLE, (int)$position);
}
foreach ($keywords as $position => $word) {
$this->fulltextProxy->addWord($word, $id, FulltextProxyInterface::TYPE_KEYWORD, (int)$position);
}
foreach ($contentWords as $position => $word) {
$this->fulltextProxy->addWord($word, $id, (int)$position);
$this->fulltextProxy->addWord($word, $id, FulltextProxyInterface::TYPE_CONTENT, (int)$position);
}
}

Expand Down Expand Up @@ -149,33 +135,12 @@ public function removeFromIndex(ExternalId $externalId): void

$this->fulltextProxy->removeById($internalId);

foreach ($this->indexSingleKeywords as &$data) {
foreach ($this->metadata as &$data) {
if (isset($data[$internalId])) {
unset($data[$internalId]);
}
}
unset($data);

foreach ($this->indexBaseKeywords as &$data2) {
if (isset($data2[$internalId])) {
unset($data2[$internalId]);
}
}
unset($data2);

foreach ($this->indexMultiKeywords as &$data3) {
if (isset($data3[$internalId])) {
unset($data3[$internalId]);
}
}
unset($data3);

foreach ($this->metadata as &$data4) {
if (isset($data4[$internalId])) {
unset($data4[$internalId]);
}
}
unset($data4);
}

/**
Expand Down Expand Up @@ -249,7 +214,7 @@ public function getTocByExternalId(ExternalId $externalId): ?TocEntry
{
$serializedExtId = $externalId->toString();

return isset($this->toc[$serializedExtId]) ? $this->toc[$serializedExtId] : null;
return $this->toc[$serializedExtId] ?? null;
}

/**
Expand Down
7 changes: 4 additions & 3 deletions src/S2/Rose/Storage/Database/PdoStorage.php
Original file line number Diff line number Diff line change
Expand Up @@ -112,12 +112,13 @@ public function getSnippets(SnippetQuery $snippetQuery): SnippetResult
*/
public function addToFulltextIndex(array $titleWords, array $keywords, array $contentWords, ExternalId $externalId): void
{
if (empty($contentWords)) {
$allWords = array_merge(array_values($contentWords), array_values($titleWords), array_values($keywords));
if (\count($allWords) === 0) {
return;
}

$internalId = $this->getInternalIdFromExternalId($externalId);
$wordIds = $this->getWordIds(array_merge(array_values($contentWords), array_values($titleWords), array_values($keywords)));
$wordIds = $this->getWordIds($allWords);

/**
* @see \S2\Rose\Entity\WordPositionContainer::compareArrays for sorting requirement
Expand Down Expand Up @@ -292,7 +293,7 @@ public function getSimilar(ExternalId $externalId, bool $includeFormatting, ?int
$row['snippet2'] = '';
}
// TODO take into account format_id of these snippets
$row['snippet'] = $includeFormatting ? StringHelper::convertInternalFormattingToHtml($row['snippet']) : StringHelper::clearInternalFormatting($row['snippet']);
$row['snippet'] = $includeFormatting ? StringHelper::convertInternalFormattingToHtml($row['snippet']) : StringHelper::clearInternalFormatting($row['snippet']);
$row['snippet2'] = $includeFormatting ? StringHelper::convertInternalFormattingToHtml($row['snippet2']) : StringHelper::clearInternalFormatting($row['snippet2']);
}

Expand Down
Loading

0 comments on commit 9a33080

Please sign in to comment.