diff --git a/.php-cs-fixer.php b/.php-cs-fixer.php index 67a1ab7e..3dd16346 100644 --- a/.php-cs-fixer.php +++ b/.php-cs-fixer.php @@ -19,7 +19,7 @@ 'method_chaining_indentation' => false, 'multiline_whitespace_before_semicolons' => false, 'native_function_invocation' => ['include' => ['@all']], - 'no_superfluous_phpdoc_tags' => false, + 'no_superfluous_phpdoc_tags' => true, 'no_unset_on_property' => false, 'ordered_imports' => ['imports_order' => ['class', 'function', 'const'], 'sort_algorithm' => 'alpha'], 'php_unit_internal_class' => false, @@ -27,8 +27,9 @@ 'php_unit_test_class_requires_covers' => false, 'phpdoc_align' => false, 'phpdoc_types_order' => ['null_adjustment' => 'always_last', 'sort_algorithm' => 'none'], + 'phpdoc_no_alias_tag' => ['replacements' => ['type' => 'var', 'link' => 'see']], 'single_line_comment_style' => false, - 'trailing_comma_in_multiline' => ['elements' => ['arrays', 'arguments']], + 'trailing_comma_in_multiline' => ['elements' => ['arrays', 'arguments', 'match', 'parameters']], 'yoda_style' => false, 'php_unit_strict' => false, 'php_unit_test_annotation' => false, @@ -50,7 +51,8 @@ 'static_lambda' => true, 'ternary_to_null_coalescing' => true, 'use_arrow_functions' => true, - ]) + 'no_alias_language_construct_call' => true, + ]) ->setRiskyAllowed(true) ->setFinder($finder) -; + ; diff --git a/bundle/Command/IndexPageContentCommand.php b/bundle/Command/IndexPageContentCommand.php new file mode 100644 index 00000000..16069534 --- /dev/null +++ b/bundle/Command/IndexPageContentCommand.php @@ -0,0 +1,174 @@ + $sitesConfig + */ + public function __construct( + private readonly ContentService $contentService, + private readonly SearchHandler $searchHandler, + private readonly PersistenceHandler $persistenceHandler, + private readonly array $sitesConfig, + ) { + parent::__construct($this::$defaultName); + } + + protected function configure(): void + { + $this + ->setDescription('Index content related through layouts') + ->addOption( + 
'content-ids', + null, + InputOption::VALUE_OPTIONAL, + 'Comma separated list of content id\'s of content to index.', + ); + } + + protected function initialize(InputInterface $input, OutputInterface $output): void + { + $this->style = new SymfonyStyle($input, $output); + } + + /** + * @throws NotFoundException + * @throws InvalidArgumentException + * @throws UnauthorizedException + */ + protected function execute(InputInterface $input, OutputInterface $output): int + { + foreach ($this->sitesConfig as $site => $siteConfig) { + $this->style->info('Indexing for site ' . $site); + $this->indexContent($output, $input, $siteConfig); + } + + return Command::SUCCESS; + } + + /** + * Indexes the content of one configured site in chunks of 50, optionally restricted to the IDs given in --content-ids. + * + * @param array $siteConfig site configuration; its 'allowed_content_types' entry is read here + */ + private function indexContent(OutputInterface $output, InputInterface $input, array $siteConfig): void + { + // The option is null when omitted; casting and dropping blank entries yields [] so that no ContentId filter is applied. + $contentIds = array_values(array_filter(array_map(trim(...), explode(',', (string) $input->getOption('content-ids'))), static fn (string $id): bool => $id !== '')); + + $allowedContentTypes = $siteConfig['allowed_content_types']; + $offset = 0; + $limit = 50; + $totalCount = $this->getTotalCount($allowedContentTypes, $contentIds); + $progressBar = new ProgressBar($output, $totalCount); + + if ($totalCount <= 0) { + $this->style->info('No content found to index, exiting.'); + + return; + } + + $this->style->info('Found ' . $totalCount . ' content objects...'); + + $progressBar->start($totalCount); + + while ($offset < $totalCount) { + $chunk = $this->getChunk($limit, $offset, $allowedContentTypes, $contentIds); + + $this->processChunk($chunk, $progressBar); + + $offset += $limit; + } + + $progressBar->finish(); + + $output->writeln(''); + $this->style->info('Finished.'); + } + + /** + * @throws InvalidArgumentException + */ + private function getTotalCount(array $allowedContentTypes, array $contentIds): int + { + $filter = $this->getFilter($allowedContentTypes, $contentIds); + + $filter + ->withLimit(0) + ->withOffset(0); + + return $this->contentService->find($filter)->getTotalCount() ?? 
0; + } + + /** + * @throws InvalidArgumentException + */ + private function getChunk(int $limit, int $offset, array $allowedContentTypes, array $contentIds): ContentList + { + $filter = $this->getFilter($allowedContentTypes, $contentIds); + $filter + ->withLimit($limit) + ->withOffset($offset) + ; + + return $this->contentService->find($filter); + } + + private function getFilter(array $allowedContentTypes, array $contentIds = []): Filter + { + $filter = new Filter(); + $filter->withCriterion(new Query\Criterion\ContentTypeIdentifier($allowedContentTypes)); + + if (count($contentIds) > 0) { + $filter->andWithCriterion(new Query\Criterion\ContentId($contentIds)); + } + + return $filter; + } + + private function processChunk(ContentList $contentList, ProgressBar $progressBar): void + { + foreach ($contentList->getIterator() as $content) { + try { + $this->indexContentWithLocations($content); + $progressBar->advance(); + } catch (IndexPageUnavailableException $exception) { + $this->style->error($exception->getMessage()); + } + } + } + + private function indexContentWithLocations(Content $content): void + { + $this->searchHandler->indexContent( + $this->persistenceHandler->contentHandler()->load($content->id, $content->versionInfo->versionNo), + ); + + $locations = $this->persistenceHandler->locationHandler()->loadLocationsByContent($content->id); + foreach ($locations as $location) { + $this->searchHandler->indexLocation($location); + } + } +} diff --git a/bundle/DependencyInjection/Configuration.php b/bundle/DependencyInjection/Configuration.php index ce69b341..ff931cbe 100644 --- a/bundle/DependencyInjection/Configuration.php +++ b/bundle/DependencyInjection/Configuration.php @@ -8,6 +8,9 @@ use Symfony\Component\Config\Definition\Builder\TreeBuilder; use Symfony\Component\Config\Definition\ConfigurationInterface; +use function array_keys; +use function is_string; + class Configuration implements ConfigurationInterface { protected string $rootNodeName; @@ -25,6 
+28,7 @@ public function getConfigTreeBuilder(): TreeBuilder $this->addIndexableFieldTypeSection($rootNode); $this->addSearchResultExtractorSection($rootNode); $this->addAsynchronousIndexingSection($rootNode); + $this->addPageIndexingSection($rootNode); return $treeBuilder; } @@ -73,4 +77,94 @@ private function addAsynchronousIndexingSection(ArrayNodeDefinition $nodeDefinit ->end() ->end(); } + + private function addPageIndexingSection(ArrayNodeDefinition $nodeDefinition): void + { + $keyValidator = static function ($v) { + foreach (array_keys($v) as $key) { + if (!is_string($key)) { + return true; + } + } + + return false; + }; + $nodeDefinition + ->children() + ->arrayNode('page_indexing') + ->addDefaultsIfNotSet() + ->info('Page indexing configuration') + ->children() + ->booleanNode('enabled') + ->info('Use page indexing') + ->defaultFalse() + ->end() + ->arrayNode('sites') + ->useAttributeAsKey('name') + ->normalizeKeys(false) + ->validate() + ->ifTrue($keyValidator) + ->thenInvalid('Site name must be of string type') + ->end() + ->arrayPrototype() + ->children() + ->integerNode('tree_root_location_id') + ->info('Site root Location ID') + ->beforeNormalization()->always(static fn ($v) => is_string($v) ? 
(int) $v : $v)->end() + ->end() + ->arrayNode('languages_siteaccess_map') + ->info('Language code mapped to page siteaccess') + ->useAttributeAsKey('name') + ->normalizeKeys(false) + ->validate() + ->ifTrue($keyValidator) + ->thenInvalid('Language code must be of string type.') + ->end() + ->scalarPrototype() + ->validate() + ->ifTrue(static fn ($v) => !is_string($v)) + ->thenInvalid('Siteaccess name must be of string type.') + ->end() + ->end() + ->end() + ->arrayNode('fields') + ->info('Mapping of indexed field names to an array of HTML tag selectors') + ->validate() + ->ifTrue($keyValidator) + ->thenInvalid('Indexed field name must be of string type') + ->end() + ->arrayPrototype() + ->useAttributeAsKey('name') + ->normalizeKeys(false) + ->scalarPrototype() + ->validate() + ->ifTrue(static fn ($v) => !is_string($v)) + ->thenInvalid('HTML selector must be of string type.') + ->end() + ->end() + ->end() + ->end() + ->arrayNode('allowed_content_types') + ->info('Content types to index') + ->useAttributeAsKey('name') + ->normalizeKeys(false) + ->scalarPrototype() + ->validate() + ->ifTrue(static fn ($v) => !is_string($v)) + ->thenInvalid('Content type identifier must be of string type.') + ->end() + ->end() + ->end() + ->scalarNode('host') + ->info('Host to index page from, defined in .env files') + ->validate() + ->ifTrue(static fn ($v) => !is_string($v)) + ->thenInvalid('Host must be of string type.') + ->end() + ->end() + ->end() + ->end() + ->end() + ->end(); + } } diff --git a/bundle/DependencyInjection/NetgenIbexaSearchExtraExtension.php b/bundle/DependencyInjection/NetgenIbexaSearchExtraExtension.php index 729acae5..fd99e893 100644 --- a/bundle/DependencyInjection/NetgenIbexaSearchExtraExtension.php +++ b/bundle/DependencyInjection/NetgenIbexaSearchExtraExtension.php @@ -10,12 +10,21 @@ use Symfony\Component\DependencyInjection\Extension\PrependExtensionInterface; use Symfony\Component\DependencyInjection\Loader; use 
Symfony\Component\HttpKernel\DependencyInjection\Extension; - use Symfony\Component\Yaml\Yaml; + use function array_key_exists; +use function file_get_contents; class NetgenIbexaSearchExtraExtension extends Extension implements PrependExtensionInterface { + private static array $defaultConfiguration = [ + 'tree_root_location_id' => null, + 'languages_siteaccess_map' => [], + 'host' => null, + 'fields' => [], + 'allowed_content_types' => [], + ]; + public function getAlias(): string { return 'netgen_ibexa_search_extra'; @@ -82,12 +91,11 @@ private function loadBundleSolrEngine(ContainerBuilder $container): void private function processExtensionConfiguration(array $configs, ContainerBuilder $container): void { $configuration = $this->getConfiguration($configs, $container); - $configuration = $this->processConfiguration($configuration, $configs); - $this->processIndexableFieldTypeConfiguration($configuration, $container); $this->processSearchResultExtractorConfiguration($configuration, $container); $this->processAsynchronousIndexingConfiguration($configuration, $container); + $this->processPageIndexingConfiguration($configuration, $container); } private function processSearchResultExtractorConfiguration(array $configuration, ContainerBuilder $container): void @@ -117,4 +125,52 @@ private function processAsynchronousIndexingConfiguration(array $configuration, $configuration['use_asynchronous_indexing'], ); } + + private function processPageIndexingConfiguration(array $configuration, ContainerBuilder $container): void + { + $container->setParameter( + 'netgen_ibexa_search_extra.page_indexing.sites', + $configuration['page_indexing']['sites'] ?? [], + ); + + $container->setParameter( + 'netgen_ibexa_search_extra.page_indexing.enabled', + $configuration['page_indexing']['enabled'] ?? 
false, + ); + + foreach ($container->getParameter('netgen_ibexa_search_extra.page_indexing.sites') as $siteName => $config) { + $this->setPageIndexingSitesParameters($container, $siteName); + } + } + + private function setPageIndexingSitesParameters(ContainerBuilder $container, string $siteName): void + { + /** @var array $pageIndexingSitesConfig */ + $pageIndexingSitesConfig = $container->getParameter('netgen_ibexa_search_extra.page_indexing.sites'); + + if (!array_key_exists('tree_root_location_id', $container->getParameter('netgen_ibexa_search_extra.page_indexing.sites')[$siteName])) { + $pageIndexingSitesConfig[$siteName]['tree_root_location_id'] = null; + } + + if (!array_key_exists('languages_siteaccess_map', $container->getParameter('netgen_ibexa_search_extra.page_indexing.sites')[$siteName])) { + $pageIndexingSitesConfig[$siteName]['languages_siteaccess_map'] = []; + } + + if (!array_key_exists('host', $container->getParameter('netgen_ibexa_search_extra.page_indexing.sites')[$siteName])) { + $pageIndexingSitesConfig[$siteName]['host'] = null; + } + + if (!array_key_exists('fields', $container->getParameter('netgen_ibexa_search_extra.page_indexing.sites')[$siteName])) { + $pageIndexingSitesConfig[$siteName]['fields'] = []; + } + + if (!array_key_exists('allowed_content_types', $container->getParameter('netgen_ibexa_search_extra.page_indexing.sites')[$siteName])) { + $pageIndexingSitesConfig[$siteName]['allowed_content_types'] = []; + } + + $container->setParameter( + 'netgen_ibexa_search_extra.page_indexing.sites', + $pageIndexingSitesConfig, + ); + } } diff --git a/bundle/NetgenIbexaSearchExtraBundle.php b/bundle/NetgenIbexaSearchExtraBundle.php index eb3dbec3..e2674081 100644 --- a/bundle/NetgenIbexaSearchExtraBundle.php +++ b/bundle/NetgenIbexaSearchExtraBundle.php @@ -24,5 +24,7 @@ public function build(ContainerBuilder $container): void $container->addCompilerPass(new Compiler\FieldType\RichTextIndexablePass()); $container->addCompilerPass(new 
Compiler\SearchResultExtractorPass()); $container->addCompilerPass(new Compiler\RawFacetBuilderDomainVisitorPass()); + $container->addCompilerPass(new Compiler\PageIndexingPass()); + $container->addCompilerPass(new Compiler\ElasticsearchExtensibleDocumentFactoryPass()); } } diff --git a/composer.json b/composer.json index c629c7e4..f67205ed 100644 --- a/composer.json +++ b/composer.json @@ -15,7 +15,9 @@ "ext-dom": "*", "ibexa/core": "^4.6", "symfony/messenger": "^5.4", - "symfony/proxy-manager-bridge": "^5.4" + "symfony/proxy-manager-bridge": "^5.4", + "ext-libxml": "*", + "ext-curl": "*" }, "require-dev": { "ibexa/fieldtype-richtext": "^4.5", @@ -30,7 +32,8 @@ }, "suggest": { "netgen/ibexa-site-api": "Boost your site-building productivity with Ibexa CMS", - "ibexa/solr": "Supports advanced capabilities with Ibexa search API" + "ibexa/solr": "Supports advanced capabilities with Ibexa search API", + "ibexa/elasticsearch": "Supports advanced capabilities with Ibexa search API" }, "autoload": { "psr-4": { diff --git a/docs/reference/index.rst b/docs/reference/index.rst index 996318ee..c14b0b77 100644 --- a/docs/reference/index.rst +++ b/docs/reference/index.rst @@ -9,5 +9,7 @@ Reference subdocuments spellcheck_suggestions extra_fields + asynchronous_indexing + page_indexing .. 
include:: /reference/map.rst.inc diff --git a/docs/reference/map.rst.inc b/docs/reference/map.rst.inc index 626ee8e9..fb270269 100644 --- a/docs/reference/map.rst.inc +++ b/docs/reference/map.rst.inc @@ -4,3 +4,4 @@ * :doc:`/reference/subdocuments` * :doc:`/reference/spellcheck_suggestions` * :doc:`/reference/extra_fields` +* :doc:`/reference/page_indexing` diff --git a/docs/reference/page_indexing.rst b/docs/reference/page_indexing.rst new file mode 100644 index 00000000..62158241 --- /dev/null +++ b/docs/reference/page_indexing.rst @@ -0,0 +1,95 @@ +Page indexing +===================== + +This feature allows indexing of content by scraping the page using Symfony's HTTP client and indexing its content into document fields. + +Configuration +------------- +To enable this feature, set up the page indexing configuration: + +.. code-block:: yaml + + netgen_ibexa_search_extra: + page_indexing: + enabled: true + sites: + site1: + tree_root_location_id: '%site1.locations.tree_root.id%' + languages_siteaccess_map: + cro-HR: cro + eng-GB: eng + fields: + level_1: + - h1 + level_2: + - h2 + - h3 + - div.short + level_3: + - div.item-short + allowed_content_types: + - ng_article + - ng_frontpage + host: "%env(PAGE_INDEXING_HOST)%" + site2: + tree_root_location_id: '%site2.locations.tree_root.id%' + languages_siteaccess_map: + cro-HR: cro + eng-GB: eng + fields: + level_1: + - h1 + level_2: + - h2 + - h3 + - div.short + level_3: + - div.item-short + allowed_content_types: + - ng_landing_page + host: "%env(PAGE_INDEXING_HOST)%" + +To activate the feature, set the ``enabled`` parameter to true. Define the individual page sites under the ``sites`` +array parameter. In this example we have ``site1`` and ``site2``. For each site configuration, specify +``tree_root_location_id``, ``languages_siteaccess_map``, ``fields``, ``allowed_content_types`` and ``host``. + +``tree_root_location_id``: an integer defining the root location of the site we are configuring. 
+ +``languages_siteaccess_map``: Maps each language present on the site to the siteaccess used to render the page in that language; it determines which document is indexed for which language. + +``fields``: Defines the importance of text by HTML tags. Only the text under the specified HTML tags will be indexed. +Importance is indicated by listing tags under the desired level. You can also specify content importance by CSS class by +following the HTML tag with a class name as shown in the example. + +``allowed_content_types``: Only content types listed here will be indexed with additional fields from the page indexer. + +``host``: Define this parameter in the .env file. It's used by the Symfony HTTP client to resolve the page URL. + +PageTextExtractor +----------------- +The PageTextExtractor is a service that scrapes the page with Symfony's HTTP client. It contains a cache parameter that +holds the last 10 indexed contents by language. The entire logic is stored in the ``NativePageTextExtractor``, allowing +for new methods of indexing page content to be implemented if needed. This service extends ``PageTextExtractor``, so to +implement a different way of extracting page text, extend ``PageTextExtractor`` in your own service. + +This service also manages the fields configuration explained above. + +Command +------- +As a part of this feature we have implemented the ``IndexPageContentCommand``. + +This command is used to perform a complete page index when the feature is new to the project. It goes through all +content types specified in the configuration (``allowed_content_types``) and reindexes all existing content of the specified +types by their pages. + +To start the reindex, use the following command:: + + netgen-search-extra:index-page-content + + +The command also has an option ``content-ids``:: + + netgen-search-extra:index-page-content --content-ids=38 + + +To index multiple content IDs, add them to the command separated by commas, e.g. ``--content-ids=38,42``. 
diff --git a/lib/Container/Compiler/ElasticsearchExtensibleDocumentFactoryPass.php b/lib/Container/Compiler/ElasticsearchExtensibleDocumentFactoryPass.php new file mode 100644 index 00000000..950121d8 --- /dev/null +++ b/lib/Container/Compiler/ElasticsearchExtensibleDocumentFactoryPass.php @@ -0,0 +1,81 @@ +processVisitors($container, 'block_translation'); + $this->processVisitors($container, 'block'); + $this->processVisitors($container, 'content'); + $this->processVisitors($container, 'content_translation'); + $this->processVisitors($container, 'location'); + $this->processVisitors($container, 'location_translation'); + + $this->processDocumentFactory($container); + + $container + ->register(ContentVisibilityVisitor::class, ContentVisibilityVisitor::class) + ->addTag('ibexa.search.elasticsearch.query.content.criterion.visitor'); + + $container + ->register(LocationVisibilityVisitor::class, LocationVisibilityVisitor::class) + ->addTag('ibexa.search.elasticsearch.query.location.criterion.visitor'); + } + + public function processDocumentFactory(ContainerBuilder $container): void + { + $container + ->register(DocumentFactory::class, DocumentFactory::class) + ->setDecoratedService(DocumentFactoryInterface::class) + ->setArguments([ + new Reference('.inner'), + new Reference(Handler::class), + new Reference('netgen.ibexa_search_extra.elasticsearch.field_mapper.content.aggregate'), + new Reference('netgen.ibexa_search_extra.elasticsearch.field_mapper.location.aggregate'), + new Reference('netgen.ibexa_search_extra.elasticsearch.field_mapper.content_translation.aggregate'), + new Reference('netgen.ibexa_search_extra.elasticsearch.field_mapper.location_translation.aggregate'), + new Reference('netgen.ibexa_search_extra.elasticsearch.field_mapper.block.aggregate'), + new Reference('netgen.ibexa_search_extra.elasticsearch.field_mapper.block_translation.aggregate'), + ]); + } + + private function processVisitors(ContainerBuilder $container, string $name): void + { + if 
(!$container->hasDefinition(sprintf('netgen.ibexa_search_extra.elasticsearch.field_mapper.%s.aggregate', $name))) { + return; + } + + $aggregateDefinition = $container->getDefinition( + sprintf('netgen.ibexa_search_extra.elasticsearch.field_mapper.%s.aggregate', $name), + ); + + $this->registerMappers($aggregateDefinition, $container->findTaggedServiceIds(sprintf('netgen.ibexa_search_extra.elasticsearch.field_mapper.%s', $name))); + } + + private function registerMappers(Definition $definition, array $mapperIds): void + { + foreach (array_keys($mapperIds) as $id) { + $definition->addMethodCall('addMapper', [new Reference($id)]); + } + } +} diff --git a/lib/Container/Compiler/PageIndexingPass.php b/lib/Container/Compiler/PageIndexingPass.php new file mode 100644 index 00000000..23ea1975 --- /dev/null +++ b/lib/Container/Compiler/PageIndexingPass.php @@ -0,0 +1,32 @@ +getParameter( + 'netgen_ibexa_search_extra.page_indexing.enabled', + ); + + if ($usePageIndexing !== true) { + return; + } + + $container + ->register(ContentPageTextFieldMapper::class, ContentPageTextFieldMapper::class) + ->setArguments([ + new Reference('netgen.ibexa_search_extra.page_indexing.page_text_extractor'), + new Reference('netgen.ibexa_search_extra.page_indexing.site_access_resolver'), + ]) + ->addTag('ibexa.search.solr.field.mapper.content.translation'); + } +} diff --git a/lib/Core/Search/Common/PageTextExtractor.php b/lib/Core/Search/Common/PageTextExtractor.php new file mode 100644 index 00000000..38ac3d7e --- /dev/null +++ b/lib/Core/Search/Common/PageTextExtractor.php @@ -0,0 +1,10 @@ +|string>>> */ + private array $cache = []; + + private LoggerInterface $logger; + + public function __construct( + private readonly ContentHandler $contentHandler, + private readonly RouterInterface $router, + private readonly SiteConfigResolver $siteConfigResolver, + ) { + $this->logger = new NullLogger(); + } + + public function setLogger(LoggerInterface $logger): void + { + $this->logger = $logger; + 
} + + /** + * @return array|string> + */ + public function extractPageText(int $contentId, string $languageCode): array + { + if (isset($this->cache[$contentId][$languageCode])) { + return $this->cache[$contentId][$languageCode]; + } + + if (count($this->cache) > 10) { + $this->cache = []; + } + + $siteConfig = $this->siteConfigResolver->getSiteConfigForContent($contentId); + + try { + $html = $this->fetchPageSource($contentId, $languageCode, $siteConfig); + } catch (IndexPageUnavailableException|RuntimeException $e) { + $this->logger->error($e->getMessage()); + + return []; + } + + $textArray = $this->extractTextArray($html, $contentId); + + $this->cache[$contentId][$languageCode] = $textArray; + + return $textArray; + } + + /** + * @throws NotFoundException + */ + private function generateUrl(string $languageCode, int $contentId, array $siteConfig): string + { + $contentInfo = $this->contentHandler->loadContentInfo($contentId); + $siteAccess = $this->resolveSiteAccess($contentInfo, $languageCode); + + if (isset($siteConfig['host'])) { + $relativePath = $this->router->generate( + 'ibexa.url.alias', + [ + 'locationId' => (int) $contentInfo->mainLocationId, + 'siteaccess' => $siteAccess, + ], + UrlGeneratorInterface::RELATIVE_PATH, + ); + + return $siteConfig['host'] . 
$relativePath; + } + + return $this->router->generate( + 'ibexa.url.alias', + [ + 'locationId' => (int) $contentInfo->mainLocationId, + 'siteaccess' => $siteAccess, + ], + UrlGeneratorInterface::ABSOLUTE_URL, + ); + } + + private function resolveSiteAccess(ContentInfo $contentInfo, string $languageCode): string + { + $siteConfig = $this->siteConfigResolver->getSiteConfigForContent($contentInfo->id); + + if (!isset($siteConfig['languages_siteaccess_map'][$languageCode])) { + throw new RuntimeException( + sprintf( + 'Language not supported for matched siteaccess group %s', + $siteConfig['site'], + ), + ); + } + + return $siteConfig['languages_siteaccess_map'][$languageCode]; + } + + /** + * @param array> $textArray + * + * @return array> + */ + private function recursiveExtractTextArray(DOMNode $node, array &$textArray, int $contentId): array + { + if ($node->nodeType === XML_ELEMENT_NODE || $node->nodeType === XML_HTML_DOCUMENT_NODE) { + $fieldLevel = $this->getFieldName($node, $contentId); + + if ($fieldLevel !== null) { + $textArray[$fieldLevel][] = $node->textContent; + + return $textArray; + } + + foreach ($node->childNodes as $childNode) { + $this->recursiveExtractTextArray($childNode, $textArray, $contentId); + } + } + if ($node->nodeType === XML_TEXT_NODE) { + $textContent = trim($node->textContent); + if ($textContent !== '') { + $textArray['other'][] = $textContent; + } + } + + return $textArray; + } + + private function getFieldName(DOMNode $node, int $contentId): null|string + { + $siteConfig = $this->siteConfigResolver->getSiteConfigForContent($contentId); + $fields = $siteConfig['fields']; + + foreach ($fields as $level => $tags) { + foreach ($tags as $tag) { + $tagParts = explode('.', $tag); // Split tag and class if present + $tagName = $tagParts[0]; // Get the tag name + $class = $tagParts[1] ?? 
null; // Get the class if exists + + if ($node->nodeName !== $tagName) { + continue; + } + + if ($class !== null && !$this->hasClass($node, $class)) { + continue; + } + + return $level; + } + } + + return null; + } + + private function hasClass(DOMNode $node, string $className): bool + { + /** @var \DOMElement $node */ + $classes = explode(' ', $node->getAttribute('class')); + + return in_array($className, $classes, true); + } + + /** + * @throws NotFoundException + * @throws UnauthorizedException + * @throws RuntimeException + */ + private function fetchPageSource(int $contentId, string $languageCode, array $siteConfig): string + { + $url = $this->generateUrl($languageCode, $contentId, $siteConfig); + + $httpClient = HttpClient::create( + ); + + $response = $httpClient->request( + 'GET', + $url, + ); + + $html = $response->getContent(); + + if ($response->getStatusCode() !== 200) { + throw new IndexPageUnavailableException( + sprintf( + 'Could not fetch URL "%s": %s', + $url, + $response->getInfo()['error'], + ), + ); + } + + return $html; + } + + /** + * @return array> + */ + private function extractTextArray(string $html, int $contentId): array + { + $startTag = ''; + $endTag = ''; + + $startPos = mb_strpos($html, $startTag); + $endPos = mb_strpos($html, $endTag); + + $textArray = []; + + if ($startPos !== false && $endPos !== false) { + $startPos += mb_strlen($startTag); + $extractedContent = mb_substr($html, $startPos, $endPos - $startPos); + + libxml_use_internal_errors(true); + $doc = new DOMDocument(); + $doc->loadHTML($extractedContent); + libxml_use_internal_errors(false); + $textArray = $this->recursiveExtractTextArray($doc, $textArray, $contentId); + } + + return $textArray; + } +} diff --git a/lib/Core/Search/Common/SiteConfigResolver.php b/lib/Core/Search/Common/SiteConfigResolver.php new file mode 100644 index 00000000..6602d3f5 --- /dev/null +++ b/lib/Core/Search/Common/SiteConfigResolver.php @@ -0,0 +1,60 @@ + $sitesConfig + */ + public function 
__construct( + private readonly ContentHandler $contentHandler, + private readonly LocationHandler $locationHandler, + private readonly array $sitesConfig, + ) {} + + /** + * Returns the configuration of the first configured site whose tree root location ID appears in the path of the content's main location, with the site name added under the 'site' key. + * + * @throws RuntimeException when the main location cannot be loaded or no configured site matches + */ + public function getSiteConfigForContent(int $contentId): array + { + $contentInfo = $this->contentHandler->loadContentInfo($contentId); + + try { + $location = $this->locationHandler->load($contentInfo->mainLocationId); + } catch (NotFoundException) { + throw new RuntimeException( + sprintf( + 'Content #%d does not have a location', + $contentInfo->id, + ), + ); + } + + $pathString = $location->pathString; + $pathArray = explode('/', $pathString); + + foreach ($this->sitesConfig as $site => $siteConfig) { + // A null root loosely equals any empty segment produced by explode() (e.g. from a leading or trailing '/'), which would match every content; skip unset roots and compare strictly. + if (isset($siteConfig['tree_root_location_id']) && in_array((string) $siteConfig['tree_root_location_id'], $pathArray, true)) { + $siteConfig['site'] = $site; + + return $siteConfig; + } + } + + throw new RuntimeException( + sprintf( + 'Failed to match content ID %d to a siteaccess', + $contentInfo->id, + ), + ); + } +} diff --git a/lib/Core/Search/Elasticsearch/DocumentMapper/BlockFieldMapper.php b/lib/Core/Search/Elasticsearch/DocumentMapper/BlockFieldMapper.php new file mode 100644 index 00000000..88d829b1 --- /dev/null +++ b/lib/Core/Search/Elasticsearch/DocumentMapper/BlockFieldMapper.php @@ -0,0 +1,22 @@ +addMapper($mapper); + } + } + + /** + * Adds given $mapper to the internal array. 
+ */ + public function addMapper(BlockFieldMapper $mapper): void + { + $this->mappers[] = $mapper; + } + + public function accept(SPIContent $content): bool + { + return true; + } + + public function mapFields(SPIContent $content): array + { + $fields = []; + + foreach ($this->mappers as $mapper) { + if ($mapper->accept($content)) { + $fields = [...$fields, ...$mapper->mapFields($content)]; + } + } + + return $fields; + } +} diff --git a/lib/Core/Search/Elasticsearch/DocumentMapper/BlockTranslationFieldMapper.php b/lib/Core/Search/Elasticsearch/DocumentMapper/BlockTranslationFieldMapper.php new file mode 100644 index 00000000..c55e4295 --- /dev/null +++ b/lib/Core/Search/Elasticsearch/DocumentMapper/BlockTranslationFieldMapper.php @@ -0,0 +1,22 @@ +addMapper($mapper); + } + } + + /** + * Adds given $mapper to the internal array. + */ + public function addMapper(BlockTranslationFieldMapper $mapper): void + { + $this->mappers[] = $mapper; + } + + public function accept(SPIContent $content, string $languageCode): bool + { + return true; + } + + public function mapFields(SPIContent $content, string $languageCode): array + { + $fields = []; + + foreach ($this->mappers as $mapper) { + if ($mapper->accept($content, $languageCode)) { + $fields = [...$fields, ...$mapper->mapFields($content, $languageCode)]; + } + } + + return $fields; + } +} diff --git a/lib/Core/Search/Elasticsearch/DocumentMapper/BlockTranslationFieldMapper/BlockPageTextFieldMapper.php b/lib/Core/Search/Elasticsearch/DocumentMapper/BlockTranslationFieldMapper/BlockPageTextFieldMapper.php new file mode 100644 index 00000000..f2a45675 --- /dev/null +++ b/lib/Core/Search/Elasticsearch/DocumentMapper/BlockTranslationFieldMapper/BlockPageTextFieldMapper.php @@ -0,0 +1,49 @@ +siteConfigResolver->getSiteConfigForContent($content->versionInfo->contentInfo->id); + $fields = []; + $contentType = $this->contentTypeHandler->load($content->versionInfo->contentInfo->contentTypeId); + + if 
(in_array($contentType->identifier, $siteConfig['allowed_content_types'], true)) { + $text = $this->pageTextExtractor->extractPageText($content->versionInfo->contentInfo->id, $languageCode); + foreach ($text as $level => $value) { + $fields[] = new Field('page_text_' . $level, $value, new FullTextField()); + } + } + + return $fields; + } +} diff --git a/lib/Core/Search/Elasticsearch/DocumentMapper/ContentFieldMapper.php b/lib/Core/Search/Elasticsearch/DocumentMapper/ContentFieldMapper.php new file mode 100644 index 00000000..c2d77fd4 --- /dev/null +++ b/lib/Core/Search/Elasticsearch/DocumentMapper/ContentFieldMapper.php @@ -0,0 +1,22 @@ +addMapper($mapper); + } + } + + /** + * Adds given $mapper to the internal array. + */ + public function addMapper(ContentFieldMapper $mapper): void + { + $this->mappers[] = $mapper; + } + + public function accept(SPIContent $content): bool + { + return true; + } + + public function mapFields(SPIContent $content): array + { + $fields = []; + + foreach ($this->mappers as $mapper) { + if ($mapper->accept($content)) { + $fields = [...$fields, ...$mapper->mapFields($content)]; + } + } + + return $fields; + } +} diff --git a/lib/Core/Search/Elasticsearch/DocumentMapper/ContentFieldMapper/ContentVisibilityFieldMapper.php b/lib/Core/Search/Elasticsearch/DocumentMapper/ContentFieldMapper/ContentVisibilityFieldMapper.php new file mode 100644 index 00000000..81689af1 --- /dev/null +++ b/lib/Core/Search/Elasticsearch/DocumentMapper/ContentFieldMapper/ContentVisibilityFieldMapper.php @@ -0,0 +1,28 @@ +versionInfo->contentInfo->isHidden, + new BooleanField(), + )]; + } +} diff --git a/lib/Core/Search/Elasticsearch/DocumentMapper/ContentTranslationFieldMapper.php b/lib/Core/Search/Elasticsearch/DocumentMapper/ContentTranslationFieldMapper.php new file mode 100644 index 00000000..c2197087 --- /dev/null +++ b/lib/Core/Search/Elasticsearch/DocumentMapper/ContentTranslationFieldMapper.php @@ -0,0 +1,22 @@ +addMapper($mapper); + } + } + + /** + * 
Adds given $mapper to the internal array. + */ + public function addMapper(ContentTranslationFieldMapper $mapper): void + { + $this->mappers[] = $mapper; + } + + public function accept(SPIContent $content, string $languageCode): bool + { + return true; + } + + public function mapFields(SPIContent $content, string $languageCode): array + { + $fields = []; + + foreach ($this->mappers as $mapper) { + if ($mapper->accept($content, $languageCode)) { + $fields = [...$fields, ...$mapper->mapFields($content, $languageCode)]; + } + } + + return $fields; + } +} diff --git a/lib/Core/Search/Elasticsearch/DocumentMapper/DocumentFactory.php b/lib/Core/Search/Elasticsearch/DocumentMapper/DocumentFactory.php new file mode 100644 index 00000000..dfd6aa76 --- /dev/null +++ b/lib/Core/Search/Elasticsearch/DocumentMapper/DocumentFactory.php @@ -0,0 +1,81 @@ +innerDocumentFactory->fromContent($content); + + /** @var \Ibexa\Contracts\Elasticsearch\Mapping\ContentDocument[] $documents */ + $documents = iterator_to_array($result); + + foreach ($documents as $document) { + $contentFields = $this->contentFieldMapper->mapFields($content); + $contentTranslationDependentFields = $this->contentTranslationFieldMapper->mapFields($content, $document->languageCode); + $blockFields = $this->blockFieldMapper->mapFields($content); + $blockTranslationDependentFields = $this->blockTranslationMapper->mapFields($content, $document->languageCode); + + $document->fields = [ + ...$document->fields, + ...$contentFields, + ...$contentTranslationDependentFields, + ...$blockFields, + ...$blockTranslationDependentFields, + ]; + } + + return new ArrayIterator($documents); + } + + public function fromLocation(Location $location, ?Content $content = null): Iterator + { + if ($content === null) { + $content = $this->contentHandler->load($location->contentId); + } + $result = $this->innerDocumentFactory->fromLocation($location, $content); + + /** @var \Ibexa\Contracts\Elasticsearch\Mapping\LocationDocument[] 
$documents */ + $documents = iterator_to_array($result); + + foreach ($documents as $document) { + $locationFields = $this->locationFieldMapper->mapFields($location); + $locationTranslationDependentFields = $this->locationTranslationFieldMapper->mapFields($location, $document->languageCode); + $blockFields = $this->blockFieldMapper->mapFields($content); + $blockTranslationDependentFields = $this->blockTranslationMapper->mapFields($content, $document->languageCode); + + $document->fields = [ + ...$document->fields, + ...$locationFields, + ...$locationTranslationDependentFields, + ...$blockFields, + ...$blockTranslationDependentFields, + ]; + } + + return new ArrayIterator($documents); + } +} diff --git a/lib/Core/Search/Elasticsearch/DocumentMapper/LocationFieldMapper.php b/lib/Core/Search/Elasticsearch/DocumentMapper/LocationFieldMapper.php new file mode 100644 index 00000000..66c0cb46 --- /dev/null +++ b/lib/Core/Search/Elasticsearch/DocumentMapper/LocationFieldMapper.php @@ -0,0 +1,22 @@ +addMapper($mapper); + } + } + + /** + * Adds given $mapper to the internal array. 
+ */ + public function addMapper(LocationFieldMapper $mapper): void + { + $this->mappers[] = $mapper; + } + + public function accept(SPILocation $location): bool + { + return true; + } + + public function mapFields(SPILocation $location): array + { + $fields = []; + + foreach ($this->mappers as $mapper) { + if ($mapper->accept($location)) { + $fields = [...$fields, ...$mapper->mapFields($location)]; + } + } + + return $fields; + } +} diff --git a/lib/Core/Search/Elasticsearch/DocumentMapper/LocationFieldMapper/LocationVisibilityFieldMapper.php b/lib/Core/Search/Elasticsearch/DocumentMapper/LocationFieldMapper/LocationVisibilityFieldMapper.php new file mode 100644 index 00000000..d1916b71 --- /dev/null +++ b/lib/Core/Search/Elasticsearch/DocumentMapper/LocationFieldMapper/LocationVisibilityFieldMapper.php @@ -0,0 +1,36 @@ +contentHandler->load($location->contentId); + + return [ + new Field( + 'ng_location_visible', + !$location->hidden && !$location->invisible && !$content->versionInfo->contentInfo->isHidden, + new BooleanField(), + ), + ]; + } +} diff --git a/lib/Core/Search/Elasticsearch/DocumentMapper/LocationTranslationFieldMapper.php b/lib/Core/Search/Elasticsearch/DocumentMapper/LocationTranslationFieldMapper.php new file mode 100644 index 00000000..8926f892 --- /dev/null +++ b/lib/Core/Search/Elasticsearch/DocumentMapper/LocationTranslationFieldMapper.php @@ -0,0 +1,22 @@ +addMapper($mapper); + } + } + + /** + * Adds given $mapper to the internal array. 
+ */ + public function addMapper(LocationTranslationFieldMapper $mapper): void + { + $this->mappers[] = $mapper; + } + + public function accept(SPILocation $location, string $languageCode): bool + { + return true; + } + + public function mapFields(SPILocation $location, string $languageCode): array + { + $fields = []; + + foreach ($this->mappers as $mapper) { + if ($mapper->accept($location, $languageCode)) { + $fields = [...$fields, ...$mapper->mapFields($location, $languageCode)]; + } + } + + return $fields; + } +} diff --git a/lib/Core/Search/Elasticsearch/Query/CriterionVisitor/Content/VisibilityVisitor.php b/lib/Core/Search/Elasticsearch/Query/CriterionVisitor/Content/VisibilityVisitor.php new file mode 100644 index 00000000..0d5d2fba --- /dev/null +++ b/lib/Core/Search/Elasticsearch/Query/CriterionVisitor/Content/VisibilityVisitor.php @@ -0,0 +1,31 @@ + $value */ + $value = $criterion->value; + + return $value[0] === true; + } +} diff --git a/lib/Core/Search/Elasticsearch/Query/CriterionVisitor/Location/VisibilityVisitor.php b/lib/Core/Search/Elasticsearch/Query/CriterionVisitor/Location/VisibilityVisitor.php new file mode 100644 index 00000000..e82d2abc --- /dev/null +++ b/lib/Core/Search/Elasticsearch/Query/CriterionVisitor/Location/VisibilityVisitor.php @@ -0,0 +1,31 @@ + $value */ + $value = $criterion->value; + + return $value[0] === true; + } +} diff --git a/lib/Core/Search/Solr/FieldMapper/ContentTranslation/ContentPageTextFieldMapper.php b/lib/Core/Search/Solr/FieldMapper/ContentTranslation/ContentPageTextFieldMapper.php new file mode 100644 index 00000000..4bb2ac2a --- /dev/null +++ b/lib/Core/Search/Solr/FieldMapper/ContentTranslation/ContentPageTextFieldMapper.php @@ -0,0 +1,49 @@ +versionInfo->contentInfo->contentTypeId; + $allowedContentTypes = $this->siteConfigResolver->getSiteConfigForContent($content->versionInfo->contentInfo->id); + + if (!in_array($contentTypeIdentifier, $allowedContentTypes, true)) { + return []; + } + + $text = 
$this->pageTextExtractor->extractPageText($content->versionInfo->contentInfo->id, $languageCode); + $pageTextFields = []; + foreach ($text as $level => $value) { + $pageTextFields[] = new Field( + 'page_text_' . $level, + $value, + new FullTextField(), + ); + } + + return $pageTextFields; + } +} diff --git a/lib/Exception/IndexPageUnavailableException.php b/lib/Exception/IndexPageUnavailableException.php new file mode 100644 index 00000000..40e4813c --- /dev/null +++ b/lib/Exception/IndexPageUnavailableException.php @@ -0,0 +1,15 @@ +setParameter('kernel.bundles', []); } - public function providerForIndexableFieldTypeDefaultConfiguration(): array + public function provideIndexableFieldTypeDefaultConfigurationCases(): iterable { return [ [ @@ -71,7 +72,7 @@ public function providerForIndexableFieldTypeDefaultConfiguration(): array } /** - * @dataProvider providerForIndexableFieldTypeDefaultConfiguration + * @dataProvider provideIndexableFieldTypeDefaultConfigurationCases */ public function testIndexableFieldTypeDefaultConfiguration(array $configuration): void { @@ -91,6 +92,326 @@ public function testIndexableFieldTypeDefaultConfiguration(array $configuration) ); } + public function providePageIndexingConfigurationCases(): iterable + { + return [ + [ + [ + 'page_indexing' => [ + 'enabled' => true, + ], + ], + null, + [], + null, + [], + [], + ], + + [ + [ + 'page_indexing' => [ + 'enabled' => true, + 'sites' => [ + 'finaweb' => [ + 'tree_root_location_id' => '42', + ], + ], + ], + ], + 42, + [], + null, + [], + [], + ], + + [ + [ + 'page_indexing' => [ + 'enabled' => true, + 'sites' => [ + 'finaweb' => [ + 'languages_siteaccess_map' => [ + 'cro-HR' => 'fina_cro', + ], + ], + ], + ], + ], + null, + [ + 'cro-HR' => 'fina_cro', + ], + null, + [], + [], + ], + + [ + [ + 'page_indexing' => [ + 'enabled' => true, + 'sites' => [ + 'finaweb' => [ + 'host' => 'string', + ], + ], + ], + ], + null, + [], + 'string', + [], + [], + ], + + [ + [ + 'page_indexing' => [ + 'enabled' 
=> true, + 'sites' => [ + 'finaweb' => [ + 'fields' => [ + 'level1' => [ + 'h1', + 'h2', + ], + ], + ], + ], + ], + ], + null, + [], + null, + [ + 'level1' => [ + 'h1', + 'h2', + ], + ], + [], + ], + + [ + [ + 'page_indexing' => [ + 'enabled' => true, + 'sites' => [ + 'finaweb' => [ + 'allowed_content_types' => [ + 'ng_landing_page', + 'ng_frontpage', + ], + ], + ], + ], + ], + null, + [], + null, + [], + [ + 'ng_landing_page', + 'ng_frontpage', + ], + ], + + [ + [ + 'page_indexing' => [ + 'enabled' => true, + 'sites' => [ + 'finaweb' => [ + 'tree_root_location_id' => '42', + 'languages_siteaccess_map' => [ + 'cro-HR' => 'fina_cro', + ], + 'host' => 'string', + 'fields' => [ + 'level1' => [ + 'h1', + 'h2', + ], + ], + 'allowed_content_types' => [ + 'ng_landing_page', + 'ng_frontpage', + ], + ], + ], + ], + ], + 42, + [ + 'cro-HR' => 'fina_cro', + ], + 'string', + [ + 'level1' => [ + 'h1', + 'h2', + ], + ], + [ + 'ng_landing_page', + 'ng_frontpage', + ], + ], + ]; + } + + /** + * @dataProvider providePageIndexingConfigurationCases + */ + public function testPageIndexingConfiguration( + array $configuration, + ?int $expectedTreeRootLocationId, + array $expectedLanguagesSiteaccessMap, + ?string $expectedHost, + array $expectedFields, + array $expectedAllowedContentTypes, + ): void { + $this->load($configuration); + + $this->assertContainerBuilderHasParameter('netgen_ibexa_search_extra.page_indexing.sites'); + $sitesConfig = $this->container->getParameter('netgen_ibexa_search_extra.page_indexing.sites'); + + foreach ($sitesConfig as $site => $siteConfig) { + self::assertArrayHasKey( + 'tree_root_location_id', + $siteConfig, + ); + self::assertEquals($expectedTreeRootLocationId, $siteConfig['tree_root_location_id']); + + self::assertArrayHasKey( + 'languages_siteaccess_map', + $siteConfig, + ); + self::assertEquals($expectedLanguagesSiteaccessMap, $siteConfig['languages_siteaccess_map']); + + self::assertArrayHasKey( + 'fields', + $siteConfig, + ); + 
self::assertEquals($expectedFields, $siteConfig['fields']); + + self::assertArrayHasKey( + 'allowed_content_types', + $siteConfig, + ); + self::assertEquals($expectedAllowedContentTypes, $siteConfig['allowed_content_types']); + + self::assertArrayHasKey( + 'host', + $siteConfig, + ); + self::assertEquals($expectedHost, $siteConfig['host']); + } + } + + public function provideInvalidPageIndexingConfigurationCases(): iterable + { + return [ + [ + [ + 'page_indexing' => [ + 'finaweb' => [ + 'tree_root_location_id' => [], + ], + ], + ], + InvalidConfigurationException::class, + 'Expected "int", but got "array"', + ], + [ + [ + 'page_indexing' => [ + 'finaweb' => [ + 'tree_root_location_id' => true, + ], + ], + ], + InvalidConfigurationException::class, + 'Expected "int", but got "bool"', + ], + [ + [ + 'page_indexing' => [ + 'finaweb' => [ + 'languages_siteaccess_map' => [ + 'cro-HR' => 5, + ], + ], + ], + ], + InvalidConfigurationException::class, + 'Expected "string", but got "int"', + ], + [ + [ + 'page_indexing' => [ + 'finaweb' => [ + 'host' => [], + ], + ], + ], + InvalidConfigurationException::class, + 'Expected "string", but got "array"', + ], + [ + [ + 'page_indexing' => [ + 'finaweb' => [ + 'config' => [ + 'level1' => 'a', + ], + ], + ], + ], + InvalidConfigurationException::class, + 'Expected "array", but got "string"', + ], + [ + [ + 'page_indexing' => [ + 'finaweb' => [ + 'config' => [ + ['h1', 'h2'], + ], + ], + ], + ], + InvalidConfigurationException::class, + 'Array key (field importance level) must be of string type', + ], + [ + [ + 'page_indexing' => [ + 'finaweb' => [ + 'allowed_content_types' => [ + 34, + 52, + ], + ], + ], + ], + InvalidConfigurationException::class, + 'Expected "string", but got "int"', + ], + ]; + } + + /** + * @dataProvider provideInvalidPageIndexingConfigurationCases + */ + public function testInvalidPageIndexingConfiguration(array $siteRootsConfig): void + { + $this->expectException(InvalidConfigurationException::class); + 
$this->load($siteRootsConfig); + } + protected function getContainerExtensions(): array { return [ diff --git a/tests/lib/Integration/Implementation/Stubs/RouterStub.php b/tests/lib/Integration/Implementation/Stubs/RouterStub.php new file mode 100644 index 00000000..64fada4d --- /dev/null +++ b/tests/lib/Integration/Implementation/Stubs/RouterStub.php @@ -0,0 +1,37 @@ +