From bca54fb969d4e22c7d196efa24f8bacd79689132 Mon Sep 17 00:00:00 2001 From: otsch Date: Fri, 14 Jul 2023 00:06:48 +0200 Subject: [PATCH] Fix reading input sitemap in HTTP crawl step The `Http::crawl()` step now also works with sitemaps as input URL, where the `<urlset>` tag contains attributes that would cause the symfony DomCrawler to not find any elements. --- CHANGELOG.md | 4 ++++ src/Steps/Loading/HttpCrawl.php | 3 ++- src/Steps/Sitemap/GetUrlsFromSitemap.php | 24 +++++++++++++++++++----- tests/_Integration/Http/CrawlingTest.php | 13 +++++++++++++ tests/_Integration/_Server/Crawling.php | 14 ++++++++++++++ 5 files changed, 52 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a290e57..320c37e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [1.1.4] - 2023-07-14 +### Fixed +* The `Http::crawl()` step now also works with sitemaps as input URL, where the `<urlset>` tag contains attributes that would cause the symfony DomCrawler to not find any elements. + ## [1.1.3] - 2023-06-29 ### Fixed * Improved `Json` step: if the target of the "each" (like `Json::each('target', [...])`) does not exist in the input JSON data, the step yields nothing and logs a warning. 
diff --git a/src/Steps/Loading/HttpCrawl.php b/src/Steps/Loading/HttpCrawl.php index aff69e8..18db438 100644 --- a/src/Steps/Loading/HttpCrawl.php +++ b/src/Steps/Loading/HttpCrawl.php @@ -5,6 +5,7 @@ use Closure; use Crwlr\Crawler\Loader\Http\Messages\RespondedRequest; use Crwlr\Crawler\Steps\Loading\Http\Document; +use Crwlr\Crawler\Steps\Sitemap\GetUrlsFromSitemap; use Crwlr\Url\Url; use Exception; use Generator; @@ -249,7 +250,7 @@ protected function getUrlsFromInitialResponse(RespondedRequest $respondedRequest */ protected function getUrlsFromSitemap(RespondedRequest $respondedRequest): array { - $domCrawler = new Crawler(Http::getBodyString($respondedRequest)); + $domCrawler = GetUrlsFromSitemap::fixUrlSetTag(new Crawler(Http::getBodyString($respondedRequest))); $urls = []; diff --git a/src/Steps/Sitemap/GetUrlsFromSitemap.php b/src/Steps/Sitemap/GetUrlsFromSitemap.php index 6494663..ad15e89 100644 --- a/src/Steps/Sitemap/GetUrlsFromSitemap.php +++ b/src/Steps/Sitemap/GetUrlsFromSitemap.php @@ -10,6 +10,24 @@ class GetUrlsFromSitemap extends Step { protected bool $withData = false; + /** + * Remove attributes from a sitemap's <urlset> tag + * + * Symfony's DomCrawler component has problems when a sitemap's <urlset> tag contains certain attributes. + * So, if the count of urls in the sitemap is zero, try to remove all attributes from the <urlset> tag. 
+ * + * @param Crawler $dom + * @return Crawler + */ + public static function fixUrlSetTag(Crawler $dom): Crawler + { + if ($dom->filter('urlset url')->count() === 0) { + return new Crawler(preg_replace('//', '', $dom->outerHtml())); + } + + return $dom; + } + public function withData(): static { $this->withData = true; @@ -22,11 +40,7 @@ public function withData(): static */ protected function invoke(mixed $input): Generator { - if ($input->filter('urlset url')->count() === 0) { - $xml = preg_replace('//', '', $input->outerHtml()); - - $input = new Crawler($xml); - } + $input = self::fixUrlSetTag($input); foreach ($input->filter('urlset url') as $urlNode) { $urlNode = new Crawler($urlNode); diff --git a/tests/_Integration/Http/CrawlingTest.php b/tests/_Integration/Http/CrawlingTest.php index 5a170e8..b53af19 100644 --- a/tests/_Integration/Http/CrawlingTest.php +++ b/tests/_Integration/Http/CrawlingTest.php @@ -188,6 +188,19 @@ public function getLoader(): TestLoader expect($crawler->getLoader()->loadedUrls)->toHaveCount(1); }); +it( + 'extracts URLs from a sitemap where the tag contains attributes that cause symfony DomCrawler to fail', + function () { + $crawler = (new Crawler()) + ->input('http://www.example.com/crawling/sitemap2.xml') + ->addStep(Http::crawl()->inputIsSitemap()); + + $crawler->runAndTraverse(); + + expect($crawler->getLoader()->loadedUrls)->toHaveCount(7); + } +); + it('loads only pages where the path starts with a certain string when method pathStartsWith() is called', function () { $crawler = (new Crawler()) ->input('http://www.example.com/crawling/sitemap.xml') diff --git a/tests/_Integration/_Server/Crawling.php b/tests/_Integration/_Server/Crawling.php index 1058280..c8e69e2 100644 --- a/tests/_Integration/_Server/Crawling.php +++ b/tests/_Integration/_Server/Crawling.php @@ -25,6 +25,20 @@ XML; } +if ($route === '/crawling/sitemap2.xml') { + echo << + +http://www.example.com/crawling/main +http://www.example.com/crawling/sub1 
+http://www.example.com/crawling/sub1/sub1 +http://www.example.com/crawling/sub2 +http://www.example.com/crawling/sub2/sub1 +http://www.example.com/crawling/sub2/sub1/sub1 + +XML; +} + if ($route === '/crawling/main') { echo <<