Skip to content

Commit

Permalink
optimized image fetching by using asynchronous requests
Browse files Browse the repository at this point in the history
  • Loading branch information
Kaishiyoku committed Oct 13, 2024
1 parent 5699264 commit 13d0337
Show file tree
Hide file tree
Showing 3 changed files with 56 additions and 33 deletions.
55 changes: 31 additions & 24 deletions src/Helper.php
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,10 @@
use Carbon\Carbon;
use Exception;
use GuzzleHttp\Client;
use GuzzleHttp\Exception\ClientException;
use GuzzleHttp\Exception\ConnectException;
use GuzzleHttp\Promise\PromiseInterface;
use GuzzleHttp\Promise\Utils;
use GuzzleHttp\Psr7\Response;
use Illuminate\Support\Arr;
use Illuminate\Support\Collection;
use Illuminate\Support\Str;
use Symfony\Component\DomCrawler\Crawler;
Expand Down Expand Up @@ -105,31 +107,36 @@ public static function getImageUrlsForFeedItem(string $feedItemUrl, ?string $con
[, $imageUrls] = $matches;

// don't allow GIF images because those will most likely be tracking pixels
return (new Collection($imageUrls))
->map(function (string $imageUrl) use ($baseUrl) {
if (Str::startsWith($imageUrl, 'http')) {
return $imageUrl;
}

return $baseUrl.'/'.ltrim($imageUrl, '/');
})
->filter(fn (string $imageUrl) => self::getHttpContentTypeForUrl($imageUrl, $httpClient) !== 'image/gif');
return self::filterImageUrls(
(new Collection($imageUrls))
->map(fn (string $imageUrl) => Str::startsWith($imageUrl, 'http') ? $imageUrl : $baseUrl.'/'.ltrim($imageUrl, '/')),
$httpClient
)
->unique();
}

public static function getHttpContentTypeForUrl(string $url, Client $httpClient): ?string
/**
* Issue an asynchronous GET request for every given URL and keep only the URLs that respond with a non-GIF image.
*
* A URL is kept when its promise ends up fulfilled and the response's Content-Type header
* starts with "image/" and is not exactly "image/gif". URLs whose promise is not fulfilled are dropped.
*
* NOTE(review): the URL of each result is looked up via $urls->get($index), so this relies on the
* keys of $urls carrying through to the inspected results — confirm.
*
* @param Collection<string> $urls URLs to check
* @param Client $httpClient client used to send the requests
* @return Collection<string> the URLs that passed the content type check
*/
public static function filterImageUrls(Collection $urls, Client $httpClient): Collection
{
// NOTE(review): this try/catch looks like the removed body of the former getHttpContentTypeForUrl(),
// shown inline by the diff view without +/- markers; it reads $url, which is not a parameter of
// this method — confirm against the repository before relying on it.
try {
return $httpClient->get($url)->getHeaderLine('Content-Type');
} catch (ConnectException) {
return null;
} catch (ClientException $exception) {
// return null for HTTP Not Found Exceptions
if ($exception->getResponse()->getStatusCode() === 404) {
return null;
}

throw $exception;
}
// start one asynchronous GET request per URL
$promises = $urls->map(fn (string $url) => $httpClient->getAsync($url));

// each inspected result is read as an array with a 'state' entry and a 'value' entry
return (new Collection(Utils::inspectAll($promises->toArray())))
// drop every result whose promise is not in the fulfilled state
->filter(fn (array $result) => Arr::get($result, 'state') === PromiseInterface::FULFILLED)
// pair each response's Content-Type header with the URL found under the same key in $urls
->map(function (array $result, int $index) use ($urls) {
/** @var Response $response */
$response = Arr::get($result, 'value');

return [
'url' => $urls->get($index),
'contentType' => $response->getHeaderLine('Content-Type'),
];
})
// keep image content types only; GIF is excluded by an exact string comparison
// NOTE(review): a header value such as "image/gif; charset=..." would not match the exact comparison — confirm whether that matters
->filter(fn (array $result) => Str::startsWith(Arr::get($result, 'contentType'), 'image/') && Arr::get($result, 'contentType') !== 'image/gif')
// reduce each pair back to the plain URL
->map(fn (array $result) => Arr::get($result, 'url'));
}

/**
Expand Down
10 changes: 6 additions & 4 deletions src/HeraRssCrawler.php
Original file line number Diff line number Diff line change
Expand Up @@ -208,10 +208,12 @@ public function discoverFavicon(string $url): ?string
$crawler = new Crawler($response->getBody()->getContents());
$nodes = $crawler->filterXPath($this->cssConverter->toXPath('head > link'));

$faviconUrls = (new Collection($nodes))
->filter(fn (DOMElement $node) => Str::contains($node->getAttribute('rel'), 'icon')) /** @phpstan-ignore-line */
->map(fn (DOMElement $node) => Helper::normalizeUrl(Helper::transformUrl($url, $node->getAttribute('href')))) /** @phpstan-ignore-line */
->filter(fn (string $imageUrl) => Helper::getHttpContentTypeForUrl($imageUrl, $this->httpClient) !== null);
$faviconUrls = Helper::filterImageUrls(
(new Collection($nodes))
->filter(fn (DOMElement $node) => Str::contains($node->getAttribute('rel'), 'icon')) /** @phpstan-ignore-line */
->map(fn (DOMElement $node) => Helper::normalizeUrl(Helper::transformUrl($url, $node->getAttribute('href')))), /** @phpstan-ignore-line */
$this->httpClient
);

if ($faviconUrls->isEmpty()) {
return null;
Expand Down
24 changes: 19 additions & 5 deletions tests/HelperTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

use Exception;
use GuzzleHttp\Client;
use Illuminate\Support\Collection;
use Kaishiyoku\HeraRssCrawler\TestClasses\FailingTestClass;
use Mockery;
use PHPUnit\Framework\TestCase;
Expand Down Expand Up @@ -48,12 +49,25 @@ public function testGetImageUrlsForFeedItem(): void
static::assertEquals(['https://www.golem.de/2107/158391-284735-284731_rc.jpg'], $imageUrls->toArray());
}

/**
* @dataProvider faviconProvider
*/
public function testGetHttpContentTypeForUrl(string $faviconUrl, ?string $expectedContentType): void
/**
* Checks that Helper::filterImageUrls() keeps the .ico, .svg and .png URLs and drops
* the .gif URL and https://invalid-url.dev.
*
* NOTE(review): a real Client is passed in, so this presumably sends live HTTP requests
* to the listed hosts — confirm.
*/
public function testFilterImageUrls(): void
{
// NOTE(review): this assertion looks like the removed body of the former testGetHttpContentTypeForUrl(),
// shown inline by the diff view; $expectedContentType and $faviconUrl are not defined in this method — confirm.
static::assertSame($expectedContentType, Helper::getHttpContentTypeForUrl($faviconUrl, new Client));
// input: three image URLs expected to pass, one GIF and one URL expected to be dropped
$imageUrls = new Collection([
'https://petapixel.com/wp-content/themes/petapixel-2017/assets/prod/img/favicon.ico',
'https://news.ycombinator.com/y18.svg',
'https://statamic.dev/img/favicons/apple-touch-icon-57x57.png',
'https://upload.wikimedia.org/wikipedia/commons/e/ea/Test.gif',
'https://invalid-url.dev',
]);

$filteredImageUrls = Helper::filterImageUrls($imageUrls, new Client);

// the first three input URLs, in their original order
$expectedImageUrls = [
'https://petapixel.com/wp-content/themes/petapixel-2017/assets/prod/img/favicon.ico',
'https://news.ycombinator.com/y18.svg',
'https://statamic.dev/img/favicons/apple-touch-icon-57x57.png',
];

// assertSame compares the array keys as well as the values
static::assertSame($expectedImageUrls, $filteredImageUrls->toArray());
}

/**
Expand Down

0 comments on commit 13d0337

Please sign in to comment.