Skip to content

Commit

Permalink
Fixes in Http steps
Browse files Browse the repository at this point in the history
* Fix in `HttpCrawl` (`Http::crawl()`) step: when a page contains a
broken link that can't be resolved and causes the URL library to throw
an `Exception`, the link is now ignored and a warning message is logged.
* Minor fix for merging HTTP headers when an `Http` step gets both
statically defined headers and headers taken from array input.
* Also, CS Fixer reformatted a lot of empty classes and constructors.
  • Loading branch information
otsch committed Sep 19, 2023
1 parent 1796e51 commit 10a10c7
Show file tree
Hide file tree
Showing 43 changed files with 182 additions and 166 deletions.
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]

## [1.2.2] - 2023-09-19
### Fixed
* Fix in `HttpCrawl` (`Http::crawl()`) step: when a page contains a broken link that can't be resolved and causes the URL library to throw an `Exception`, the link is now ignored and a warning message is logged.
* Minor fix for merging HTTP headers when an `Http` step gets both statically defined headers and headers taken from array input.

## [1.2.1] - 2023-08-21
### Fixed
* When a URL redirects, the `trackRequestEndFor()` method of the `HttpLoader`'s `Throttler` instance is called only once at the end and with the original request URL.
Expand Down
4 changes: 1 addition & 3 deletions src/Cache/Exceptions/MissingZlibExtensionException.php
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,4 @@
use Exception;
use Psr\SimpleCache\CacheException;

class MissingZlibExtensionException extends Exception implements CacheException
{
}
class MissingZlibExtensionException extends Exception implements CacheException {}
4 changes: 1 addition & 3 deletions src/Cache/Exceptions/ReadingCacheFailedException.php
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,4 @@
use Exception;
use Psr\SimpleCache\CacheException;

class ReadingCacheFailedException extends Exception implements CacheException
{
}
class ReadingCacheFailedException extends Exception implements CacheException {}
3 changes: 1 addition & 2 deletions src/Cache/FileCache.php
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,7 @@ class FileCache implements CacheInterface

public function __construct(
protected readonly string $basePath,
) {
}
) {}

public function useCompression(): static
{
Expand Down
4 changes: 1 addition & 3 deletions src/Exceptions/UnknownLoaderKeyException.php
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,4 @@

use Exception;

class UnknownLoaderKeyException extends Exception
{
}
class UnknownLoaderKeyException extends Exception {}
4 changes: 1 addition & 3 deletions src/HttpCrawler/AnonymousHttpCrawlerBuilder.php
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,7 @@

class AnonymousHttpCrawlerBuilder
{
public function __construct()
{
}
public function __construct() {}

public function withBotUserAgent(string $productToken): HttpCrawler
{
Expand Down
4 changes: 1 addition & 3 deletions src/Input.php
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,4 @@

namespace Crwlr\Crawler;

class Input extends Io
{
}
class Input extends Io {}
4 changes: 1 addition & 3 deletions src/Loader/AddLoadersToStepAction.php
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,7 @@ class AddLoadersToStepAction
* @param LoaderInterface|array<string, LoaderInterface> $loaders
* @param StepInterface $step
*/
public function __construct(protected LoaderInterface|array $loaders, protected StepInterface $step)
{
}
public function __construct(protected LoaderInterface|array $loaders, protected StepInterface $step) {}

/**
* @return void
Expand Down
4 changes: 1 addition & 3 deletions src/Loader/Http/Cookies/Date.php
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,7 @@ class Date
{
protected ?DateTime $dateTime = null;

public function __construct(protected readonly string $httpDateString)
{
}
public function __construct(protected readonly string $httpDateString) {}

/**
* @throws InvalidArgumentException
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,4 @@

use Exception;

class InvalidCookieException extends Exception
{
}
class InvalidCookieException extends Exception {}
4 changes: 1 addition & 3 deletions src/Loader/Http/Exceptions/LoadingException.php
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,4 @@

use Exception;

class LoadingException extends Exception
{
}
class LoadingException extends Exception {}
3 changes: 1 addition & 2 deletions src/Loader/Http/Politeness/RetryErrorResponseHandler.php
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,7 @@ public function __construct(
protected int $retries = 2,
protected array $wait = [10, 60],
protected int $maxWait = 60,
) {
}
) {}

public function shouldWait(RespondedRequest $respondedRequest): bool
{
Expand Down
4 changes: 1 addition & 3 deletions src/Loader/Http/Politeness/TimingUnits/Microseconds.php
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,4 @@
/**
* @deprecated Will be removed in v2.0.0. Use the Microseconds class from the crwlr/utils package directly instead.
*/
final class Microseconds extends \Crwlr\Utils\Microseconds
{
}
final class Microseconds extends \Crwlr\Utils\Microseconds {}
4 changes: 1 addition & 3 deletions src/Loader/Http/Politeness/TimingUnits/MultipleOf.php
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,7 @@

class MultipleOf
{
public function __construct(public readonly float $factor)
{
}
public function __construct(public readonly float $factor) {}

public function calc(Microseconds $microseconds): Microseconds
{
Expand Down
8 changes: 2 additions & 6 deletions src/Loader/Loader.php
Original file line number Diff line number Diff line change
Expand Up @@ -81,16 +81,12 @@ protected function isAllowedToBeLoaded(UriInterface $uri, bool $throwsException
/**
* Can be implemented in a child class to track how long a request waited for its response.
*/
protected function trackRequestStart(?float $microtime = null): void
{
}
protected function trackRequestStart(?float $microtime = null): void {}

/**
* Can be implemented in a child class to track how long a request waited for its response.
*/
protected function trackRequestEnd(?float $microtime = null): void
{
}
protected function trackRequestEnd(?float $microtime = null): void {}

protected function callHook(string $hook, mixed ...$arguments): void
{
Expand Down
4 changes: 1 addition & 3 deletions src/Output.php
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,4 @@

namespace Crwlr\Crawler;

class Output extends Io
{
}
class Output extends Io {}
4 changes: 1 addition & 3 deletions src/Steps/Csv.php
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,7 @@ class Csv extends Step
/**
* @param array<string|null> $columnMapping
*/
public function __construct(protected array $columnMapping = [], protected bool $skipFirstLine = false)
{
}
public function __construct(protected array $columnMapping = [], protected bool $skipFirstLine = false) {}

/**
* @param array<string|null> $columnMapping
Expand Down
3 changes: 1 addition & 2 deletions src/Steps/Filters/ClosureFilter.php
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,7 @@ class ClosureFilter extends Filter
{
public function __construct(
protected readonly Closure $closure,
) {
}
) {}

/**
* @throws Exception
Expand Down
3 changes: 1 addition & 2 deletions src/Steps/Filters/ComparisonFilter.php
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,7 @@ class ComparisonFilter extends Filter
public function __construct(
protected readonly ComparisonFilterRule $filterRule,
protected readonly mixed $compareTo,
) {
}
) {}

/**
* @throws Exception
Expand Down
4 changes: 1 addition & 3 deletions src/Steps/Filters/NegatedFilter.php
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,7 @@

final class NegatedFilter implements FilterInterface
{
public function __construct(private readonly FilterInterface $filter)
{
}
public function __construct(private readonly FilterInterface $filter) {}

public function useKey(string $key): static
{
Expand Down
3 changes: 1 addition & 2 deletions src/Steps/Filters/StringFilter.php
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,7 @@ class StringFilter extends Filter
public function __construct(
protected readonly StringFilterRule $filterRule,
protected readonly string $filterString,
) {
}
) {}

/**
* @throws Exception
Expand Down
3 changes: 1 addition & 2 deletions src/Steps/Filters/StringLengthFilter.php
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,7 @@ class StringLengthFilter extends Filter
public function __construct(
protected readonly StringLengthFilterRule $filterRule,
protected readonly int $compareToLength,
) {
}
) {}

/**
* @throws Exception
Expand Down
4 changes: 1 addition & 3 deletions src/Steps/Filters/UrlFilter.php
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,7 @@

class UrlFilter extends Filter
{
public function __construct(protected readonly UrlFilterRule $filterRule, protected readonly string $filterString)
{
}
public function __construct(protected readonly UrlFilterRule $filterRule, protected readonly string $filterString) {}

/**
* @throws Exception
Expand Down
3 changes: 1 addition & 2 deletions src/Steps/Html/DomQuery.php
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,7 @@ abstract class DomQuery implements DomQueryInterface

public function __construct(
public readonly string $query
) {
}
) {}

/**
* When there is a <base> tag with a href attribute in an HTML document all links in the document must be resolved
Expand Down
4 changes: 1 addition & 3 deletions src/Steps/Html/GetLink.php
Original file line number Diff line number Diff line change
Expand Up @@ -32,9 +32,7 @@ class GetLink extends Step

protected bool $withFragment = true;

public function __construct(protected ?string $selector = null)
{
}
public function __construct(protected ?string $selector = null) {}

public static function isSpecialNonHttpLink(Crawler $linkElement): bool
{
Expand Down
4 changes: 1 addition & 3 deletions src/Steps/Json.php
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,7 @@ class Json extends Step
/**
* @param mixed[] $propertyMapping
*/
final public function __construct(protected array $propertyMapping = [], protected ?string $each = null)
{
}
final public function __construct(protected array $propertyMapping = [], protected ?string $each = null) {}

/**
* @param mixed[] $propertyMapping
Expand Down
62 changes: 7 additions & 55 deletions src/Steps/Loading/Http.php
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
use Crwlr\Crawler\Steps\Loading\Http\Paginate;
use Crwlr\Crawler\Steps\Loading\Http\Paginator;
use Crwlr\Crawler\Steps\Loading\Http\PaginatorInterface;
use Crwlr\Crawler\Utils\HttpHeaders;
use Exception;
use Generator;
use GuzzleHttp\Psr7\Request;
Expand Down Expand Up @@ -51,8 +52,7 @@ public function __construct(
protected readonly array $headers = [],
protected readonly string|StreamInterface|null $body = null,
protected readonly string $httpVersion = '1.1',
) {
}
) {}

/**
* @param array|(string|string[])[] $headers
Expand Down Expand Up @@ -288,7 +288,7 @@ protected function getRequestFromInputUri(UriInterface $uri): RequestInterface
{
$body = $this->inputBody ?? $this->body;

$headers = $this->inputHeaders ? $this->mergeHeaders() : $this->headers;
$headers = $this->mergeHeaders();

return new Request($this->method, $uri, $headers, $body, $this->httpVersion);
}
Expand Down Expand Up @@ -398,68 +398,20 @@ protected function addToInputHeadersFromInput(mixed $input, string $inputKey, st
return;
}

$this->inputHeaders[$headerName] = $this->addValuesToHeaderArray(
is_array($this->inputHeaders[$headerName]) ? $this->inputHeaders[$headerName] : [],
is_array($inputValue) ? $inputValue : [$inputValue],
);
$this->inputHeaders = HttpHeaders::addTo(HttpHeaders::normalize($this->inputHeaders), $headerName, $inputValue);
}

/**
* @return array<string, string[]>
*/
protected function mergeHeaders(): array
{
$headers = $this->normalizeHeaders($this->headers);
$headers = HttpHeaders::normalize($this->headers);

if (is_array($this->inputHeaders)) {
foreach ($this->inputHeaders as $headerName => $value) {
if (!array_key_exists($headerName, $headers)) {
$headers[$headerName] = is_array($value) ? $value : [$value];

continue;
}

if (!in_array($value, $headers[$headerName], true)) {
if (is_array($value)) {
$headers[$headerName] = $this->addValuesToHeaderArray($headers[$headerName], $value);
} elseif (!in_array($value, $headers[$headerName], true)) {
$headers[$headerName][] = $value;
}
} else {
$headers[$headerName] = [$headers[$headerName], $value];
}
}
}

return $headers;
}

/**
* @param array<string, string|string[]> $headers
* @return array<string, string[]>
*/
protected function normalizeHeaders(array $headers): array
{
$normalized = [];

foreach ($headers as $headerName => $value) {
$normalized[$headerName] = is_array($value) ? $value : [$value];
}
$inputHeaders = HttpHeaders::normalize($this->inputHeaders);

return $normalized;
}

/**
* @param string[] $headers
* @param string[] $values
* @return string[]
*/
protected function addValuesToHeaderArray(array $headers, array $values): array
{
foreach ($values as $value) {
if (!in_array($value, $headers, true)) {
$headers[] = $value;
}
$headers = HttpHeaders::merge($headers, $inputHeaders);
}

return $headers;
Expand Down
4 changes: 1 addition & 3 deletions src/Steps/Loading/Http/Paginators/AbstractPaginator.php
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,7 @@

abstract class AbstractPaginator implements PaginatorInterface
{
public function __construct(protected int $maxPages = Paginator::MAX_PAGES_DEFAULT)
{
}
public function __construct(protected int $maxPages = Paginator::MAX_PAGES_DEFAULT) {}

public function prepareRequest(
RequestInterface $request,
Expand Down
9 changes: 8 additions & 1 deletion src/Steps/Loading/HttpCrawl.php
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
use Generator;
use Psr\Http\Message\UriInterface;
use Symfony\Component\DomCrawler\Crawler;
use Throwable;

class HttpCrawl extends Http
{
Expand Down Expand Up @@ -295,7 +296,13 @@ protected function getUrlsFromHtmlDocument(Document $document): array
continue;
}

$url = $this->handleUrlFragment($document->baseUrl()->resolve($linkElement->attr('href') ?? ''));
try {
$url = $this->handleUrlFragment($document->baseUrl()->resolve($linkElement->attr('href') ?? ''));
} catch (Throwable) {
$this->logger?->warning('Failed to resolve a link with href: ' . $linkElement->attr('href'));

continue;
}

if (!$this->isOnSameHostOrDomain($url)) {
continue;
Expand Down
Loading

0 comments on commit 10a10c7

Please sign in to comment.