diff --git a/CHANGELOG.md b/CHANGELOG.md index 79fe73b..b902d72 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] ### Added * Static method `UserAgent::mozilla5CompatibleBrowser()` to get a `UserAgent` instance with the user agent string `Mozilla/5.0 (compatible)` and also the new method `withMozilla5CompatibleUserAgent` in the `AnonymousHttpCrawlerBuilder` that you can use like this: `HttpCrawler::make()->withMozilla5CompatibleUserAgent()`. +* URL refiners: `UrlRefiner::withScheme()`, `UrlRefiner::withHost()`, `UrlRefiner::withPort()`, `UrlRefiner::withoutPort()`, `UrlRefiner::withPath()`, `UrlRefiner::withQuery()`, `UrlRefiner::withoutQuery()`, `UrlRefiner::withFragment()` and `UrlRefiner::withoutFragment()`. ## [1.9.5] - 2024-07-25 ### Fixed diff --git a/src/Steps/Refiners/String/StrAfterFirst.php b/src/Steps/Refiners/String/StrAfterFirst.php index 01d5d07..1fa3bf8 100644 --- a/src/Steps/Refiners/String/StrAfterFirst.php +++ b/src/Steps/Refiners/String/StrAfterFirst.php @@ -11,7 +11,7 @@ public function __construct(protected readonly string $first) {} public function refine(mixed $value): mixed { if (!is_string($value)) { - $this->logTypeWarning('Str::afterFirst()', $value); + $this->logTypeWarning('StringRefiner::afterFirst()', $value); return $value; } diff --git a/src/Steps/Refiners/String/StrAfterLast.php b/src/Steps/Refiners/String/StrAfterLast.php index 9efd9cb..44e9efc 100644 --- a/src/Steps/Refiners/String/StrAfterLast.php +++ b/src/Steps/Refiners/String/StrAfterLast.php @@ -11,7 +11,7 @@ public function __construct(protected readonly string $last) {} public function refine(mixed $value): mixed { if (!is_string($value)) { - $this->logTypeWarning('Str::afterLast()', $value); + $this->logTypeWarning('StringRefiner::afterLast()', $value); return $value; } diff --git a/src/Steps/Refiners/String/StrBeforeFirst.php 
b/src/Steps/Refiners/String/StrBeforeFirst.php index d79c755..42d0eed 100644 --- a/src/Steps/Refiners/String/StrBeforeFirst.php +++ b/src/Steps/Refiners/String/StrBeforeFirst.php @@ -11,7 +11,7 @@ public function __construct(protected readonly string $first) {} public function refine(mixed $value): mixed { if (!is_string($value)) { - $this->logTypeWarning('Str::beforeFirst()', $value); + $this->logTypeWarning('StringRefiner::beforeFirst()', $value); return $value; } diff --git a/src/Steps/Refiners/String/StrBeforeLast.php b/src/Steps/Refiners/String/StrBeforeLast.php index 148298a..b88b177 100644 --- a/src/Steps/Refiners/String/StrBeforeLast.php +++ b/src/Steps/Refiners/String/StrBeforeLast.php @@ -11,7 +11,7 @@ public function __construct(protected readonly string $last) {} public function refine(mixed $value): mixed { if (!is_string($value)) { - $this->logTypeWarning('Str::beforeLast()', $value); + $this->logTypeWarning('StringRefiner::beforeLast()', $value); return $value; } diff --git a/src/Steps/Refiners/String/StrBetweenFirst.php b/src/Steps/Refiners/String/StrBetweenFirst.php index 92a91fa..167b112 100644 --- a/src/Steps/Refiners/String/StrBetweenFirst.php +++ b/src/Steps/Refiners/String/StrBetweenFirst.php @@ -11,7 +11,7 @@ public function __construct(protected readonly string $start, protected readonly public function refine(mixed $value): mixed { if (!is_string($value)) { - $this->logTypeWarning('Str::betweenFirst()', $value); + $this->logTypeWarning('StringRefiner::betweenFirst()', $value); return $value; } diff --git a/src/Steps/Refiners/String/StrBetweenLast.php b/src/Steps/Refiners/String/StrBetweenLast.php index 87b9f9e..2c6181c 100644 --- a/src/Steps/Refiners/String/StrBetweenLast.php +++ b/src/Steps/Refiners/String/StrBetweenLast.php @@ -11,7 +11,7 @@ public function __construct(protected readonly string $start, protected readonly public function refine(mixed $value): mixed { if (!is_string($value)) { - $this->logTypeWarning('Str::betweenLast()', 
$value); + $this->logTypeWarning('StringRefiner::betweenLast()', $value); return $value; } diff --git a/src/Steps/Refiners/String/StrReplace.php b/src/Steps/Refiners/String/StrReplace.php index 4719c4b..ae294a3 100644 --- a/src/Steps/Refiners/String/StrReplace.php +++ b/src/Steps/Refiners/String/StrReplace.php @@ -18,7 +18,7 @@ public function __construct( public function refine(mixed $value): mixed { if (!is_string($value)) { - $this->logTypeWarning('Str::replace()', $value); + $this->logTypeWarning('StringRefiner::replace()', $value); return $value; } diff --git a/src/Steps/Refiners/Url/AbstractUrlRefiner.php b/src/Steps/Refiners/Url/AbstractUrlRefiner.php new file mode 100644 index 0000000..7e8c589 --- /dev/null +++ b/src/Steps/Refiners/Url/AbstractUrlRefiner.php @@ -0,0 +1,34 @@ +logTypeWarning($this->staticRefinerMethod(), $value); + + return $value; + } + + if (!$value instanceof Url) { + $value = Url::parse($value); + } + + return $this->refineUrl($value); + } + + abstract protected function staticRefinerMethod(): string; + + abstract protected function refineUrl(Url $url): string; +} diff --git a/src/Steps/Refiners/Url/WithFragment.php b/src/Steps/Refiners/Url/WithFragment.php new file mode 100644 index 0000000..133a49b --- /dev/null +++ b/src/Steps/Refiners/Url/WithFragment.php @@ -0,0 +1,27 @@ +fragment($this->fragment); + + return (string) $url; + } +} diff --git a/src/Steps/Refiners/Url/WithHost.php b/src/Steps/Refiners/Url/WithHost.php new file mode 100644 index 0000000..ee896eb --- /dev/null +++ b/src/Steps/Refiners/Url/WithHost.php @@ -0,0 +1,27 @@ +host($this->host); + + return (string) $url; + } +} diff --git a/src/Steps/Refiners/Url/WithPath.php b/src/Steps/Refiners/Url/WithPath.php new file mode 100644 index 0000000..e512175 --- /dev/null +++ b/src/Steps/Refiners/Url/WithPath.php @@ -0,0 +1,27 @@ +path($this->path); + + return (string) $url; + } +} diff --git a/src/Steps/Refiners/Url/WithPort.php b/src/Steps/Refiners/Url/WithPort.php new file 
mode 100644 index 0000000..cee9b93 --- /dev/null +++ b/src/Steps/Refiners/Url/WithPort.php @@ -0,0 +1,27 @@ +port($this->port); + + return (string) $url; + } +} diff --git a/src/Steps/Refiners/Url/WithQuery.php b/src/Steps/Refiners/Url/WithQuery.php new file mode 100644 index 0000000..8c4018f --- /dev/null +++ b/src/Steps/Refiners/Url/WithQuery.php @@ -0,0 +1,27 @@ +query($this->query); + + return (string) $url; + } +} diff --git a/src/Steps/Refiners/Url/WithScheme.php b/src/Steps/Refiners/Url/WithScheme.php new file mode 100644 index 0000000..47afa93 --- /dev/null +++ b/src/Steps/Refiners/Url/WithScheme.php @@ -0,0 +1,27 @@ +scheme($this->scheme); + + return (string) $url; + } +} diff --git a/src/Steps/Refiners/Url/WithoutPort.php b/src/Steps/Refiners/Url/WithoutPort.php new file mode 100644 index 0000000..bfdfc5e --- /dev/null +++ b/src/Steps/Refiners/Url/WithoutPort.php @@ -0,0 +1,25 @@ +resetPort(); + + return (string) $url; + } +} diff --git a/src/Steps/Refiners/UrlRefiner.php b/src/Steps/Refiners/UrlRefiner.php new file mode 100644 index 0000000..e87eab0 --- /dev/null +++ b/src/Steps/Refiners/UrlRefiner.php @@ -0,0 +1,59 @@ +getActualOutputForAssertion(); - expect($logOutput)->toContain('Refiner Str::afterFirst() can\'t be applied to value of type ' . gettype($value)); + expect($logOutput)->toContain('Refiner StringRefiner::afterFirst() can\'t be applied to value of type ' . gettype($value)); expect($refinedValue)->toBe($value); })->with([ diff --git a/tests/Steps/Refiners/String/AfterLastTest.php b/tests/Steps/Refiners/String/AfterLastTest.php index 2ee0f57..57bd66f 100644 --- a/tests/Steps/Refiners/String/AfterLastTest.php +++ b/tests/Steps/Refiners/String/AfterLastTest.php @@ -15,7 +15,7 @@ $logOutput = $this->getActualOutputForAssertion(); - expect($logOutput)->toContain('Refiner Str::afterLast() can\'t be applied to value of type ' . 
gettype($value)); + expect($logOutput)->toContain('Refiner StringRefiner::afterLast() can\'t be applied to value of type ' . gettype($value)); expect($refinedValue)->toBe($value); })->with([ diff --git a/tests/Steps/Refiners/String/BeforeFirstTest.php b/tests/Steps/Refiners/String/BeforeFirstTest.php index 997564d..06aa7d9 100644 --- a/tests/Steps/Refiners/String/BeforeFirstTest.php +++ b/tests/Steps/Refiners/String/BeforeFirstTest.php @@ -15,7 +15,7 @@ $logOutput = $this->getActualOutputForAssertion(); - expect($logOutput)->toContain('Refiner Str::beforeFirst() can\'t be applied to value of type ' . gettype($value)); + expect($logOutput)->toContain('Refiner StringRefiner::beforeFirst() can\'t be applied to value of type ' . gettype($value)); expect($refinedValue)->toBe($value); })->with([ diff --git a/tests/Steps/Refiners/String/BeforeLastTest.php b/tests/Steps/Refiners/String/BeforeLastTest.php index 8d19b3c..f5dcb17 100644 --- a/tests/Steps/Refiners/String/BeforeLastTest.php +++ b/tests/Steps/Refiners/String/BeforeLastTest.php @@ -15,7 +15,7 @@ $logOutput = $this->getActualOutputForAssertion(); - expect($logOutput)->toContain('Refiner Str::beforeLast() can\'t be applied to value of type ' . gettype($value)); + expect($logOutput)->toContain('Refiner StringRefiner::beforeLast() can\'t be applied to value of type ' . gettype($value)); expect($refinedValue)->toBe($value); })->with([ diff --git a/tests/Steps/Refiners/String/BetweenFirstTest.php b/tests/Steps/Refiners/String/BetweenFirstTest.php index 5d3f976..4633083 100644 --- a/tests/Steps/Refiners/String/BetweenFirstTest.php +++ b/tests/Steps/Refiners/String/BetweenFirstTest.php @@ -15,7 +15,7 @@ $logOutput = $this->getActualOutputForAssertion(); - expect($logOutput)->toContain('Refiner Str::betweenFirst() can\'t be applied to value of type ' . gettype($value)); + expect($logOutput)->toContain('Refiner StringRefiner::betweenFirst() can\'t be applied to value of type ' . 
gettype($value)); expect($refinedValue)->toBe($value); })->with([ diff --git a/tests/Steps/Refiners/String/BetweenLastTest.php b/tests/Steps/Refiners/String/BetweenLastTest.php index 659c45d..421b620 100644 --- a/tests/Steps/Refiners/String/BetweenLastTest.php +++ b/tests/Steps/Refiners/String/BetweenLastTest.php @@ -15,7 +15,7 @@ $logOutput = $this->getActualOutputForAssertion(); - expect($logOutput)->toContain('Refiner Str::betweenLast() can\'t be applied to value of type ' . gettype($value)); + expect($logOutput)->toContain('Refiner StringRefiner::betweenLast() can\'t be applied to value of type ' . gettype($value)); expect($refinedValue)->toBe($value); })->with([ diff --git a/tests/Steps/Refiners/String/ReplaceTest.php b/tests/Steps/Refiners/String/ReplaceTest.php index dfaa0f4..45a155f 100644 --- a/tests/Steps/Refiners/String/ReplaceTest.php +++ b/tests/Steps/Refiners/String/ReplaceTest.php @@ -15,7 +15,7 @@ $logOutput = $this->getActualOutputForAssertion(); - expect($logOutput)->toContain('Refiner Str::replace() can\'t be applied to value of type ' . gettype($value)); + expect($logOutput)->toContain('Refiner StringRefiner::replace() can\'t be applied to value of type ' . gettype($value)); expect($refinedValue)->toBe($value); })->with([ diff --git a/tests/Steps/Refiners/Url/WithFragmentTest.php b/tests/Steps/Refiners/Url/WithFragmentTest.php new file mode 100644 index 0000000..7d6b34a --- /dev/null +++ b/tests/Steps/Refiners/Url/WithFragmentTest.php @@ -0,0 +1,46 @@ +addLogger(new CliLogger()) + ->refine($value); + + $logOutput = $this->getActualOutputForAssertion(); + + expect($logOutput) + ->toContain('Refiner UrlRefiner::withFragment() can\'t be applied to value of type ' . 
gettype($value)) + ->and($refinedValue)->toBe($value); + }, +)->with([ + [123], + [true], + [new stdClass()], +]); + +it('replaces the fragment in a URL', function (mixed $value, string $expected) { + expect(UrlRefiner::withFragment('#lorem')->refine($value))->toBe($expected); +})->with([ + ['https://www.example.com/path#foo', 'https://www.example.com/path#lorem'], + ['https://www.example.com/path', 'https://www.example.com/path#lorem'], + [Url::parse('https://www.crwlr.software/some/path#abc'), 'https://www.crwlr.software/some/path#lorem'], + [Url::parsePsr7('https://www.crwl.io/quz#'), 'https://www.crwl.io/quz#lorem'], +]); + +it('resets any fragment', function (mixed $value, string $expected) { + expect(UrlRefiner::withoutFragment()->refine($value))->toBe($expected); +})->with([ + ['https://www.example.com/foo#bar', 'https://www.example.com/foo'], + ['https://www.crwlr.software/#', 'https://www.crwlr.software/'], +]); diff --git a/tests/Steps/Refiners/Url/WithHostTest.php b/tests/Steps/Refiners/Url/WithHostTest.php new file mode 100644 index 0000000..da6a192 --- /dev/null +++ b/tests/Steps/Refiners/Url/WithHostTest.php @@ -0,0 +1,39 @@ +addLogger(new CliLogger()) + ->refine($value); + + $logOutput = $this->getActualOutputForAssertion(); + + expect($logOutput) + ->toContain('Refiner UrlRefiner::withHost() can\'t be applied to value of type ' . 
gettype($value)) + ->and($refinedValue)->toBe($value); + }, +)->with([ + [123], + [true], + [new stdClass()], +]); + +it('replaces the host in a URL', function (mixed $value, string $expected) { + expect(UrlRefiner::withHost('www.crwlr.software')->refine($value))->toBe($expected); +})->with([ + ['https://www.example.com/foo', 'https://www.crwlr.software/foo'], + ['https://www.crwl.io/bar', 'https://www.crwlr.software/bar'], + [Url::parse('https://www.crwlr.software/baz'), 'https://www.crwlr.software/baz'], + [Url::parsePsr7('https://crwl.io/quz'), 'https://www.crwlr.software/quz'], +]); diff --git a/tests/Steps/Refiners/Url/WithPathTest.php b/tests/Steps/Refiners/Url/WithPathTest.php new file mode 100644 index 0000000..b5aeb20 --- /dev/null +++ b/tests/Steps/Refiners/Url/WithPathTest.php @@ -0,0 +1,39 @@ +addLogger(new CliLogger()) + ->refine($value); + + $logOutput = $this->getActualOutputForAssertion(); + + expect($logOutput) + ->toContain('Refiner UrlRefiner::withPath() can\'t be applied to value of type ' . 
gettype($value)) + ->and($refinedValue)->toBe($value); + }, +)->with([ + [123], + [true], + [new stdClass()], +]); + +it('replaces the path in a URL', function (mixed $value, string $expected) { + expect(UrlRefiner::withPath('/some/path/123')->refine($value))->toBe($expected); +})->with([ + ['https://www.example.com/foo', 'https://www.example.com/some/path/123'], + ['https://localhost/yo', 'https://localhost/some/path/123'], + [Url::parse('https://www.crwlr.software/packages'), 'https://www.crwlr.software/some/path/123'], + [Url::parsePsr7('https://www.crwl.io/'), 'https://www.crwl.io/some/path/123'], +]); diff --git a/tests/Steps/Refiners/Url/WithPortTest.php b/tests/Steps/Refiners/Url/WithPortTest.php new file mode 100644 index 0000000..d57f35e --- /dev/null +++ b/tests/Steps/Refiners/Url/WithPortTest.php @@ -0,0 +1,39 @@ +addLogger(new CliLogger()) + ->refine($value); + + $logOutput = $this->getActualOutputForAssertion(); + + expect($logOutput) + ->toContain('Refiner UrlRefiner::withPort() can\'t be applied to value of type ' . 
gettype($value)) + ->and($refinedValue)->toBe($value); + }, +)->with([ + [123], + [true], + [new stdClass()], +]); + +it('replaces the port in a URL', function (mixed $value, string $expected) { + expect(UrlRefiner::withPort(1234)->refine($value))->toBe($expected); +})->with([ + ['https://www.example.com:8000/foo', 'https://www.example.com:1234/foo'], + ['https://localhost:8080/yo', 'https://localhost:1234/yo'], + [Url::parse('https://www.crwlr.software:5678/bar'), 'https://www.crwlr.software:1234/bar'], + [Url::parsePsr7('https://crwl.io/quz'), 'https://crwl.io:1234/quz'], +]); diff --git a/tests/Steps/Refiners/Url/WithQueryTest.php b/tests/Steps/Refiners/Url/WithQueryTest.php new file mode 100644 index 0000000..f6d3d59 --- /dev/null +++ b/tests/Steps/Refiners/Url/WithQueryTest.php @@ -0,0 +1,46 @@ +addLogger(new CliLogger()) + ->refine($value); + + $logOutput = $this->getActualOutputForAssertion(); + + expect($logOutput) + ->toContain('Refiner UrlRefiner::withQuery() can\'t be applied to value of type ' . 
gettype($value)) + ->and($refinedValue)->toBe($value); + }, +)->with([ + [123], + [true], + [new stdClass()], +]); + +it('replaces the query in a URL', function (mixed $value, string $expected) { + expect(UrlRefiner::withQuery('a=b&c=d')->refine($value))->toBe($expected); +})->with([ + ['https://www.example.com/foo?one=two', 'https://www.example.com/foo?a=b&c=d'], + ['https://www.example.com/bar', 'https://www.example.com/bar?a=b&c=d'], + [Url::parse('https://www.crwlr.software/?'), 'https://www.crwlr.software/?a=b&c=d'], + [Url::parsePsr7('https://www.crwl.io/quz?a=c&b=d'), 'https://www.crwl.io/quz?a=b&c=d'], +]); + +it('resets any query', function (mixed $value, string $expected) { + expect(UrlRefiner::withoutQuery()->refine($value))->toBe($expected); +})->with([ + ['https://www.example.com/foo?one=two', 'https://www.example.com/foo'], + ['https://www.crwlr.software/?', 'https://www.crwlr.software/'], +]); diff --git a/tests/Steps/Refiners/Url/WithSchemeTest.php b/tests/Steps/Refiners/Url/WithSchemeTest.php new file mode 100644 index 0000000..df3d443 --- /dev/null +++ b/tests/Steps/Refiners/Url/WithSchemeTest.php @@ -0,0 +1,39 @@ +addLogger(new CliLogger()) + ->refine($value); + + $logOutput = $this->getActualOutputForAssertion(); + + expect($logOutput) + ->toContain('Refiner UrlRefiner::withScheme() can\'t be applied to value of type ' . 
gettype($value)) + ->and($refinedValue)->toBe($value); + }, +)->with([ + [123], + [true], + [new stdClass()], +]); + +it('replaces the scheme in a URL', function (mixed $value, string $expected) { + expect(UrlRefiner::withScheme('https')->refine($value))->toBe($expected); +})->with([ + ['http://www.example.com/foo', 'https://www.example.com/foo'], + ['https://www.example.com/foo', 'https://www.example.com/foo'], + [Url::parse('ftp://www.example.com/bar'), 'https://www.example.com/bar'], + [Url::parsePsr7('http://www.example.com/baz'), 'https://www.example.com/baz'], +]); diff --git a/tests/Steps/Refiners/Url/WithoutPortTest.php b/tests/Steps/Refiners/Url/WithoutPortTest.php new file mode 100644 index 0000000..29f27d6 --- /dev/null +++ b/tests/Steps/Refiners/Url/WithoutPortTest.php @@ -0,0 +1,39 @@ +addLogger(new CliLogger()) + ->refine($value); + + $logOutput = $this->getActualOutputForAssertion(); + + expect($logOutput) + ->toContain('Refiner UrlRefiner::withoutPort() can\'t be applied to value of type ' . gettype($value)) + ->and($refinedValue)->toBe($value); + }, +)->with([ + [123], + [true], + [new stdClass()], +]); + +it('resets the port to null in a URL', function (mixed $value, string $expected) { + expect(UrlRefiner::withoutPort()->refine($value))->toBe($expected); +})->with([ + ['https://www.example.com:8000/foo', 'https://www.example.com/foo'], + ['http://localhost:8080/yo', 'http://localhost/yo'], + [Url::parse('https://www.crwlr.software:5678/bar'), 'https://www.crwlr.software/bar'], + [Url::parsePsr7('https://crwl.io/quz'), 'https://crwl.io/quz'], +]);