Skip to content
This repository was archived by the owner on Jun 2, 2023. It is now read-only.

Commit 4ed72d8

Browse files
author
Stefan
committed
split URL error handling fixes for bad urls
1 parent 48d51a0 commit 4ed72d8

File tree

1 file changed

+18
-10
lines changed

1 file changed

+18
-10
lines changed

libs/Utils/PHPCrawlerUtils.php

Lines changed: 18 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -40,15 +40,15 @@ class PHPCrawlerUtils
4040
*/
4141
public static function splitURL($url): array
4242
{
43-
// Protokoll der URL hinzufügen (da ansonsten parse_url nicht klarkommt)
43+
// Add protocol to the URL (otherwise parse_url will not work)
4444
if (!preg_match('#^[a-z0-9-]+://# i', $url)) {
4545
$url = "http://" . $url;
4646
}
4747

4848
$parts = parse_url($url);
4949

50-
if (!isset($parts)) {
51-
return null;
50+
if ($parts == false || !isset($parts)) {
51+
throw new Exception('PHPCrawlerUtils::splitURL Failed to parse url: ' . $url);
5252
}
5353

5454
$protocol = $parts['scheme'] . '://';
@@ -63,7 +63,7 @@ public static function splitURL($url): array
6363
$host = strtolower($host);
6464

6565
// File
66-
preg_match('#^(.*/)([^/]*)$#', $path, $match); // Alles ab dem letzten "/"
66+
preg_match('#^(.*/)([^/]*)$#', $path, $match); // Split at the last "/": directory part first, then everything after it
6767
if (isset($match[0])) {
6868
$file = trim($match[2]);
6969
$path = trim($match[1]);
@@ -84,15 +84,15 @@ public static function splitURL($url): array
8484
$domain = substr($host, $pos + 1);
8585
}
8686

87-
// DEFAULT VALUES für protocol, path, port etc. (wenn noch nicht gesetzt)
87+
// DEFAULT VALUES for protocol, path, port etc. (if not set yet)
8888

89-
// Wenn Protokoll leer -> Protokoll ist "http://"
89+
// If protocol is empty -> protocol is "http://"
9090
if ($protocol == '') {
9191
$protocol = "http://";
9292
}
9393

94-
// Wenn Port leer -> Port setzen auf 80 or 443
95-
// (abhängig vom Protokoll)
94+
// If port is empty -> set port to 80 or 443
95+
// (depending on the protocol)
9696
if ($port == '') {
9797
if (strtolower($protocol) === 'http://') {
9898
$port = 80;
@@ -102,12 +102,12 @@ public static function splitURL($url): array
102102
}
103103
}
104104

105-
// Wenn Pfad leet -> Pfad ist "/"
105+
// If path is empty -> path is "/"
106106
if ($path == '') {
107107
$path = "/";
108108
}
109109

110-
// Rückgabe-Array
110+
// Build the return array
111111
$url_parts['protocol'] = $protocol;
112112
$url_parts['host'] = $host;
113113
$url_parts['path'] = $path;
@@ -335,6 +335,14 @@ public static function buildURLFromLink($link, PHPCrawlerUrlPartsDescriptor $Bas
335335
return null;
336336
}
337337

338+
if ($link == 'http://') {
339+
return null;
340+
}
341+
342+
if ($link == 'https://') {
343+
return null;
344+
}
345+
338346
// Now, at least, replace all HTMLENTITIES with normal text.
339347
// I.E.: HTML-Code of the link is: <a href="index.php?x=1&amp;y=2">
340348
// -> Link has to be "index.php?x=1&y=2"

0 commit comments

Comments
 (0)