@@ -40,15 +40,15 @@ class PHPCrawlerUtils
4040 */
4141 public static function splitURL ($ url ): array
4242 {
43- // Protokoll der URL hinzuf?gen (da ansonsten parse_url nicht klarkommt )
43+ // Add protocol to the URL (otherwise parse_url will not work )
4444 if (!preg_match ('#^[a-z0-9-]+://# i ' , $ url )) {
4545 $ url = "http:// " . $ url ;
4646 }
4747
4848 $ parts = parse_url ($ url );
4949
50- if (!isset ($ parts )) {
51- return null ;
50+ if ($ parts == false || !isset ($ parts )) {
51+ throw new Exception ( ' PHPCrawlerUtils::splitURL Failed to parse url: ' . $ url ) ;
5252 }
5353
5454 $ protocol = $ parts ['scheme ' ] . ':// ' ;
@@ -63,7 +63,7 @@ public static function splitURL($url): array
6363 $ host = strtolower ($ host );
6464
6565 // File
66- preg_match ('#^(.*/)([^/]*)$# ' , $ path , $ match ); // Alles ab dem letzten "/"
66+ preg_match ('#^(.*/)([^/]*)$# ' , $ path , $ match ); // Everything from the last one "/"
6767 if (isset ($ match [0 ])) {
6868 $ file = trim ($ match [2 ]);
6969 $ path = trim ($ match [1 ]);
@@ -84,15 +84,15 @@ public static function splitURL($url): array
8484 $ domain = substr ($ host , $ pos + 1 );
8585 }
8686
87- // DEFAULT VALUES f?r protocol, path, port etc. (wenn noch nicht gesetzt )
87+ // DEFAULT VALUES for protocol, path, port etc. (if not set yet )
8888
89- // Wenn Protokoll leer -> Protokoll ist "http://"
89+ // If protocol is empty -> protocol is "http://"
9090 if ($ protocol == '' ) {
9191 $ protocol = "http:// " ;
9292 }
9393
94- // Wenn Port leer -> Port setzen auf 80 or 443
95- // (abh?ngig vom Protokoll )
94+ // If port is empty -> set port to 80 or 443
95+ // (depending on the protocol )
9696 if ($ port == '' ) {
9797 if (strtolower ($ protocol ) === 'http:// ' ) {
9898 $ port = 80 ;
@@ -102,12 +102,12 @@ public static function splitURL($url): array
102102 }
103103 }
104104
105- // Wenn Pfad leet -> Pfad ist "/"
105+ // If path is empty -> path is "/"
106106 if ($ path == '' ) {
107107 $ path = "/ " ;
108108 }
109109
110- // R?ckgabe-Array
110+ // Build the return array
111111 $ url_parts ['protocol ' ] = $ protocol ;
112112 $ url_parts ['host ' ] = $ host ;
113113 $ url_parts ['path ' ] = $ path ;
@@ -335,6 +335,14 @@ public static function buildURLFromLink($link, PHPCrawlerUrlPartsDescriptor $Bas
335335 return null ;
336336 }
337337
338+ if ($ link == 'http:// ' ) {
339+ return null ;
340+ }
341+
342+ if ($ link == 'https:// ' ) {
343+ return null ;
344+ }
345+
338346 // Now, at least, replace all HTMLENTITIES with normal text.
339347 // I.E.: HTML-Code of the link is: <a href="index.php?x=1&y=2">
340348 // -> Link has to be "index.php?x=1&y=2"
0 commit comments