Fixed parser problem with some websites. Changed project name.

AnzenKodo · Sep 9, 2022 · a367ef1 · a367ef1
1 parent 2681c29
commit a367ef1
Show file tree

Hide file tree

Showing 2 changed files with 157 additions and 91 deletions.
diff --git a/composer.json b/composer.json
@@ -1,25 +1,34 @@
 {
-	"name": "anzenkodo/small-rss-php",
-	"description": "RSS & Atom feed parser for PHP. Very small and easy-to-use library for parsing your feeds.",
-	"keywords": ["rss", "atom", "feed"],
-	"homepage": "https://github.com/AnzenKodo/small-rss-php",
-	"license": ["unlicense"],
-	"authors": [
-		{
-			"name": "AnzenKodo",
-			"email": "AnzenKodo@altmails.com",
-			"homepage": "https://AnzenKodo.github.io/AnzenKodo",
-			"role": "Developer"
-		}
-	],
-	"support": {
-		"issues": "https://github.com/AnzenKodo/small-rss-php/issues",
-		"source": "https://github.com/AnzenKodo/small-rss-php"
-	},
-	"require": {
-		"php": ">=8.0"
-	},
-	"autoload": {
-		"classmap": ["src/"]
-	}
+  "name": "anzenkodo/rss-atom-parser",
+  "description": "RSS & Atom feed parser for PHP. Very small and easy-to-use library for parsing your feeds.",
+  "type": "library",
+  "keywords": [ "rss", "atom", "feed" ],
+  "homepage": "https://github.com/AnzenKodo/rss-atom-parser",
+  "readme": "https://github.com/AnzenKodo/rss-atom-parser/blob/main/README.md",
+  "license": "MIT",
+  "authors": [
+    {
+      "name": "AnzenKodo",
+      "email": "AnzenKodo@altmails.com",
+      "homepage": "https://AnzenKodo.github.io/AnzenKodo",
+      "role": "Developer"
+    }
+  ],
+  "support": {
+    "issues": "https://github.com/AnzenKodo/rss-atom-parser/issues",
+    "source": "https://github.com/AnzenKodo/rss-atom-parser",
+    "docs": "https://github.com/AnzenKodo/rss-atom-parser/blob/main/README.md"
+  },
+  "funding": [
+    {
+      "type": "website",
+      "url": "https://AnzenKodo.github.io/AnzenKodo#support"
+    }
+  ],
+  "require": {
+    "php": ">=8.0"
+  },
+  "autoload": {
+    "classmap": [ "src/" ]
+  }
 }
diff --git a/src/rss.php b/src/rss.php
@@ -1,121 +1,176 @@
 <?php
 
 class RSS {
-	public static function feed(string|array|object $url): array {
+	public static $useragent = "FeedFetcher-Google";
+
+	public static function feed(string|array|object $url): object {
 		$data = array();
 
 		// If there are more then 1 $url in object and array type.
 		if (is_object($url) or is_array($url))
-			$data = self::multi_feed_data($url);
+			$data = self::getFeeds($url);
 		// If only 1 $url in string type.
-		else if (is_string($url))
-			$data = self::feed_data($url);
-		else {
+		else if (is_string($url)) {
+			$data = self::getFeed($url);
+		} else {
+			throw new ErrorException("Can't find datatype of {$url}");
 		}
 
 		return $data;
 	}
 
 	// For multiple feeds in array and object type only.
-	public static function multi_feed_data(array|object $urls): array {
+	public static function getFeeds(array|object $urls): object {
 		$data = array();
 
 		foreach ($urls as $url) {
 			$data[] = self::feed($url);
 		}
-		// Sort feed by time
-		usort($data, function($t1, $t2) {
-			$time1 = $t1['date']." ".$t1['time'];
-			$time2 = $t2['date']." ".$t2['time'];
 
-			return strtotime($time1) + strtotime($time2);
-		});
-
-		return $data;
+		// Sort feed by time
+		$ord = array();
+		foreach ($data as $key){
+		    $ord[] = strtotime($key->item[0]->date);
+		}
+		array_multisort($ord, SORT_DESC, $data);
+
+		return (object)$data;
 	}
 
 	// For single feed in string type.
-	public static function feed_data(string $url): array {
+	public static function getFeed(string $url): object {
+		ini_set('user_agent', self::$useragent);
+
 		// Checks if $url content type is html. If it is HTML then finds the Feed
 		// url and changes $url to feed url.
+		$its_html = self::checkContentType($url, "html");
+		if($its_html) {
+			$content_html = self::getContent($url);
+			$url = self::getFeedUrl($content_html);
+		}
 
-		$its_html = self::check_content_type($url, "html");
-		if($its_html) { $url = self::get_feed_url($content); }
+		$content = self::getContent($url);
 
-		$content = self::get_content($url);
+		$xml = new SimpleXmlElement($content, true);
+		$data = (object)array();
+		if ($xml->channel) {					// If feed is RSS (has a <channel> element)
+			$data = self::getRSS($xml, $url);
+		}	else if($xml->entry) {			// If feed is Atom (has <entry> elements)
+			$data = self::getAtom($xml, $url);
+		} else {
+		}
 
-		$xml = self::get_feed($content);
+		return (object)$data;
+	}
 
-		return self::format_data($xml);
+	private static function isHttp(string $str, string $url, bool $boolean = false) {
+		if (preg_match("/^(https|http):\/\//", $str)) return $str;
+
+		if ($str[0] == "/") return preg_replace('/(?<=(\w|\d))\/.*$/', $str, $url);
+
+		if ($boolean) return false;
+
+		return $url;
 	}
+
+	private static function getAtom(SimpleXMLElement $xml, string $url): array {
+		$title = $xml->id ? $xml->id : $url;
+		$title = $xml->title != "" ? $xml->title : $title;
+
+		// Find the site link (prefer the rel="alternate" link)
+		$replace = "";
+		if (isset($xml->link)) {
+			for ($x = 0; $x < 2; $x++) {
+				if ($xml->link[$x]["rel"] == "alternate") {
+					$replace = $xml->link[$x]["href"];
+					break;
+				} else {
+					$replace = $url;
+				}
+			}
+		} else if (isset($xml->author->uri)) {
+			$replace = $xml->author->uri;
+		} else {
+			$replace = $url;
+		}
 
-	// Simplifies the feed data.
-	private static function format_data(object $xml): array {
-		// Formats time and returns $type of time format.
-		$time_format = function(object|string $time, string $format = "d-M-Y H:i"):
-			string {
+		$data = [
+			"title" => $title,
+			"description" => "$xml->subtitle",
+			"date" => self::timeFormat($xml->updated, "d-M-Y"),
+			"time" => self::timeFormat($xml->updated, "H:i"),
+			"link" => self::isHttp($replace, $url, true) ? self::isHttp($replace, $url) : self::isHttp($xml->id, $url),
+			"feed" => "$url",
+			"item" => array()
+		];
 
-			// Convert to time format
-			$timestamp = strtotime("$time");
+		foreach ($xml->entry as $item) {
+			$time =	$item->updated ? $item->updated : $item->published;
 
-			$time = date($format, $timestamp);
+			$data_item = [
+				"title" => "$item->title",
+				"link" => self::isHttp($item->link["href"], $url),
+				"date" => self::timeFormat($time, "d-M-Y"),
+				"time" => self::timeFormat($time, "H:i")
+			];
 
-			return $time;
-		};
+			array_push($data["item"], (object)$data_item);
+		}
+
+		return $data;
+	}
+
+	private static function getRSS(SimpleXMLElement $xml, string $url): array {
+		$xml = $xml->channel;
+		$time =	$xml->lastBuildDate ? $xml->lastBuildDate : $xml->pubDate;
 
-		// Formats the xml data
 		$data = [
-			"title" => "$xml->title",
+			"title" => $xml->title != "" ? $xml->title : $xml->link,
 			"description" => "$xml->description",
-			"date" => $time_format($xml->lastBuildDate, "d-M-Y"),
-			"time" => $time_format($xml->lastBuildDate, "H:i"),
-			"link" => "$xml->link",
+			"date" => self::timeFormat($time, "d-M-Y"),
+			"time" => self::timeFormat($time, "H:i"),
+			"link" => $xml->link != "" ? self::isHttp($xml->link, $url) : $url,
+			"feed" => $url,
 			"item" => array()
 		];
 
-		// Formats the xml items data
 		foreach ($xml->item as $item) {
 			$data_item = [
-				"title" => "$item->title",
-				"link" => "$item->link",
-				"date" => $time_format($item->pubDate, "d-M-Y"),
-				"time" => $time_format($item->pubDate, "H:i")
+				"title" => $item->title != "" ? $item->title : $item->link,
+				"link" => $item->link ? self::isHttp($item->link, $url) : $item->enclosure["url"],
+				"date" => self::timeFormat($item->pubDate, "d-M-Y"),
+				"time" => self::timeFormat($item->pubDate, "H:i")
 			];
 
-			array_push($data["item"], $data_item);
+			array_push($data["item"], (object)$data_item);
 		}
 
 		return $data;
 	}
 
-	// Checks feed xml version and returns content as per feed
-	private static function get_feed(string $content): object {
-		// Converts $content to object
-		$xml_data = new SimpleXmlElement($content, LIBXML_NOCDATA);
+	private static function timeFormat(object|string $time, string $format =
+		"d-M-Y H:i"): string {
 
-		$xml = (object)array();				// Convert array to object
+			// Convert to time format
+			$timestamp = strtotime("$time");
 
-		if ($xml_data->channel)				// If feed is xml vesrion 1.0
-			$xml = $xml_data->channel;
-		else if($xml_data->entry)			// If feed is xml version 2.0
-			$xml = $xml_data;
-		else {
-			/* self::console_log("$url: Invalid content type"); */
-		}
+			$time = date($format, $timestamp);
 
-		return $xml;
+			return $time;
 	}
 
 	// Returns given $url page content
-	private static function get_content(string $url, $useragent = "FeedFetcher-Google"): string {
+	private static function getContent(string $url): string {
 		$curl = curl_init($url);
 
 		// Return the transfer as a string.
 		curl_setopt($curl, CURLOPT_RETURNTRANSFER, true);
 		// The maximum number of seconds to allow cURL functions to execute.
 		curl_setopt($curl, CURLOPT_TIMEOUT, 60);
 		// Changes curl useragent
-		curl_setopt($curl, CURLOPT_USERAGENT, $useragent);
+		curl_setopt($curl, CURLOPT_USERAGENT, self::$useragent);
+		// To make cURL follow a redirect
+		curl_setopt($curl, CURLOPT_FOLLOWLOCATION, true);
 
 		// Given $url content
 		$content = curl_exec($curl);
@@ -126,7 +181,7 @@ private static function get_content(string $url, $useragent = "FeedFetcher-Googl
 	}
 
 	// Finds Feed url from HTML.
-	private static function get_feed_url(string $content): string {
+	private static function getFeedUrl(string $content): string {
 		// Suppresses DOMDocuments errors
 		libxml_use_internal_errors(true);
 
@@ -153,24 +208,26 @@ private static function get_feed_url(string $content): string {
 
 	// Checks given $content_type_name value matches given $url header content
 	// type.
-	private static function check_content_type(string $url, string
+	private static function checkContentType(string $url, string
 		$type_name): bool {
 		// Get $url header
 		$header = get_headers($url, true);
-
-		$content_type = "Content-Type";
-		// Contet type in lower case
-		$content_type_lower = strtolower($content_type);
+		$content_type = "";
 
 		// Find $type_name in header
-		$content_pos = strpos($header[$content_type], $type_name);
-		$content_pos_lower = strpos($header[$content_type_lower], $type_name);
-
-		if ($content_pos)
+		if (isset($header["Content-Type"]))
+			$content_type = $header["Content-Type"];
+		else
+			$content_type = $header["content-type"];
+
+		if (gettype($content_type) == "array")
+			$content_type = $content_type[0];
+
+		if (strpos($content_type, $type_name))
 			$checked_header = true;
-		else if ($content_pos_lower)
+		else {
 			$checked_header = false;
-		else {}
+		}
 
 		return $checked_header;
 	}