Replace the PHP tidy extension with HTMLPurifier for HTML clean-up.

Touches README.md (requirements list), composer.json (drops "ext-tidy",
adds "ezyang/htmlpurifier") and the three places that cleaned HTML before
handing it to PHPHtmlParser\Dom: Category::parseHTML(),
Results::__construct() and Results::get_more_results().

Review fixes folded into this revision of the patch:
  * Category::parseHTML() keeps `$html = $this->data["form_html"];`.
    The previous revision deleted that line while still passing $html to
    purify(), leaving the variable undefined.
  * Results::get_more_results() is static, so it keeps the local `$dom`
    instead of `$this->dom`; the unchanged `$items = $dom->find(...)` line
    below it depends on that local.

NOTE(review): HTMLPurifier's default configuration is a whitelist aimed at
untrusted input and, as far as I know, drops form controls unless
HTML.Trusted is enabled -- confirm that Category still finds what it needs
in "form_html" after purification.

NOTE(review): indentation in the hunks below was reconstructed with four
spaces; if the target files use different whitespace, apply with
`git apply --ignore-whitespace`. The `index` lines are carried over from
the previous revision and are informational only.

diff --git a/README.md b/README.md
index a0273f0..001aed9 100644
--- a/README.md
+++ b/README.md
@@ -15,7 +15,7 @@ Another thing to point out is that the main focus for the time being is on Dubiz
 
 * [php-html-parse](https://github.com/paquettg/php-html-parser)
 * [php-curl-class](https://github.com/php-curl-class/php-curl-class)
-* [tidy extension for PHP](http://php.net/manual/en/book.tidy.php)
+* [HTMLPurifier](https://packagist.org/packages/ezyang/htmlpurifier)
 * PHP 5.3 or greater
 
 ## Installation
diff --git a/composer.json b/composer.json
index ede7eab..c54804c 100644
--- a/composer.json
+++ b/composer.json
@@ -14,9 +14,9 @@
     ],
     "require": {
         "php": ">=5.3.0",
-        "ext-tidy": "*",
         "paquettg/php-html-parser": "1.6.4",
-        "php-curl-class/php-curl-class": "^4.6"
+        "php-curl-class/php-curl-class": "^4.6",
+        "ezyang/htmlpurifier": "^4.7"
     },
     "autoload": {
         "psr-4": {"Dubizzle\\": "src/Dubizzle"}
diff --git a/src/Dubizzle/Category.php b/src/Dubizzle/Category.php
index dfe110d..788b238 100644
--- a/src/Dubizzle/Category.php
+++ b/src/Dubizzle/Category.php
@@ -3,6 +3,7 @@ namespace Dubizzle;
 
 
 use PHPHtmlParser\Dom;
+use HTMLPurifier;
 
 require_once 'lib/util.php';
 require_once 'lib/region.php';
@@ -23,23 +24,14 @@ public function get_models($key = null){
     }
 
     private function parseHTML(){
         $html = $this->data["form_html"];
-        # Initialize a HTML cleaner object.
-        $tidy = new \tidy;
-        $config = array(
-            'indent' => true,
-            'output-xhtml' => true,
-            'wrap' => 200);
-        # Fix HTML errors.
-        $tidy->parseString($html, $config, 'utf8');
-        $tidy->cleanRepair();
-
-        # Get the clean HTML string.
-        $tidyHTML = tidy_get_output($tidy);
+        # Clean HTML.
+        $purifier = new HTMLPurifier();
+        $clean_html = $purifier->purify($html);
 
         # Build a HTML parser to search for items.
         $this->dom = new Dom;
-        $this->dom->load($tidyHTML);
+        $this->dom->load($clean_html);
     }
 
     private function get_body_type(){
diff --git a/src/Dubizzle/Results.php b/src/Dubizzle/Results.php
index 42bef34..480aa37 100644
--- a/src/Dubizzle/Results.php
+++ b/src/Dubizzle/Results.php
@@ -3,6 +3,7 @@ namespace Dubizzle;
 
 
 use PHPHtmlParser\Dom;
+use HTMLPurifier;
 use Curl\MultiCurl;
 
 class Results{
@@ -18,22 +19,13 @@ class Results{
      * @param sting $url Base url of the search result page.
      */
     public function __construct($html, $num_results, $url){
-        # Initialize a HTML cleaner object.
-        $tidy = new \tidy;
-        $config = array(
-            'indent' => true,
-            'output-xhtml' => true,
-            'wrap' => 200);
-        # Fix HTML errors.
-        $tidy->parseString($html, $config, 'utf8');
-        $tidy->cleanRepair();
-
-        # Get the clean HTML string.
-        $tidyHTML = tidy_get_output($tidy);
+        # Clean HTML.
+        $purifier = new HTMLPurifier();
+        $clean_html = $purifier->purify($html);
 
         # Build a HTML parser to search for items.
         $this->dom = new Dom;
-        $this->dom->load($tidyHTML);
+        $this->dom->load($clean_html);
 
         $this->num_results = $num_results;
         $this->url = $url;
@@ -48,23 +40,13 @@ public function __construct($html, $num_results, $url){
      * @return PHPHtmlParser\Dom[] - List of items that was found on the current html page.
      */
     public static function get_more_results($html){
-        # Initialize a HTML cleaner object.
-        $tidy = new \tidy;
-        // Specify configuration
-        $config = array(
-            'indent' => true,
-            'output-xhtml' => true,
-            'wrap' => 200);
-        # Fix HTML errors.
-        $tidy->parseString($html, $config, 'utf8');
-        $tidy->cleanRepair();
-
-        # Get the clean HTML string.
-        $tidyHTML = tidy_get_output($tidy);
+        # Clean HTML.
+        $purifier = new HTMLPurifier();
+        $clean_html = $purifier->purify($html);
 
         # Build a HTML parser to search for items.
         $dom = new Dom;
-        $dom->load($tidyHTML);
+        $dom->load($clean_html);
 
         # Get result items from the HTML.
         $items = $dom->find(".listing-item");