Skip to content

Commit

Permalink
Replaced tidy with HTMLPurifier.
Browse files Browse the repository at this point in the history
  • Loading branch information
b4oshany committed Oct 31, 2015
1 parent 4e6f4f9 commit 1891d17
Show file tree
Hide file tree
Showing 4 changed files with 18 additions and 45 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ Another thing to point out is that the main focus for the time being is on Dubiz

* [php-html-parser](https://github.com/paquettg/php-html-parser)
* [php-curl-class](https://github.com/php-curl-class/php-curl-class)
* [tidy extension for PHP](http://php.net/manual/en/book.tidy.php)
* [HTMLPurifier](https://packagist.org/packages/ezyang/htmlpurifier)
* PHP 5.3 or greater

## Installation
Expand Down
4 changes: 2 additions & 2 deletions composer.json
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,9 @@
],
"require": {
"php": ">=5.3.0",
"ext-tidy": "*",
"paquettg/php-html-parser": "1.6.4",
"php-curl-class/php-curl-class": "^4.6"
"php-curl-class/php-curl-class": "^4.6",
"ezyang/htmlpurifier": "^4.7"
},
"autoload": {
"psr-4": {"Dubizzle\\": "src/Dubizzle"}
Expand Down
19 changes: 5 additions & 14 deletions src/Dubizzle/Category.php
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
namespace Dubizzle;

use PHPHtmlParser\Dom;
use HTMLPurifier;

require_once 'lib/util.php';
require_once 'lib/region.php';
Expand All @@ -23,23 +24,13 @@ public function get_models($key = null){
}

private function parseHTML(){
$html = $this->data["form_html"];
# Initialize a HTML cleaner object.
$tidy = new \tidy;
$config = array(
'indent' => true,
'output-xhtml' => true,
'wrap' => 200);
# Fix HTML errors.
$tidy->parseString($html, $config, 'utf8');
$tidy->cleanRepair();

# Get the clean HTML string.
$tidyHTML = tidy_get_output($tidy);
# Clean HTML.
$purifier = new HTMLPurifier();
$clean_html = $purifier->purify($html);

# Build a HTML parser to search for items.
$this->dom = new Dom;
$this->dom->load($tidyHTML);
$this->dom->load($clean_html);
}

private function get_body_type(){
Expand Down
38 changes: 10 additions & 28 deletions src/Dubizzle/Results.php
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
namespace Dubizzle;

use PHPHtmlParser\Dom;
use HTMLPurifier;
use Curl\MultiCurl;

class Results{
Expand All @@ -18,22 +19,13 @@ class Results{
* @param string $url Base url of the search result page.
*/
public function __construct($html, $num_results, $url){
# Initialize a HTML cleaner object.
$tidy = new \tidy;
$config = array(
'indent' => true,
'output-xhtml' => true,
'wrap' => 200);
# Fix HTML errors.
$tidy->parseString($html, $config, 'utf8');
$tidy->cleanRepair();

# Get the clean HTML string.
$tidyHTML = tidy_get_output($tidy);
# Clean HTML.
$purifier = new HTMLPurifier();
$clean_html = $purifier->purify($html);

# Build a HTML parser to search for items.
$this->dom = new Dom;
$this->dom->load($tidyHTML);
$this->dom->load($clean_html);

$this->num_results = $num_results;
$this->url = $url;
Expand All @@ -48,23 +40,13 @@ public function __construct($html, $num_results, $url){
* @return PHPHtmlParser\Dom[] - List of items that were found on the current HTML page.
*/
public static function get_more_results($html){
# Initialize a HTML cleaner object.
$tidy = new \tidy;
// Specify configuration
$config = array(
'indent' => true,
'output-xhtml' => true,
'wrap' => 200);
# Fix HTML errors.
$tidy->parseString($html, $config, 'utf8');
$tidy->cleanRepair();

# Get the clean HTML string.
$tidyHTML = tidy_get_output($tidy);
# Clean HTML.
$purifier = new HTMLPurifier();
$clean_html = $purifier->purify($html);

# Build a HTML parser to search for items.
$dom = new Dom;
$dom->load($tidyHTML);
$dom = new Dom;
$dom->load($clean_html);

# Get result items from the HTML.
$items = $dom->find(".listing-item");
Expand Down

0 comments on commit 1891d17

Please sign in to comment.