From 6573ea6e3c30b3b23274b0a6290d173fde1ebb70 Mon Sep 17 00:00:00 2001 From: Weston Ruter Date: Wed, 23 Oct 2024 22:03:23 +0000 Subject: [PATCH] HTML API: Fix extensibility of `WP_HTML_Processor::next_token()`. Break out logic from the `next_token()` method into a private method which may call itself recursively. This allows for subclasses to override the `next_token()` method and be assured that each call to `next_token()` corresponds with the consumption of one single token. This also parallels how `WP_HTML_Tag_Processor::next_token()` wraps a private `base_class_next_token()` method. Props westonruter, jonsurrell. Fixes #62269. git-svn-id: https://develop.svn.wordpress.org/trunk@59285 602fd350-edb4-49c9-b593-d223f7449a82 --- .../html-api/class-wp-html-processor.php | 28 ++- .../html-xpath-generating-processor.php | 88 +++++++++ .../tests/html-api/wpHtmlProcessor.php | 176 ++++++++++++++++++ 3 files changed, 286 insertions(+), 6 deletions(-) create mode 100644 tests/phpunit/data/html-api/html-xpath-generating-processor.php diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index b91c244887287..4003cf48c1b69 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -603,6 +603,22 @@ public function next_tag( $query = null ): bool { return false; } + /** + * Finds the next token in the HTML document. + * + * This doesn't currently have a way to represent non-tags and doesn't process + * semantic rules for text nodes. For access to the raw tokens consider using + * WP_HTML_Tag_Processor instead. + * + * @since 6.5.0 Added for internal support; do not use. + * @since 6.7.1 Refactored so subclasses may extend. + * + * @return bool Whether a token was parsed. 
+ */ + public function next_token(): bool { + return $this->_next_token(); + } + /** * Ensures internal accounting is maintained for HTML semantic rules while * the underlying Tag Processor class is seeking to a bookmark. @@ -611,13 +627,13 @@ public function next_tag( $query = null ): bool { * semantic rules for text nodes. For access to the raw tokens consider using * WP_HTML_Tag_Processor instead. * - * @since 6.5.0 Added for internal support; do not use. + * @since 6.7.1 Added for internal support; do not use. * * @access private * * @return bool */ - public function next_token(): bool { + private function _next_token(): bool { $this->current_element = null; if ( isset( $this->last_error ) ) { @@ -635,7 +651,7 @@ public function next_token(): bool { * tokens works in the meantime and isn't obviously wrong. */ if ( empty( $this->element_queue ) && $this->step() ) { - return $this->next_token(); + return $this->_next_token(); } // Process the next event on the queue. @@ -646,7 +662,7 @@ public function next_token(): bool { continue; } - return empty( $this->element_queue ) ? false : $this->next_token(); + return empty( $this->element_queue ) ? false : $this->_next_token(); } $is_pop = WP_HTML_Stack_Event::POP === $this->current_element->operation; @@ -657,7 +673,7 @@ public function next_token(): bool { * the breadcrumbs. */ if ( 'root-node' === $this->current_element->token->bookmark_name ) { - return $this->next_token(); + return $this->_next_token(); } // Adjust the breadcrumbs for this event. @@ -669,7 +685,7 @@ public function next_token(): bool { // Avoid sending close events for elements which don't expect a closing. if ( $is_pop && ! 
$this->expects_closer( $this->current_element->token ) ) { - return $this->next_token(); + return $this->_next_token(); } return true; diff --git a/tests/phpunit/data/html-api/html-xpath-generating-processor.php b/tests/phpunit/data/html-api/html-xpath-generating-processor.php new file mode 100644 index 0000000000000..e601c1493e121 --- /dev/null +++ b/tests/phpunit/data/html-api/html-xpath-generating-processor.php @@ -0,0 +1,88 @@ + + */ + public $token_seen_count = array(); + + /** + * Previous depth. + * + * @var int + */ + private $previous_depth = 0; + + /** + * Open stack indices. + * + * @since n.e.x.t + * @var array + */ + private $open_stack_indices = array(); + + /** + * Gets XPath for the current open tag. + * + * @return string XPath. + */ + public function get_xpath(): string { + $xpath = ''; + foreach ( $this->open_stack_indices as $level ) { + $xpath .= sprintf( '/*[%d][self::%s]', $level['index'] + 1, $level['tag_name'] ); + } + return $xpath; + } + + /** + * Gets next token. + * + * @return bool Whether next token was matched. + */ + public function next_token(): bool { + $result = parent::next_token(); + $current_depth = $this->get_current_depth(); + $current_tag = $this->get_tag(); + + $current_depth--; // Because HTML starts at depth 1. + + if ( $this->get_token_type() === '#tag' ) { + $token_name = ( $this->is_tag_closer() ? '-' : '+' ) . $current_tag; + } else { + $token_name = $this->get_token_name(); + } + + if ( ! isset( $this->token_seen_count[ $token_name ] ) ) { + $this->token_seen_count[ $token_name ] = 1; + } else { + ++$this->token_seen_count[ $token_name ]; + } + + if ( $this->get_token_type() === '#tag' && ! $this->is_tag_closer() ) { + if ( $current_depth < $this->previous_depth ) { + array_splice( + $this->open_stack_indices, + $current_depth + 1 + ); + } + + if ( ! 
isset( $this->open_stack_indices[ $current_depth ] ) ) { + $this->open_stack_indices[ $current_depth ] = array( + 'tag_name' => $current_tag, + 'index' => 0, + ); + } else { + $this->open_stack_indices[ $current_depth ]['tag_name'] = $current_tag; + ++$this->open_stack_indices[ $current_depth ]['index']; + } + + $this->previous_depth = $current_depth; + } + + return $result; + } + +} diff --git a/tests/phpunit/tests/html-api/wpHtmlProcessor.php b/tests/phpunit/tests/html-api/wpHtmlProcessor.php index 792f2971421b3..7e568286ccdf9 100644 --- a/tests/phpunit/tests/html-api/wpHtmlProcessor.php +++ b/tests/phpunit/tests/html-api/wpHtmlProcessor.php @@ -882,4 +882,180 @@ public function test_ensure_form_tag_closer_token_is_reachable() { $this->assertSame( 'FORM', $processor->get_tag() ); $this->assertTrue( $processor->is_tag_closer() ); } + + /** + * Data provider. + * + * @return array + */ + public function data_html_processor_with_extended_next_token() { + return array( + 'single_instance_per_tag' => array( + 'html' => ' + + + + Hello World + + +

Hello World!

+ +

Each tag should occur only once in this document. +

The end.
+ + + ', + 'expected_token_counts' => array( + '+HTML' => 1, + '+HEAD' => 1, + '#text' => 14, + '+META' => 1, + '+TITLE' => 1, + '-HEAD' => 1, + '+BODY' => 1, + '+H1' => 1, + '-H1' => 1, + '+IMG' => 1, + '+P' => 1, + '#comment' => 1, + '-P' => 1, + '+FOOTER' => 1, + '-FOOTER' => 1, + '-BODY' => 1, + '-HTML' => 1, + '' => 1, + ), + 'expected_xpaths' => array( + 0 => '/*[1][self::HTML]', + 1 => '/*[1][self::HTML]/*[1][self::HEAD]', + 2 => '/*[1][self::HTML]/*[1][self::HEAD]/*[1][self::META]', + 3 => '/*[1][self::HTML]/*[1][self::HEAD]/*[2][self::TITLE]', + 4 => '/*[1][self::HTML]/*[2][self::BODY]', + 5 => '/*[1][self::HTML]/*[2][self::BODY]/*[1][self::H1]', + 6 => '/*[1][self::HTML]/*[2][self::BODY]/*[2][self::IMG]', + 7 => '/*[1][self::HTML]/*[2][self::BODY]/*[3][self::P]', + 8 => '/*[1][self::HTML]/*[2][self::BODY]/*[4][self::FOOTER]', + ), + ), + + 'multiple_tag_instances' => array( + 'html' => ' + + +

Hello World!

+

First +

Second +

Third +

+ + + ', + 'expected_token_counts' => array( + '+HTML' => 1, + '+HEAD' => 1, + '-HEAD' => 1, + '+BODY' => 1, + '#text' => 13, + '+H1' => 1, + '-H1' => 1, + '+P' => 3, + '-P' => 3, + '+UL' => 1, + '+LI' => 3, + '-LI' => 3, + '-UL' => 1, + '-BODY' => 1, + '-HTML' => 1, + '' => 1, + ), + 'expected_xpaths' => array( + 0 => '/*[1][self::HTML]', + 1 => '/*[1][self::HTML]/*[1][self::HEAD]', + 2 => '/*[1][self::HTML]/*[2][self::BODY]', + 3 => '/*[1][self::HTML]/*[2][self::BODY]/*[1][self::H1]', + 4 => '/*[1][self::HTML]/*[2][self::BODY]/*[2][self::P]', + 5 => '/*[1][self::HTML]/*[2][self::BODY]/*[3][self::P]', + 6 => '/*[1][self::HTML]/*[2][self::BODY]/*[4][self::P]', + 7 => '/*[1][self::HTML]/*[2][self::BODY]/*[5][self::UL]', + 8 => '/*[1][self::HTML]/*[2][self::BODY]/*[5][self::UL]/*[1][self::LI]', + 9 => '/*[1][self::HTML]/*[2][self::BODY]/*[5][self::UL]/*[2][self::LI]', + 10 => '/*[1][self::HTML]/*[2][self::BODY]/*[5][self::UL]/*[3][self::LI]', + ), + ), + + 'extreme_nested_formatting' => array( + 'html' => ' + + +

+ FORMAT +

+ + + ', + 'expected_token_counts' => array( + '+HTML' => 1, + '+HEAD' => 1, + '-HEAD' => 1, + '+BODY' => 1, + '#text' => 7, + '+P' => 1, + '+STRONG' => 1, + '+EM' => 1, + '+STRIKE' => 1, + '+I' => 1, + '+B' => 1, + '+U' => 1, + '-U' => 1, + '-B' => 1, + '-I' => 1, + '-STRIKE' => 1, + '-EM' => 1, + '-STRONG' => 1, + '-P' => 1, + '-BODY' => 1, + '-HTML' => 1, + '' => 1, + ), + 'expected_xpaths' => array( + 0 => '/*[1][self::HTML]', + 1 => '/*[1][self::HTML]/*[1][self::HEAD]', + 2 => '/*[1][self::HTML]/*[2][self::BODY]', + 3 => '/*[1][self::HTML]/*[2][self::BODY]/*[1][self::P]', + 4 => '/*[1][self::HTML]/*[2][self::BODY]/*[1][self::P]/*[1][self::STRONG]', + 5 => '/*[1][self::HTML]/*[2][self::BODY]/*[1][self::P]/*[1][self::STRONG]/*[1][self::EM]', + 6 => '/*[1][self::HTML]/*[2][self::BODY]/*[1][self::P]/*[1][self::STRONG]/*[1][self::EM]/*[1][self::STRIKE]', + 7 => '/*[1][self::HTML]/*[2][self::BODY]/*[1][self::P]/*[1][self::STRONG]/*[1][self::EM]/*[1][self::STRIKE]/*[1][self::I]', + 8 => '/*[1][self::HTML]/*[2][self::BODY]/*[1][self::P]/*[1][self::STRONG]/*[1][self::EM]/*[1][self::STRIKE]/*[1][self::I]/*[1][self::B]', + 9 => '/*[1][self::HTML]/*[2][self::BODY]/*[1][self::P]/*[1][self::STRONG]/*[1][self::EM]/*[1][self::STRIKE]/*[1][self::I]/*[1][self::B]/*[1][self::U]', + ), + ), + ); + } + + /** + * Ensures that subclasses of WP_HTML_Processor can do bookkeeping by extending the next_token() method. + * + * @ticket 62269 + * @dataProvider data_html_processor_with_extended_next_token + */ + public function test_ensure_next_token_method_extensibility( $html, $expected_token_counts, $expected_xpaths ) { + require_once DIR_TESTDATA . '/html-api/html-xpath-generating-processor.php'; + + $processor = HTML_XPath_Generating_Processor::create_full_parser( $html ); + $actual_xpaths = array(); + while ( $processor->next_tag() ) { + if ( ! 
$processor->is_tag_closer() ) { + $processor->set_attribute( 'xpath', $processor->get_xpath() ); + $actual_xpaths[] = $processor->get_xpath(); + } + } + + $this->assertEquals( $expected_token_counts, $processor->token_seen_count, 'Snapshot: ' . var_export( $processor->token_seen_count, true ) ); + $this->assertEquals( $expected_xpaths, $actual_xpaths, 'Snapshot: ' . var_export( $actual_xpaths, true ) ); + } }