From fe8035f8ca7837f171d40164ac58e71ca14fe4f8 Mon Sep 17 00:00:00 2001 From: Matthias Pigulla Date: Mon, 26 Jan 2026 11:14:22 +0100 Subject: [PATCH 1/3] Document limitation of parsing `body` fragments only, rename `processFragment` method --- README.md | 12 ++- src/Implementation/Html5TagRewriter.php | 30 +++--- src/TagRewriter.php | 2 +- src/Test/TagRewriterTestCase.php | 2 +- tests/Implementation/Html5TagRewriterTest.php | 96 ++++++++++++++----- 5 files changed, 101 insertions(+), 41 deletions(-) diff --git a/README.md b/README.md index 708b2e9..f04c176 100644 --- a/README.md +++ b/README.md @@ -25,9 +25,19 @@ $result = $rewriter->process($html); // Process an HTML fragment $fragment = '

Hello World

'; -$result = $rewriter->processFragment($fragment); +$result = $rewriter->processBodyFragment($fragment); ``` +> [!NOTE] +> The `processBodyFragment()` method is currently limited in that it can only process +> HTML strings that come from within the `<body>` section. This has to do with the +> HTML 5 parsing rules defining different [parsing states](https://html.spec.whatwg.org/multipage/parsing.html#parse-state), +> and the PHP DOM API for the HTML 5 parser currently does not expose +> a (documented) way to create fragments and pass the required context information. +> For correct results, you should limit its usage to fragments that shall be processed +> starting in the `in body` parsing state and where the `data state` [tokenization mode](https://html.spec.whatwg.org/multipage/parsing.html#tokenization) +> is active. + ### Creating a Handler Implement the `RewriteHandler` interface or extend `BaseRewriteHandler` to create custom tag transformations. diff --git a/src/Implementation/Html5TagRewriter.php b/src/Implementation/Html5TagRewriter.php index 97cd229..f5d844d 100644 --- a/src/Implementation/Html5TagRewriter.php +++ b/src/Implementation/Html5TagRewriter.php @@ -35,25 +35,25 @@ public function process(string $html5): string } #[Override] - public function processFragment(string $html5Fragment): string + public function processBodyFragment(string $html5Fragment): string { - $document = HTMLDocument::createEmpty(); - $container = $document->createElement('container'); - $document->appendChild($container); - - $temp = $document->createElement('temp'); - $temp->innerHTML = $html5Fragment; - - while ($temp->firstChild instanceof Node) { - $container->appendChild($temp->firstChild); - } + /* + * Different parser states and tokenization modes + * (https://html.spec.whatwg.org/multipage/parsing.html#parse-state, + * https://html.spec.whatwg.org/multipage/parsing.html#tokenization) + * may apply at different parts of the HTML input. 
Currently, there is + * no (documented) way to create HTML fragments with the necessary + * context with the new DOM API. So, for the time being, we must restrict + * handling of fragments to such inputs that can equally be considered to be + * placed directly after the `<body>` tag. + */ + $document = HTMLDocument::createFromString('', overrideEncoding: 'utf-8'); + $container = $document->body; + $container->innerHTML = $html5Fragment; $this->applyHandlers($document, $container); - /** @var string */ - $innerHTML = $container->innerHTML; - - return $this->cleanup($innerHTML); + return $this->cleanup($container->innerHTML); } private function applyHandlers(Document $document, Node $context): void diff --git a/src/TagRewriter.php b/src/TagRewriter.php index 17b8ee7..c5a2b4e 100644 --- a/src/TagRewriter.php +++ b/src/TagRewriter.php @@ -8,5 +8,5 @@ public function register(RewriteHandler $handler): void; public function process(string $html5): string; - public function processFragment(string $html5Fragment): string; + public function processBodyFragment(string $html5Fragment): string; } diff --git a/src/Test/TagRewriterTestCase.php b/src/Test/TagRewriterTestCase.php index 1b99daf..f3cf678 100644 --- a/src/Test/TagRewriterTestCase.php +++ b/src/Test/TagRewriterTestCase.php @@ -19,7 +19,7 @@ protected function setUp(): void public function assertRewriteResultEquals(string $expected, string $input): void { - $result = $this->rewriter->processFragment($input); + $result = $this->rewriter->processBodyFragment($input); $this->assertXmlStringEqualsXmlString($expected, $result); } diff --git a/tests/Implementation/Html5TagRewriterTest.php b/tests/Implementation/Html5TagRewriterTest.php index 3561c4e..d13edff 100644 --- a/tests/Implementation/Html5TagRewriterTest.php +++ b/tests/Implementation/Html5TagRewriterTest.php @@ -157,10 +157,15 @@ public static function providePreservedFragments(): iterable 'Just plain text', ]; - yield 'simple element' => [ + yield 'simple element from 
body (parsing in "data state")' => [ '

Hello

', ]; + // Results for processing content are undefined for now + // yield 'tags from head (parsing in "data state")' => [ + // 'Test no tags here', + // ]; + yield 'mixed content' => [ 'Text before emphasized text after', ]; @@ -184,9 +189,9 @@ public static function providePreservedFragments(): iterable #[Test] #[DataProvider('providePreservedFragments')] - public function processFragment_preserves_fragment(string $fragment): void + public function processBodyFragment_preserves_fragment(string $fragment): void { - $result = $this->rewriter->processFragment($fragment); + $result = $this->rewriter->processBodyFragment($fragment); self::assertSame($fragment, $result); } @@ -202,19 +207,64 @@ public static function provideFragmentsCleanedUp(): iterable '

<script> & "quotes"

', '

<script> & "quotes"

', ]; + + yield 'textarea uses RCDATA state' => [ + '', + '', + ]; + + // Results for processing content are undefined for now + // yield 'title tag from head uses RCDATA state' => [ + // 'Test <h1> no tags here', + // 'Test <h1> no tags here', + // ]; } #[Test] #[DataProvider('provideFragmentsCleanedUp')] - public function processFragment_cleans_up_fragment(string $input, string $expected): void + public function processBodyFragment_cleans_up_fragment(string $input, string $expected): void { - $result = $this->rewriter->processFragment($input); + $result = $this->rewriter->processBodyFragment($input); self::assertSame($expected, $result); } + public static function provideFragmentsWithBrokenHandling(): iterable + { + yield 'full head section' => [ + 'Test', + ]; + + yield 'empty head section' => [ + '', + ]; + + yield 'body with content' => [ + '

test

more test

', + ]; + + yield 'empty body' => [ + '', + ]; + + yield 'head and body with content' => [ + 'Test

test

', + ]; + + yield 'empty head and body' => [ + '', + ]; + } + + #[Test] + #[DataProvider('provideFragmentsWithBrokenHandling')] + public function processBodyFragment_currently_cannot_process_fragments_with_head_or_body_tags(string $input): void + { + $this->markTestSkipped('Parsing of fragments that are not part of the `body` is currently not supported.'); + } + #[Test] - public function processFragment_applies_handler(): void + public function processBodyFragment_applies_handler(): void { $handler = new TestRewriteHandler('//html:strong'); $handler->onMatch(function (Element $element) { @@ -222,17 +272,17 @@ public function processFragment_applies_handler(): void }); $this->rewriter->register($handler); - $result = $this->rewriter->processFragment('Text'); + $result = $this->rewriter->processBodyFragment('Text'); self::assertSame('Text', $result); } #[Test] - public function processFragment_matches_multiple_elements(): void + public function processBodyFragment_matches_multiple_elements(): void { $handler = new TestRewriteHandler('//html:p'); $this->rewriter->register($handler); - $this->rewriter->processFragment('

First

Second

Third

'); + $this->rewriter->processBodyFragment('

First

Second

Third

'); self::assertSame(3, $handler->matchCallCount); } @@ -246,7 +296,7 @@ public function register_allows_multiple_handlers(): void $this->rewriter->register($handler1); $this->rewriter->register($handler2); - $this->rewriter->processFragment('

paragraph

div section
'); + $this->rewriter->processBodyFragment('

paragraph

div section
'); self::assertSame(1, $handler1->matchCallCount); self::assertSame(1, $handler2->matchCallCount); @@ -258,7 +308,7 @@ public function handler_can_match_HTML_elements(): void $handler = new TestRewriteHandler('//html:a'); $this->rewriter->register($handler); - $this->rewriter->processFragment('Link 1Link 2'); + $this->rewriter->processBodyFragment('Link 1Link 2'); self::assertSame(2, $handler->matchCallCount); } @@ -270,7 +320,7 @@ public function handler_can_match_SVG_elements(): void $this->rewriter->register($handler); $svg = ''; - $this->rewriter->processFragment($svg); + $this->rewriter->processBodyFragment($svg); self::assertSame(1, $handler->matchCallCount); } @@ -282,7 +332,7 @@ public function handler_can_match_MathML_elements(): void $this->rewriter->register($handler); $mathml = 'x'; - $this->rewriter->processFragment($mathml); + $this->rewriter->processBodyFragment($mathml); self::assertSame(1, $handler->matchCallCount); } @@ -293,7 +343,7 @@ public function handler_match_is_called_for_each_matching_element(): void $handler = new TestRewriteHandler('//html:p'); $this->rewriter->register($handler); - $this->rewriter->processFragment('

1

2

3

4

5

'); + $this->rewriter->processBodyFragment('

1

2

3

4

5

'); self::assertSame(5, $handler->matchCallCount); self::assertCount(5, $handler->matchedElements); @@ -313,7 +363,7 @@ public function handler_after_matches_is_called_after_all_matches(): void }); $this->rewriter->register($handler); - $this->rewriter->processFragment('

1

2

'); + $this->rewriter->processBodyFragment('

1

2

'); self::assertSame(['match', 'match', 'afterMatches'], $callOrder); } @@ -331,7 +381,7 @@ public function handler_afterMatches_receives_document_and_xpath(): void }); $this->rewriter->register($handler); - $this->rewriter->processFragment('

Test

'); + $this->rewriter->processBodyFragment('

Test

'); self::assertNotNull($receivedDocument); self::assertNotNull($receivedXPath); @@ -343,7 +393,7 @@ public function handler_afterMatches_is_called_also_with_no_matches(): void $handler = new TestRewriteHandler('//html:nonexistent'); $this->rewriter->register($handler); - $this->rewriter->processFragment('

No matching elements

'); + $this->rewriter->processBodyFragment('

No matching elements

'); self::assertSame(0, $handler->matchCallCount); self::assertSame(1, $handler->afterMatchesCallCount); @@ -359,7 +409,7 @@ public function handler_can_modify_element_attributes(): void }); $this->rewriter->register($handler); - $result = $this->rewriter->processFragment('Link'); + $result = $this->rewriter->processBodyFragment('Link'); self::assertStringContainsString('rel="noopener"', $result); self::assertStringContainsString('target="_blank"', $result); @@ -374,7 +424,7 @@ public function handler_can_modify_element_content(): void }); $this->rewriter->register($handler); - $result = $this->rewriter->processFragment('

Hello

'); + $result = $this->rewriter->processBodyFragment('

Hello

'); self::assertStringContainsString('Hello', $result); } @@ -388,7 +438,7 @@ public function handler_can_remove_elements(): void }); $this->rewriter->register($handler); - $result = $this->rewriter->processFragment('

Text

More

'); + $result = $this->rewriter->processBodyFragment('

Text

More

'); self::assertStringNotContainsString('