diff --git a/README.md b/README.md index 708b2e9..5ffd211 100644 --- a/README.md +++ b/README.md @@ -25,9 +25,19 @@ $result = $rewriter->process($html); // Process an HTML fragment $fragment = '
Hello World
'; -$result = $rewriter->processFragment($fragment); +$result = $rewriter->processBodyFragment($fragment); ``` +> [!NOTE] +> The `processBodyFragment()` method is currently limited in that it can only process +> HTML strings that come from within the `<body>` section. This has to do with the +> HTML 5 parsing rules defining different [parsing states](https://html.spec.whatwg.org/multipage/parsing.html#parse-state), +> and the PHP DOM API for the HTML 5 parser currently does not expose +> a (documented) way to create fragments and pass the required context information. +> For correct results, you should limit its usage to fragments that shall be processed +> starting in the `in body` parsing state and where the `data state` [tokenization mode](https://html.spec.whatwg.org/multipage/parsing.html#tokenization) +> is active. + ### Creating a Handler Implement the `RewriteHandler` interface or extend `BaseRewriteHandler` to create custom tag transformations. diff --git a/src/Implementation/Html5TagRewriter.php b/src/Implementation/Html5TagRewriter.php index 97cd229..9412d70 100644 --- a/src/Implementation/Html5TagRewriter.php +++ b/src/Implementation/Html5TagRewriter.php @@ -35,25 +35,26 @@ public function process(string $html5): string } #[Override] - public function processFragment(string $html5Fragment): string + public function processBodyFragment(string $html5Fragment): string { - $document = HTMLDocument::createEmpty(); - $container = $document->createElement('container'); - $document->appendChild($container); - - $temp = $document->createElement('temp'); - $temp->innerHTML = $html5Fragment; - - while ($temp->firstChild instanceof Node) { - $container->appendChild($temp->firstChild); - } + /* + * Different parser states and tokenization modes + * (https://html.spec.whatwg.org/multipage/parsing.html#parse-state, + * https://html.spec.whatwg.org/multipage/parsing.html#tokenization) + * may apply at different parts of the HTML input. 
Currently, there is + * no (documented) way to create HTML fragments with the necessary + * context with the new DOM API. So, for the time being, we must restrict + * handling of fragments to such inputs that can equally be considered to be + * placed directly after the `<body>` tag. + */ + $document = HTMLDocument::createFromString('', overrideEncoding: 'utf-8'); + /** @var \Dom\HTMLElement $container */ + $container = $document->body; + $container->innerHTML = $html5Fragment; $this->applyHandlers($document, $container); - /** @var string */ - $innerHTML = $container->innerHTML; - - return $this->cleanup($innerHTML); + return $this->cleanup($container->innerHTML); } private function applyHandlers(Document $document, Node $context): void diff --git a/src/TagRewriter.php b/src/TagRewriter.php index 17b8ee7..c5a2b4e 100644 --- a/src/TagRewriter.php +++ b/src/TagRewriter.php @@ -8,5 +8,5 @@ public function register(RewriteHandler $handler): void; public function process(string $html5): string; - public function processFragment(string $html5Fragment): string; + public function processBodyFragment(string $html5Fragment): string; } diff --git a/src/Test/TagRewriterTestCase.php b/src/Test/TagRewriterTestCase.php index 1b99daf..f3cf678 100644 --- a/src/Test/TagRewriterTestCase.php +++ b/src/Test/TagRewriterTestCase.php @@ -19,7 +19,7 @@ protected function setUp(): void public function assertRewriteResultEquals(string $expected, string $input): void { - $result = $this->rewriter->processFragment($input); + $result = $this->rewriter->processBodyFragment($input); $this->assertXmlStringEqualsXmlString($expected, $result); } diff --git a/tests/Implementation/Html5TagRewriterTest.php b/tests/Implementation/Html5TagRewriterTest.php index 3561c4e..d13edff 100644 --- a/tests/Implementation/Html5TagRewriterTest.php +++ b/tests/Implementation/Html5TagRewriterTest.php @@ -157,10 +157,15 @@ public static function providePreservedFragments(): iterable 'Just plain text', ]; - yield 'simple 
element' => [ + yield 'simple element from body (parsing in "data state")' => [ 'Hello
', ]; + // Results for processing content are undefined for now + // yield 'tags from head (parsing in "data state")' => [ + // '<script> & "quotes"
', '<script> & "quotes"
', ]; + + yield 'textarea uses RCDATA state' => [ + '', + '', + ]; + + // Results for processing content are undefined for now + // yield 'title tag from head uses RCDATA state' => [ + // 'test
more test
', + ]; + + yield 'empty body' => [ + '', + ]; + + yield 'head and body with content' => [ + 'test
', + ]; + + yield 'empty head and body' => [ + '', + ]; + } + + #[Test] + #[DataProvider('provideFragmentsWithBrokenHandling')] + public function processBodyFragment_currently_cannot_process_fragments_with_head_or_body_tags(string $input): void + { + $this->markTestSkipped('Parsing of fragments that are not part of the `body` is currently not supported.'); + } + #[Test] - public function processFragment_applies_handler(): void + public function processBodyFragment_applies_handler(): void { $handler = new TestRewriteHandler('//html:strong'); $handler->onMatch(function (Element $element) { @@ -222,17 +272,17 @@ public function processFragment_applies_handler(): void }); $this->rewriter->register($handler); - $result = $this->rewriter->processFragment('Text'); + $result = $this->rewriter->processBodyFragment('Text'); self::assertSame('Text', $result); } #[Test] - public function processFragment_matches_multiple_elements(): void + public function processBodyFragment_matches_multiple_elements(): void { $handler = new TestRewriteHandler('//html:p'); $this->rewriter->register($handler); - $this->rewriter->processFragment('First
Second
Third
'); + $this->rewriter->processBodyFragment('First
Second
Third
'); self::assertSame(3, $handler->matchCallCount); } @@ -246,7 +296,7 @@ public function register_allows_multiple_handlers(): void $this->rewriter->register($handler1); $this->rewriter->register($handler2); - $this->rewriter->processFragment('paragraph
paragraph
1
2
3
4
5
'); + $this->rewriter->processBodyFragment('1
2
3
4
5
'); self::assertSame(5, $handler->matchCallCount); self::assertCount(5, $handler->matchedElements); @@ -313,7 +363,7 @@ public function handler_after_matches_is_called_after_all_matches(): void }); $this->rewriter->register($handler); - $this->rewriter->processFragment('1
2
'); + $this->rewriter->processBodyFragment('1
2
'); self::assertSame(['match', 'match', 'afterMatches'], $callOrder); } @@ -331,7 +381,7 @@ public function handler_afterMatches_receives_document_and_xpath(): void }); $this->rewriter->register($handler); - $this->rewriter->processFragment('Test
'); + $this->rewriter->processBodyFragment('Test
'); self::assertNotNull($receivedDocument); self::assertNotNull($receivedXPath); @@ -343,7 +393,7 @@ public function handler_afterMatches_is_called_also_with_no_matches(): void $handler = new TestRewriteHandler('//html:nonexistent'); $this->rewriter->register($handler); - $this->rewriter->processFragment('No matching elements
'); + $this->rewriter->processBodyFragment('No matching elements
'); self::assertSame(0, $handler->matchCallCount); self::assertSame(1, $handler->afterMatchesCallCount); @@ -359,7 +409,7 @@ public function handler_can_modify_element_attributes(): void }); $this->rewriter->register($handler); - $result = $this->rewriter->processFragment('Link'); + $result = $this->rewriter->processBodyFragment('Link'); self::assertStringContainsString('rel="noopener"', $result); self::assertStringContainsString('target="_blank"', $result); @@ -374,7 +424,7 @@ public function handler_can_modify_element_content(): void }); $this->rewriter->register($handler); - $result = $this->rewriter->processFragment('Hello
'); + $result = $this->rewriter->processBodyFragment('Hello
'); self::assertStringContainsString('Hello', $result); } @@ -388,7 +438,7 @@ public function handler_can_remove_elements(): void }); $this->rewriter->register($handler); - $result = $this->rewriter->processFragment('Text
More
'); + $result = $this->rewriter->processBodyFragment('Text
More
'); self::assertStringNotContainsString('