diff --git a/README.md b/README.md index 5ffd211..1638f5b 100644 --- a/README.md +++ b/README.md @@ -89,15 +89,25 @@ The following namespaces are pre-registered for XPath queries: ### ESI Tag Support -The library handles Edge Side Includes (ESI) tags, converting empty ESI tags to self-closing format: +The library preserves Edge Side Includes (ESI) tags verbatim during HTML5 processing. ESI tags present multiple challenges: -```php -// Input -'' +1. **Self-closing syntax**: Tags like `<esi:include />` don't exist in HTML5 +2. **Arbitrary interleaving**: ESI tags can span across HTML element boundaries +3. **Attribute encoding**: Characters like `&` must not become `&amp;` + +The [ESI Language Specification 1.0](https://www.w3.org/TR/esi-lang/) describes ESI as "XML-based" (Section 1), but also states that documents containing ESI markup are not valid. From Section 1.1: + +> the markup that is emitted by the origin server is not valid; it contains interposed elements from the ESI namespace + +ESI elements can be arbitrarily interleaved with the underlying content, which does not even need to be HTML. The standard makes no statements about whether HTML entities must be applied. Since XML parsing is not feasible for such documents, assuming XML encoding rules is not warranted. + +This library wraps every ESI tag (opening, closing, or self-closing) in an HTML comment using the ESI comment syntax defined in Section 3.7 of the ESI specification (`<!--esi ... -->`). This hides the tags from the HTML5 parser while preserving them verbatim. + +> [!IMPORTANT] +> During processing, ESI tags appear as Comment nodes in the DOM. If RewriteHandler +> transformations move or delete these comment nodes, the final result may not +> match expectations. 
-// Output -'' -``` ## Credits, Copyright and License This library is based on internal work that we have been using at webfactory GmbH, Bonn, at least diff --git a/src/Implementation/EsiTagProcessor.php b/src/Implementation/EsiTagProcessor.php new file mode 100644 index 0000000..01e67f2 --- /dev/null +++ b/src/Implementation/EsiTagProcessor.php @@ -0,0 +1,78 @@ + use self-closing syntax, + * which does not exist in HTML5. The parser treats them as opening tags without closing + * tags, causing all following content to be incorrectly nested inside the ESI element. + * + * 2. Arbitrary interleaving: ESI tags can span across HTML element boundaries in ways that + * violate well-formedness rules. For example, an opening ESI tag might appear in one + * HTML element while its closing tag appears in another. HTML5 parsers would "repair" + * such structures, breaking the intended ESI behavior. + * + * 3. Attribute preservation: ESI tags must not be modified because they may be processed + * on a text basis by an upstream component (e.g., a caching proxy or CDN) that does not + * apply HTML rules. Any transformation - such as encoding & as &amp; in attribute + * values - would break the ESI processor's ability to parse the tag correctly. + * + * This class solves these problems by wrapping every ESI tag (opening, closing, or + * self-closing) in an HTML comment during pre-processing, using the ESI comment syntax + * defined in Section 3.7 of the ESI Language Specification. The original tags are restored + * verbatim during post-processing. + * + * Important: During processing, ESI tags do not appear as Elements in the DOM, but as + * Comment nodes. If RewriteHandler transformations move or delete these comment nodes, + * the final result may not match expectations. + */ +final class EsiTagProcessor +{ + private const COMMENT_PREFIX = 'esi html5-tagrewriter '; + + /** + * Wraps all ESI tags in HTML comments. 
+ * + * Each ESI tag (opening, closing, or self-closing) is wrapped as + * to hide it from the HTML5 parser + * while preserving the original content verbatim. + */ + public function preProcess(string $html): string + { + // Match opening tags: + // Match closing tags: + // Match self-closing tags: + // Note: The [^>]*? pattern does not correctly handle ">" inside quoted attribute + // values (e.g., ). This is a known limitation that we + // ignore for now, as such attribute values are uncommon in practice. + return preg_replace_callback( + '#<(/?)esi:([a-z]+)([^>]*?)(/?)>#', + function (array $matches): string { + return ''; + }, + $html + ) ?? $html; + } + + /** + * Restores original ESI tags from HTML comments. + */ + public function postProcess(string $html): string + { + $prefix = preg_quote(self::COMMENT_PREFIX, '#'); + + return preg_replace_callback( + '##', + function (array $matches): string { + return $matches[1]; + }, + $html + ) ?? $html; + } +} diff --git a/src/Implementation/Html5TagRewriter.php b/src/Implementation/Html5TagRewriter.php index f2829b9..05501cd 100644 --- a/src/Implementation/Html5TagRewriter.php +++ b/src/Implementation/Html5TagRewriter.php @@ -5,7 +5,6 @@ namespace Webfactory\Html5TagRewriter\Implementation; use Dom\Document; -use Dom\Element; use Dom\HTMLDocument; use Dom\Node; use Dom\XPath; @@ -27,11 +26,13 @@ public function register(RewriteHandler $handler): void #[Override] public function process(string $html5): string { - $document = HTMLDocument::createFromString($this->convertEsiSelfClosingTagsToEmptyElements($html5), LIBXML_NOERROR); + $esiProcessor = new EsiTagProcessor(); + + $document = HTMLDocument::createFromString($esiProcessor->preProcess($html5), LIBXML_NOERROR); $this->applyHandlers($document, $document); - return $this->convertEsiEmptyElementsToSelfClosingTags($document->saveHtml()); + return $esiProcessor->postProcess($document->saveHtml()); } #[Override] @@ -47,15 +48,17 @@ public function 
processBodyFragment(string $html5Fragment): string * handling of fragments to such inputs that can equally be considered to be * placed directly after the `` tag. */ + $esiProcessor = new EsiTagProcessor(); + $document = HTMLDocument::createFromString('', overrideEncoding: 'utf-8'); $container = $document->body; assert($container !== null); - $container->innerHTML = $this->convertEsiSelfClosingTagsToEmptyElements($html5Fragment); + $container->innerHTML = $esiProcessor->preProcess($html5Fragment); $this->applyHandlers($document, $container); - return $this->convertEsiEmptyElementsToSelfClosingTags($container->innerHTML); + return $esiProcessor->postProcess($container->innerHTML); } private function applyHandlers(Document $document, Node $context): void @@ -75,13 +78,4 @@ private function applyHandlers(Document $document, Node $context): void } } - private function convertEsiSelfClosingTagsToEmptyElements(string $html): string - { - return preg_replace('#(]*))/>#i', '$1>', $html) ?? $html; - } - - private function convertEsiEmptyElementsToSelfClosingTags(string $html): string - { - return preg_replace('#(]*))>#i', '$1 />', $html) ?? 
$html; - } } diff --git a/tests/Implementation/EsiTagProcessorTest.php b/tests/Implementation/EsiTagProcessorTest.php new file mode 100644 index 0000000..07827ad --- /dev/null +++ b/tests/Implementation/EsiTagProcessorTest.php @@ -0,0 +1,91 @@ +processor = new EsiTagProcessor(); + } + + #[Test] + #[DataProvider('providePreProcessCases')] + public function preProcess(string $input, string $expected): void + { + $result = $this->processor->preProcess($input); + + self::assertSame($expected, $result); + } + + public static function providePreProcessCases(): iterable + { + yield 'wraps self-closing tag in comment' => [ + '', + '', + ]; + + yield 'wraps opening tag in comment' => [ + '', + '', + ]; + + yield 'wraps closing tag in comment' => [ + '', + '', + ]; + + yield 'wraps opening and closing tags in separate comments' => [ + 'content', + 'content', + ]; + + yield 'handles multiple tags' => [ + '', + '', + ]; + + yield 'preserves non-esi content' => [ + '

Hello

World
', + '

Hello

World
', + ]; + + yield 'handles esi tags spanning html element boundaries' => [ + '

Start content

more end

', + '

Start content

more end

', + ]; + } + + #[Test] + #[DataProvider('provideRoundtripCases')] + public function roundtrip(string $html): void + { + $preProcessed = $this->processor->preProcess($html); + $result = $this->processor->postProcess($preProcessed); + + self::assertSame($html, $result); + } + + public static function provideRoundtripCases(): iterable + { + yield 'self-closing tag without attributes' => ['']; + yield 'self-closing tag with attribute' => ['']; + yield 'self-closing tag with multiple attributes' => ['']; + yield 'self-closing tag with ampersand in query string' => ['']; + yield 'multiple self-closing tags' => ['']; + yield 'opening and closing tags' => ['content']; + yield 'nested esi structure' => ['']; + yield 'esi tags spanning html boundaries' => ['

Start content

more end

']; + yield 'esi wrapping partial html' => ['

Important:text

']; + yield 'mixed esi and html content' => ['

Content

']; + } +} diff --git a/tests/Implementation/Html5TagRewriterTest.php b/tests/Implementation/Html5TagRewriterTest.php index d5c368d..fedbc6e 100644 --- a/tests/Implementation/Html5TagRewriterTest.php +++ b/tests/Implementation/Html5TagRewriterTest.php @@ -184,16 +184,16 @@ public static function providePreservedFragments(): iterable "
  Line 1\n  Line 2\n  Line 3
", ]; - yield 'ESI tag' => [ - '', + yield 'ESI tag kept literally, since it may be processed as raw text, not under HTML rules' => [ + '', ]; - yield 'ESI tags in context' => [ + yield 'ESI multiple ESI tags with context' => [ '

test


- +
- +
' ]; } @@ -209,11 +209,6 @@ public function processBodyFragment_preserves_fragment(string $fragment): void public static function provideFragmentsCleanedUp(): iterable { - yield 'empty ESI include tag' => [ - '', - '', - ]; - yield 'qouted entities are replaced' => [ '

<script> & "quotes"

', '

<script> & "quotes"

',