Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 11 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,19 @@ $result = $rewriter->process($html);

// Process an HTML fragment
$fragment = '<p>Hello <strong>World</strong></p>';
$result = $rewriter->processFragment($fragment);
$result = $rewriter->processBodyFragment($fragment);
```

> [!NOTE]
> The `processBodyFragment()` method is currently limited in that it can only process
> HTML strings that come from within the `<body>` section. This is because the
> HTML 5 parsing rules define different [parsing states](https://html.spec.whatwg.org/multipage/parsing.html#parse-state),
> and the PHP DOM API for the HTML 5 parser does not currently expose
> a (documented) way to create fragments and pass the required context information.
> For correct results, you should limit its usage to fragments that are meant to be processed
> starting in the `in body` parsing state and while the `data state` [tokenization mode](https://html.spec.whatwg.org/multipage/parsing.html#tokenization)
> is active.

### Creating a Handler

Implement the `RewriteHandler` interface or extend `BaseRewriteHandler` to create custom tag transformations.
Expand Down
31 changes: 16 additions & 15 deletions src/Implementation/Html5TagRewriter.php
Original file line number Diff line number Diff line change
Expand Up @@ -35,25 +35,26 @@ public function process(string $html5): string
}

#[Override]
public function processFragment(string $html5Fragment): string
public function processBodyFragment(string $html5Fragment): string
{
$document = HTMLDocument::createEmpty();
$container = $document->createElement('container');
$document->appendChild($container);

$temp = $document->createElement('temp');
$temp->innerHTML = $html5Fragment;

while ($temp->firstChild instanceof Node) {
$container->appendChild($temp->firstChild);
}
/*
* Different parser states and tokenization modes
* (https://html.spec.whatwg.org/multipage/parsing.html#parse-state,
* https://html.spec.whatwg.org/multipage/parsing.html#tokenization)
* may apply at different parts of the HTML input. Currently, there is
* no (documented) way to create HTML fragments with the necessary
* context using the new DOM API. So, for the time being, we must restrict
* handling of fragments to such inputs that can equally be considered to be
* placed directly after the `<body>` tag.
*/
$document = HTMLDocument::createFromString('', overrideEncoding: 'utf-8');
/** @var \Dom\HTMLElement $container */
$container = $document->body;
$container->innerHTML = $html5Fragment;

$this->applyHandlers($document, $container);

/** @var string */
$innerHTML = $container->innerHTML;

return $this->cleanup($innerHTML);
return $this->cleanup($container->innerHTML);
}

private function applyHandlers(Document $document, Node $context): void
Expand Down
2 changes: 1 addition & 1 deletion src/TagRewriter.php
Original file line number Diff line number Diff line change
Expand Up @@ -8,5 +8,5 @@ public function register(RewriteHandler $handler): void;

public function process(string $html5): string;

public function processFragment(string $html5Fragment): string;
public function processBodyFragment(string $html5Fragment): string;
}
2 changes: 1 addition & 1 deletion src/Test/TagRewriterTestCase.php
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ protected function setUp(): void

public function assertRewriteResultEquals(string $expected, string $input): void
{
$result = $this->rewriter->processFragment($input);
$result = $this->rewriter->processBodyFragment($input);

$this->assertXmlStringEqualsXmlString($expected, $result);
}
Expand Down
96 changes: 73 additions & 23 deletions tests/Implementation/Html5TagRewriterTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -157,10 +157,15 @@ public static function providePreservedFragments(): iterable
'Just plain text',
];

yield 'simple element' => [
yield 'simple element from body (parsing in "data state")' => [
'<p>Hello</p>',
];

// Results for processing <head> content are undefined for now
// yield 'tags from head (parsing in "data state")' => [
// '<title>Test no tags here</title><meta name="description" content="test">',
// ];

yield 'mixed content' => [
'Text before <em>emphasized</em> text after',
];
Expand All @@ -184,9 +189,9 @@ public static function providePreservedFragments(): iterable

#[Test]
#[DataProvider('providePreservedFragments')]
public function processFragment_preserves_fragment(string $fragment): void
public function processBodyFragment_preserves_fragment(string $fragment): void
{
$result = $this->rewriter->processFragment($fragment);
$result = $this->rewriter->processBodyFragment($fragment);

self::assertSame($fragment, $result);
}
Expand All @@ -202,37 +207,82 @@ public static function provideFragmentsCleanedUp(): iterable
'<p>&lt;script&gt; &amp; &quot;quotes&quot;</p>',
'<p>&lt;script&gt; &amp; "quotes"</p>',
];

yield 'textarea uses RCDATA state' => [
'<textarea><h1>Some HTML</h1><p>textarea content</p></textarea>',
'<textarea>&lt;h1&gt;Some HTML&lt;/h1&gt;&lt;p&gt;textarea content&lt;/p&gt;</textarea>',
];

// Results for processing <head> content are undefined for now
// yield 'title tag from head uses RCDATA state' => [
// '<title>Test <h1> no tags here</title>',
// '<title>Test &lt;h1&gt; no tags here</title>',
// ];
}

#[Test]
#[DataProvider('provideFragmentsCleanedUp')]
public function processFragment_cleans_up_fragment(string $input, string $expected): void
public function processBodyFragment_cleans_up_fragment(string $input, string $expected): void
{
$result = $this->rewriter->processFragment($input);
$result = $this->rewriter->processBodyFragment($input);

self::assertSame($expected, $result);
}

/**
 * Supplies fragments that contain explicit `<head>` and/or `<body>` tags.
 *
 * Every dataset is keyed by a human-readable description and carries a
 * single argument: the HTML fragment string.
 *
 * @return iterable<string, array{string}>
 */
public static function provideFragmentsWithBrokenHandling(): iterable
{
    // One keyed array delegated via `yield from`; keys and order are preserved.
    yield from [
        'full head section' => [
            '<head><title>Test</title><meta name="description" content="test"></head>',
        ],
        'empty head section' => [
            '<head></head>',
        ],
        'body with content' => [
            '<body><p>test</p><p>more test</p></body>',
        ],
        'empty body' => [
            '<body></body>',
        ],
        'head and body with content' => [
            '<head><title>Test</title></head><body><p>test</p></body>',
        ],
        'empty head and body' => [
            '<head></head><body></body>',
        ],
    ];
}

/**
 * Placeholder for fragments that carry `<head>` or `<body>` tags.
 *
 * The test is skipped unconditionally for every dataset of
 * `provideFragmentsWithBrokenHandling()`; `$input` is accepted only so the
 * data provider can be attached and is not used.
 */
#[Test]
#[DataProvider('provideFragmentsWithBrokenHandling')]
public function processBodyFragment_currently_cannot_process_fragments_with_head_or_body_tags(string $input): void
{
    $reason = 'Parsing of fragments that are not part of the `body` is currently not supported.';

    self::markTestSkipped($reason);
}

#[Test]
public function processFragment_applies_handler(): void
public function processBodyFragment_applies_handler(): void
{
$handler = new TestRewriteHandler('//html:strong');
$handler->onMatch(function (Element $element) {
$element->setAttribute('class', 'bold');
});

$this->rewriter->register($handler);
$result = $this->rewriter->processFragment('<strong>Text</strong>');
$result = $this->rewriter->processBodyFragment('<strong>Text</strong>');

self::assertSame('<strong class="bold">Text</strong>', $result);
}

#[Test]
public function processFragment_matches_multiple_elements(): void
public function processBodyFragment_matches_multiple_elements(): void
{
$handler = new TestRewriteHandler('//html:p');
$this->rewriter->register($handler);
$this->rewriter->processFragment('<p>First</p><p>Second</p><p>Third</p>');
$this->rewriter->processBodyFragment('<p>First</p><p>Second</p><p>Third</p>');

self::assertSame(3, $handler->matchCallCount);
}
Expand All @@ -246,7 +296,7 @@ public function register_allows_multiple_handlers(): void
$this->rewriter->register($handler1);
$this->rewriter->register($handler2);

$this->rewriter->processFragment('<p>paragraph</p><div>div section</div>');
$this->rewriter->processBodyFragment('<p>paragraph</p><div>div section</div>');

self::assertSame(1, $handler1->matchCallCount);
self::assertSame(1, $handler2->matchCallCount);
Expand All @@ -258,7 +308,7 @@ public function handler_can_match_HTML_elements(): void
$handler = new TestRewriteHandler('//html:a');
$this->rewriter->register($handler);

$this->rewriter->processFragment('<a href="#">Link 1</a><a href="#">Link 2</a>');
$this->rewriter->processBodyFragment('<a href="#">Link 1</a><a href="#">Link 2</a>');

self::assertSame(2, $handler->matchCallCount);
}
Expand All @@ -270,7 +320,7 @@ public function handler_can_match_SVG_elements(): void
$this->rewriter->register($handler);

$svg = '<svg><circle cx="50" cy="50" r="40"/></svg>';
$this->rewriter->processFragment($svg);
$this->rewriter->processBodyFragment($svg);

self::assertSame(1, $handler->matchCallCount);
}
Expand All @@ -282,7 +332,7 @@ public function handler_can_match_MathML_elements(): void
$this->rewriter->register($handler);

$mathml = '<math><mrow><mi>x</mi></mrow></math>';
$this->rewriter->processFragment($mathml);
$this->rewriter->processBodyFragment($mathml);

self::assertSame(1, $handler->matchCallCount);
}
Expand All @@ -293,7 +343,7 @@ public function handler_match_is_called_for_each_matching_element(): void
$handler = new TestRewriteHandler('//html:p');
$this->rewriter->register($handler);

$this->rewriter->processFragment('<p>1</p><p>2</p><p>3</p><p>4</p><p>5</p>');
$this->rewriter->processBodyFragment('<p>1</p><p>2</p><p>3</p><p>4</p><p>5</p>');

self::assertSame(5, $handler->matchCallCount);
self::assertCount(5, $handler->matchedElements);
Expand All @@ -313,7 +363,7 @@ public function handler_after_matches_is_called_after_all_matches(): void
});

$this->rewriter->register($handler);
$this->rewriter->processFragment('<p>1</p><p>2</p>');
$this->rewriter->processBodyFragment('<p>1</p><p>2</p>');

self::assertSame(['match', 'match', 'afterMatches'], $callOrder);
}
Expand All @@ -331,7 +381,7 @@ public function handler_afterMatches_receives_document_and_xpath(): void
});

$this->rewriter->register($handler);
$this->rewriter->processFragment('<p>Test</p>');
$this->rewriter->processBodyFragment('<p>Test</p>');

self::assertNotNull($receivedDocument);
self::assertNotNull($receivedXPath);
Expand All @@ -343,7 +393,7 @@ public function handler_afterMatches_is_called_also_with_no_matches(): void
$handler = new TestRewriteHandler('//html:nonexistent');
$this->rewriter->register($handler);

$this->rewriter->processFragment('<p>No matching elements</p>');
$this->rewriter->processBodyFragment('<p>No matching elements</p>');

self::assertSame(0, $handler->matchCallCount);
self::assertSame(1, $handler->afterMatchesCallCount);
Expand All @@ -359,7 +409,7 @@ public function handler_can_modify_element_attributes(): void
});

$this->rewriter->register($handler);
$result = $this->rewriter->processFragment('<a href="/page">Link</a>');
$result = $this->rewriter->processBodyFragment('<a href="/page">Link</a>');

self::assertStringContainsString('rel="noopener"', $result);
self::assertStringContainsString('target="_blank"', $result);
Expand All @@ -374,7 +424,7 @@ public function handler_can_modify_element_content(): void
});

$this->rewriter->register($handler);
$result = $this->rewriter->processFragment('<p>Hello</p>');
$result = $this->rewriter->processBodyFragment('<p>Hello</p>');

self::assertStringContainsString('<strong>Hello</strong>', $result);
}
Expand All @@ -388,7 +438,7 @@ public function handler_can_remove_elements(): void
});

$this->rewriter->register($handler);
$result = $this->rewriter->processFragment('<p>Text</p><script>alert("evil")</script><p>More</p>');
$result = $this->rewriter->processBodyFragment('<p>Text</p><script>alert("evil")</script><p>More</p>');

self::assertStringNotContainsString('<script>', $result);
self::assertStringNotContainsString('alert', $result);
Expand All @@ -408,7 +458,7 @@ public function handler_can_add_new_elements(): void
});

$this->rewriter->register($handler);
$result = $this->rewriter->processFragment('<p>Original</p>');
$result = $this->rewriter->processBodyFragment('<p>Original</p>');

self::assertSame('<p>Original<span>[added]</span></p>', $result);
}
Expand All @@ -428,7 +478,7 @@ public function handler_can_collect_and_batch_process(): void
});

$this->rewriter->register($handler);
$this->rewriter->processFragment('<ul><li>A</li><li>B</li><li>C</li></ul>');
$this->rewriter->processBodyFragment('<ul><li>A</li><li>B</li><li>C</li></ul>');
}

#[Test]
Expand Down Expand Up @@ -459,7 +509,7 @@ public function handler_multiple_handlers_can_modify_same_element(): void
$this->rewriter->register($handler1);
$this->rewriter->register($handler2);

$result = $this->rewriter->processFragment('<a href="#">Test</a>');
$result = $this->rewriter->processBodyFragment('<a href="#">Test</a>');

self::assertStringContainsString('class="link external"', $result);
}
Expand Down