Skip to content

Commit

Permalink
Check HTML comments according to current parsing rules
Browse files Browse the repository at this point in the history
  • Loading branch information
kohler committed Oct 23, 2024
1 parent f421bfa commit 1d09b4f
Show file tree
Hide file tree
Showing 2 changed files with 65 additions and 13 deletions.
65 changes: 52 additions & 13 deletions lib/cleanhtml.php
Original file line number Diff line number Diff line change
Expand Up @@ -245,6 +245,31 @@ private function check_text($curtf, $tagstack, $pos1, $pos2, $t) {
}
}

/** Check one HTML comment against the HTML comment parsing rules.
 *
 * On failure, appends the value returned by `$this->e()` to `$this->ml`
 * and returns false; at most one message is recorded per call.
 * @param string $comment comment text; expected to start with `<!--` and
 *    to extend to the first following `-->` or to the end of the input
 * @param int $pos offset of `$comment` within `$t`
 * @param string $t full text being checked
 * @return bool true if the comment is acceptable */
private function check_comment($comment, $pos, $t) {
    $len = strlen($comment);

    // `<!-->` and `<!--->` close the comment before it has really begun
    if (str_starts_with($comment, "<!-->")) {
        $this->ml[] = $this->e("<0>Incorrectly closed HTML comment", $pos, $pos + 5, $t);
        return false;
    }
    if (str_starts_with($comment, "<!--->")) {
        $this->ml[] = $this->e("<0>Incorrectly closed HTML comment", $pos, $pos + 6, $t);
        return false;
    }

    // Both searches start after the opening `<!--`
    $nestp = strpos($comment, "<!--", 4);
    $bangp = strpos($comment, "--!>", 4);

    // An HTML parser ends the comment at `--!>`, so a `<!--` that appears
    // only after it is not nested inside this comment. Report the earlier
    // problem rather than a misleading "nested" error.
    if ($bangp !== false && ($nestp === false || $bangp < $nestp)) {
        $this->ml[] = $this->e("<0>Incorrectly closed HTML comment", $pos, $pos + $bangp + 4, $t);
        return false;
    }

    // `<!--` inside a comment is tolerated only when exactly one more
    // character follows it (as in a comment ending with `<!-->`)
    if ($nestp !== false && $nestp + 5 !== $len) {
        $this->ml[] = $this->e("<0>HTML comments may not be nested", $pos + $nestp, $pos + $nestp + 4, $t);
        return false;
    }

    // NOTE(review): for input that starts with `<!--` this looks
    // unreachable (the checks above catch it first); kept as a backstop
    if (str_ends_with($comment, "<!--->")) {
        $this->ml[] = $this->e("<0>Incorrectly closed HTML comment", $pos, $pos + $len, $t);
        return false;
    }

    if (!str_ends_with($comment, "-->")) {
        $this->ml[] = $this->e("<0>Unclosed HTML comment", $pos, $pos + $len, $t);
        return false;
    }
    return true;
}

/** @param string $t
* @return string|false */
function clean($t) {
Expand All @@ -265,19 +290,33 @@ function clean($t) {
$this->check_text($curtf, $tagstack, $p, $nextp, $t);
}
$p = $nextp;
if (preg_match('/\G<!\[[ie]\w+/i', $t, $m, 0, $p)) {
$this->ml[] = $this->e("<0>Conditional HTML comments not allowed", $p, $p + strlen($m[0]), $t);
return false;
} else if (preg_match('/\G<!\[CDATA\[(.*?)(?:\]\]>|\z)/s', $t, $m, 0, $p)) {
$this->check_text($curtf, $tagstack, $p, $p + strlen($m[0]), $t);
$x .= substr($t, $xp, $p - $xp) . htmlspecialchars($m[1]);
$p = $xp = $p + strlen($m[0]);
} else if (preg_match('/\G<!--.*?(?:-->|\z)\z/s', $t, $m, 0, $p)) {
$x .= substr($t, $xp, $p - $xp);
$p = $xp = $p + strlen($m[0]);
} else if (preg_match('/\G<!(\S+)/s', $t, $m, 0, $p)) {
$this->ml[] = $this->e("<0>HTML and XML declarations not allowed", $p, $p + strlen($m[0]), $t);
return false;
if ($p + 1 < $len && $t[$p + 1] === "!") {
if (preg_match('/\G<!\[CDATA\[(.*?)(\]\]>|\z)/s', $t, $m, 0, $p)) {
if ($m[2] === "") {
$this->ml[] = $this->e("<0>Unclosed CDATA section", $p, $p + strlen($m[0]), $t);
return false;
}
$this->check_text($curtf, $tagstack, $p, $p + strlen($m[0]), $t);
$x .= substr($t, $xp, $p - $xp) . htmlspecialchars($m[1]);
$p = $xp = $p + strlen($m[0]);
} else if (preg_match('/\G<!--.*?(-->|\z)/s', $t, $m, 0, $p)) {
if (!$this->check_comment($m[0], $p, $t)) {
return false;
}
$x .= substr($t, $xp, $p - $xp);
$p = $xp = $p + strlen($m[0]);
} else {
preg_match('/\G<!\s*(\S+)/s', $t, $m, 0, $p);
if (str_starts_with(strtolower($m[1]), "doctype")) {
$this->ml[] = $this->e("<0>HTML DOCTYPE declarations not allowed", $p, $p + strlen($m[0]), $t);
} else if (str_starts_with(strtolower($m[1]), "[i")
|| str_starts_with(strtolower($m[1]), "[e")) {
$this->ml[] = $this->e("<0>Conditional HTML comments not allowed", $p, $p + strlen($m[0]), $t);
} else {
$this->ml[] = $this->e("<0>Incorrectly opened HTML comment", $p, $p + strlen($m[0]), $t);
}
return false;
}
} else if (preg_match('/\G<(\s*+)([A-Za-z][-A-Za-z0-9]*+)(?=[\s\/>])(\s*+)(?:[^<>\'"]+|\'[^\']*\'|"[^"]*")*+>?/s', $t, $m, 0, $p)) {
$tag = strtolower($m[2]);
$tagp = $p;
Expand Down
13 changes: 13 additions & 0 deletions test/t_unit.php
Original file line number Diff line number Diff line change
Expand Up @@ -1105,6 +1105,19 @@ function test_clean_html() {
xassert_eqq($chtml->clean("<i><![CDATA[<alert>]]></i>"), "<i>&lt;alert&gt;</i>");
}

/** Check how `CleanHTML::clean()` treats HTML comments: the malformed
 * inputs below must yield false, and the accepted comments must be
 * removed from the output. */
function test_clean_html_comments() {
    $cleaner = CleanHTML::basic();

    // comment openers that are closed too early, a conditional comment,
    // and a comment that never closes are all rejected
    xassert_eqq($cleaner->clean('<!-->'), false);
    xassert_eqq($cleaner->clean('<![ie foo>'), false);
    xassert_eqq($cleaner->clean('<!--->'), false);
    xassert_eqq($cleaner->clean('<!---'), false);

    // an empty comment, and a comment whose text ends with `<!`,
    // are accepted and dropped
    xassert_eqq($cleaner->clean('<!---->'), "");
    xassert_eqq($cleaner->clean('<!--<!-->'), "");

    // `<!--` followed by another dash inside a comment is rejected
    xassert_eqq($cleaner->clean('<!--<!--->'), false);

    // only the comment is dropped; the text after it is kept
    xassert_eqq($cleaner->clean('<!--My favorite operators are > ad <!-->x'), "x");

    // `--!>` inside a comment is rejected
    xassert_eqq($cleaner->clean('<!----!>-->'), false);
}

function test_base48() {
for ($i = 0; $i !== 1000; ++$i) {
$n = mt_rand(0, 99);
Expand Down

0 comments on commit 1d09b4f

Please sign in to comment.