Skip to content

Commit ae0ea29

Browse files
committed
Introduce “UnforgivingHtml” parser.
Goals of the parser: * Tighten control over things like double-quotes & closing tags. * Improve error messaging for malformed markup. * Improve performance. Closes #239.
1 parent aaefbba commit ae0ea29

File tree

5 files changed

+1697
-521
lines changed

5 files changed

+1697
-521
lines changed

CHANGELOG.md

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,40 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
66

77
## [Unreleased]
88

9+
### Changed
10+
11+
- The current “forgiving” html parser is replaced by a more “unforgiving” html
12+
parser in the default template engine (#239).
13+
14+
### Deprecated
15+
16+
- The `<svg>` element and the `svg` tagged template function are deprecated and
17+
will be removed in future versions. The spec and conventions for `svg`
18+
differ a lot from `html` and a faster / more-maintainable parser can be built
19+
if we stop supporting this (#236).
20+
- The `<style>` tag is deprecated and will be removed in future versions.
21+
Authors should prefer to declare a separate stylesheet in a `.css` file now
22+
that “import attributes” are supported in modern browsers (#237).
23+
24+
### Removed
25+
26+
- Support for the `<math>` element is removed from the default template engine.
27+
This worked before because `innerHTML` was being used under-the-hood. But a
28+
strict allow-list is now used to accomplish parsing (#238).
29+
- Support for `on*` event handlers bound to either attributes or properties is
30+
removed. Authors should prefer to add event listeners via “addEventListener”.
31+
This was implicitly supported previously, but is now deemed invalid with the
32+
move to a more “unforgiving” parser (#240).
33+
- Support for CDATA sections is removed. Authors should prefer to use character
34+
references (html entities) instead. Previously, this was implicitly supported
35+
due to underlying usage of `innerHTML`, but is now strictly forbidden (#241).
36+
- Reject JS-y unicode escapes in html template strings. E.g., you cannot write
37+
something like `this\u2026` — instead, you would have to write something like
38+
`this&hellip;`, or `this&#x2026;`, etc. This mirrors the html spec (#242).
39+
- Restrict element tags to an allow-list of what we’re willing to parse in the
40+
default template engine. This causes us to reject elements like `<title>`,
41+
`<body>`, `<link>`, `<script>`, `<canvas>`, `<meta>`, etc. (#239).
42+
943
## [1.1.2] - 2024-12-16
1044

1145
### Added

test/forgiving.js

Lines changed: 275 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,275 @@
1+
// This is just kept here as an example alternative to our more “unforgiving”
// parsing solution. In particular, it could be interesting to try and keep the
// interfaces to both “forgiving” and “unforgiving” as similar as possible to
// enable us to show performance-testing deltas in the future.

/**
 * Forgiving HTML parser which leverages innerHTML.
 *
 * The tagged-template “strings” array is scanned with a handful of regular
 * expressions to work out what kind of binding each interpolation implies.
 * Bound attributes / properties are stripped from the markup and replaced by
 * comment markers, the instrumented markup is instantiated via a `<template>`
 * element, and the resulting fragment is walked to report binding “paths”
 * (arrays of child indices) through the callbacks given to {@link parse}.
 */
export default class Forgiving {
  // Special markers added to markup enabling discovery post-instantiation.
  static #NEXT_MARKER = 'forgiving-next:'; // The ":" helps for debugging.
  static #CONTENT_MARKER = 'forgiving-content';

  // Types of bindings that we can have.
  static #ATTRIBUTE = 'attribute';
  static #BOOLEAN = 'boolean';
  static #DEFINED = 'defined';
  static #PROPERTY = 'property';

  // TODO: Could be more forgiving here!
  // Patterns to find special edges in original html strings.
  // Note that these are shared, stateful (“g” / “y”) expressions — “lastIndex”
  // is always assigned explicitly right before each use.
  static #OPEN_REGEX = /<[a-z][a-z0-9-]*(?=\s)/g;
  static #STEP_REGEX = /(?:\s+[a-z][a-z0-9-]*(?=[\s>])|\s+[a-z][a-zA-Z0-9-]*="[^"]*")+/y;
  static #ATTRIBUTE_OR_PROPERTY_REGEX = /\s+(?:(?<questions>\?{0,2})?(?<attribute>([a-z][a-zA-Z0-9-]*))|\.(?<property>[a-z][a-zA-Z0-9_]*))="$/y;
  static #CLOSE_REGEX = />/g;

  /**
   * Walk through one string from our tagged template function “strings” array
   * in a stateful way so that we know what kind of binding is implied at the
   * interpolated value which follows it.
   *
   * This is written as a loop (rather than recursively) so that a long run of
   * static markup cannot exhaust the call stack.
   *
   * @param {string} string - The (possibly pre-sliced) string to scan.
   * @param {{inside: boolean, index: number, lastOpenContext: number, lastOpenIndex: number}} state
   *   Mutated in place. “inside” tells whether scanning stopped within an
   *   opening tag, “index” is where scanning stopped, and the “lastOpen*”
   *   fields record which string / offset the most recent opening tag was
   *   found at.
   * @param {number} context - Index of “string” within the “strings” array.
   */
  static #exhaustString(string, state, context) {
    while (true) {
      if (!state.inside) {
        // We're outside the opening tag. Look for the next tag which is
        // followed by whitespace (i.e., one that may carry attributes).
        Forgiving.#OPEN_REGEX.lastIndex = state.index;
        const openMatch = Forgiving.#OPEN_REGEX.exec(string);
        if (!openMatch) {
          return;
        }
        state.inside = true;
        state.index = Forgiving.#OPEN_REGEX.lastIndex;
        state.lastOpenContext = context;
        state.lastOpenIndex = openMatch.index;
      } else {
        // We're inside the opening tag. Step over any static attributes.
        Forgiving.#STEP_REGEX.lastIndex = state.index;
        if (Forgiving.#STEP_REGEX.test(string)) {
          state.index = Forgiving.#STEP_REGEX.lastIndex;
        }
        Forgiving.#CLOSE_REGEX.lastIndex = state.index;
        if (!Forgiving.#CLOSE_REGEX.test(string)) {
          return;
        }
        state.inside = false;
        state.index = Forgiving.#CLOSE_REGEX.lastIndex;
      }
    }
  }

  /**
   * Return “strings[position]”, minus the closing quote of a just-bound
   * attribute or property when one is expected (i.e., “state.index” is “1”).
   *
   * @param {ArrayLike<string>} strings - Tagged template “strings” array.
   * @param {number} position - Index of the string to prepare.
   * @param {{index: number}} state - Parser state (only read here).
   * @returns {string} The string to continue scanning / emitting.
   * @throws {Error} If a quote is expected but the string does not start with
   *   one — e.g., html`<div foo="${value}suffix"></div>`. Blindly slicing in
   *   that case would silently emit malformed markup.
   */
  static #sliceClosingQuote(strings, position, state) {
    const string = strings[position];
    if (state.index === 0) {
      return string;
    }
    if (!string.startsWith('"')) {
      // It’s “on or after” because interpolated JS can span multiple lines.
      const lineCount = [...strings].slice(0, position).join('').split('\n').length;
      throw new Error(`Found invalid template on or after line ${lineCount} in substring \`${string}\`. Expected a closing double-quote directly after the interpolated value.`);
    }
    return string.slice(state.index);
  }

  /**
   * Flesh out an html string from our tagged template function “strings” array
   * and add special markers that we can detect later, after instantiation.
   *
   * E.g., the user might have passed this interpolation:
   *
   *     <div id="foo-bar-baz" foo="${foo}" bar="${bar}" .baz="${baz}">
   *       ${content}
   *     </div>
   *
   * … and we would instrument it as follows:
   *
   *     <!--forgiving-next:attribute=foo,attribute=bar,property=baz--><div id="foo-bar-baz">
   *       <!--forgiving-content-->
   *     </div>
   *
   * @param {string} language - Either “Forgiving.html” or “Forgiving.svg”.
   * @param {ArrayLike<string>} strings - Tagged template “strings” array.
   * @returns {string} Instrumented markup (wrapped in an `<svg>` for svg).
   * @throws {Error} If an interpolation inside an opening tag is not a whole,
   *   double-quoted attribute or property value.
   */
  static #createHtml(language, strings) {
    const keyToKeyState = new Map();
    const htmlStrings = [];
    const state = { inside: false, index: 0, lastOpenContext: 0, lastOpenIndex: 0 };
    // We don’t have to test the last string since it is already on the other
    // side of the last interpolation, by definition. Hence the “- 1” below.
    // Note that this final string is added just after the loop completes.
    for (let iii = 0; iii < strings.length - 1; iii++) {
      // The index may be set to “1” here, which indicates we are slicing off a
      // trailing quote character from a attribute-or-property match. After
      // slicing, we reset the index to zero so regular expressions know to
      // match from the start in “exhaustString”.
      let string = Forgiving.#sliceClosingQuote(strings, iii, state);
      state.index = 0;
      Forgiving.#exhaustString(string, state, iii);
      if (state.inside) {
        Forgiving.#ATTRIBUTE_OR_PROPERTY_REGEX.lastIndex = state.index;
        const match = Forgiving.#ATTRIBUTE_OR_PROPERTY_REGEX.exec(string);
        if (match) {
          const { questions, attribute, property } = match.groups;
          if (attribute) {
            // We found a match like this: html`<div hidden="${value}"></div>`.
            //                  … or this: html`<div ?hidden="${value}"></div>`.
            //                  … or this: html`<div ??hidden="${value}"></div>`.
            // Syntax is 3-5 characters: one leading whitespace character, the
            // optional question marks, and the trailing `="`.
            let syntax = 3;
            let kind = Forgiving.#ATTRIBUTE;
            switch (questions) {
              case '??': kind = Forgiving.#DEFINED; syntax = 5; break;
              case '?': kind = Forgiving.#BOOLEAN; syntax = 4; break;
            }
            string = string.slice(0, -syntax - attribute.length);
            const key = state.lastOpenContext;
            const keyState = Forgiving.#setIfMissing(keyToKeyState, key, () => ({ index: state.lastOpenIndex, items: [] }));
            keyState.items.push(`${kind}=${attribute}`);
          } else {
            // We found a match like this: html`<div .title="${value}"></div>`.
            // Syntax is 4 characters: one leading whitespace character, the
            // `.` prefix, and the trailing `="`.
            const syntax = 4;
            const kind = Forgiving.#PROPERTY;
            string = string.slice(0, -syntax - property.length);
            const key = state.lastOpenContext;
            const keyState = Forgiving.#setIfMissing(keyToKeyState, key, () => ({ index: state.lastOpenIndex, items: [] }));
            keyState.items.push(`${kind}=${property}`);
          }
          state.index = 1; // Accounts for an expected quote character next.
        } else {
          // It’s “on or after” because interpolated JS can span multiple lines.
          const handled = [...strings.slice(0, iii), string.slice(0, state.index)].join('');
          const lineCount = handled.split('\n').length;
          throw new Error(`Found invalid template on or after line ${lineCount} in substring \`${string}\`. Failed to parse \`${string.slice(state.index)}\`.`);
        }
      } else {
        // Assume it’s a match like this: html`<div>${value}</div>`.
        string += `<!--${Forgiving.#CONTENT_MARKER}-->`;
        state.index = 0; // No characters to account for. Reset to zero.
      }
      htmlStrings[iii] = string;
    }
    // Again, there might be a quote we need to slice off here still.
    htmlStrings.push(Forgiving.#sliceClosingQuote(strings, strings.length - 1, state));
    // Prefix each element that has bindings with a single “next” marker. The
    // key is the index of the string in which that element’s tag was opened.
    for (const [iii, { index, items }] of keyToKeyState.entries()) {
      const comment = `<!--${Forgiving.#NEXT_MARKER}${items.join(',')}-->`;
      const htmlString = htmlStrings[iii];
      htmlStrings[iii] = `${htmlString.slice(0, index)}${comment}${htmlString.slice(index)}`;
    }
    const html = htmlStrings.join('');
    return language === Forgiving.svg
      ? `<svg xmlns="http://www.w3.org/2000/svg">${html}</svg>`
      : html;
  }

  /**
   * Instantiate the instrumented markup through a `<template>` element.
   *
   * @param {string} language - Either “Forgiving.html” or “Forgiving.svg”.
   * @param {ArrayLike<string>} strings - Tagged template “strings” array.
   * @returns {DocumentFragment} The template’s content.
   */
  static #createFragment(language, strings) {
    const template = document.createElement('template');
    const html = Forgiving.#createHtml(language, strings);
    template.innerHTML = html;
    return template.content;
  }

  /**
   * Walk through our fragment that we added special markers to and notify
   * integrator when we hit target “paths”. The integrator can use this with
   * a subsequent clone of the fragment to establish “targets”. And, while we
   * walk, clean up our bespoke markers.
   * Note that we are always walking the interpolated strings and the resulting,
   * instantiated DOM _in the same depth-first manner_. This means that the
   * ordering is fairly reliable.
   *
   * For example, we walk this structure:
   *
   *     <!--forgiving-next:attribute=foo,attribute=bar,property=baz--><div id="foo-bar-baz">
   *       <!--forgiving-content-->
   *     </div>
   *
   * And end up with this (which is ready to be injected into a container):
   *
   *     <div id="foo-bar-baz">
   *       <!---->
   *       <!---->
   *     </div>
   *
   * @param {(name: string, path: number[]) => void} onBoolean
   * @param {(name: string, path: number[]) => void} onDefined
   * @param {(name: string, path: number[]) => void} onAttribute
   * @param {(name: string, path: number[]) => void} onProperty
   * @param {(path: number[]) => void} onContent - Given the path to the
   *   second (closing) comment of the start / end comment pair.
   * @param {(path: number[]) => void} onText - Given the path to a
   *   `<textarea>` / `<title>` whose whole content was an interpolation.
   * @param {Node} node - Fragment or element to walk.
   * @param {number} [nodeType] - The “nodeType” of “node”.
   * @param {number[]} [path] - Child indices leading from the root to “node”.
   * @throws {Error} On interpolation within `<style>` / `<script>`, or on
   *   anything but whole-content interpolation in `<textarea>` / `<title>`.
   */
  static #walkFragment(
    onBoolean,
    onDefined,
    onAttribute,
    onProperty,
    onContent,
    onText,
    node,
    nodeType = Node.DOCUMENT_FRAGMENT_NODE,
    path = [],
  ) {
    // @ts-ignore — TypeScript doesn’t seem to understand the nodeType param.
    if (nodeType === Node.ELEMENT_NODE) {
      // Special case to handle elements which only allow text content (no comments).
      const { localName } = node;
      if (
        (localName === 'style' || localName === 'script') &&
        node.textContent.includes(Forgiving.#CONTENT_MARKER)
      ) {
        throw new Error(`Interpolation of <${localName}> tags is not allowed.`);
      } else if (localName === 'textarea' || localName === 'title') {
        if (node.textContent.includes(Forgiving.#CONTENT_MARKER)) {
          if (node.textContent === `<!--${Forgiving.#CONTENT_MARKER}-->`) {
            node.textContent = '';
            onText(path);
          } else {
            throw new Error(`Only basic interpolation of <${localName}> tags is allowed.`);
          }
        }
      }
    }
    if (nodeType === Node.DOCUMENT_FRAGMENT_NODE || nodeType === Node.ELEMENT_NODE) {
      // It’s expensive to make a copy of “childNodes”. Instead, we carefully
      // manage our index as we iterate over the live collection.
      const childNodes = node.childNodes;
      for (let iii = 0; iii < childNodes.length; iii++) {
        const childNode = childNodes[iii];
        const childNodeType = childNode.nodeType;
        if (childNodeType === Node.COMMENT_NODE) {
          const textContent = childNode.textContent;
          if (textContent.startsWith(Forgiving.#CONTENT_MARKER)) {
            // Turn the marker into an empty “end” comment and insert an empty
            // “start” comment before it. The increment skips the inserted node
            // so that “iii” points at the end comment again.
            childNode.textContent = '';
            const startNode = document.createComment('');
            node.insertBefore(startNode, childNode);
            iii++;
            onContent([...path, iii]);
          } else if (textContent.startsWith(Forgiving.#NEXT_MARKER)) {
            // The marker is about to be removed, so its current index is the
            // index its next sibling (the bound element) will end up at.
            const data = textContent.slice(Forgiving.#NEXT_MARKER.length);
            const items = data.split(',');
            for (const item of items) {
              const [binding, name] = item.split('=');
              switch (binding) {
                case Forgiving.#ATTRIBUTE: onAttribute(name, [...path, iii]); break;
                case Forgiving.#BOOLEAN: onBoolean(name, [...path, iii]); break;
                case Forgiving.#DEFINED: onDefined(name, [...path, iii]); break;
                case Forgiving.#PROPERTY: onProperty(name, [...path, iii]); break;
              }
            }
            // Decrement so the loop revisits this index after the removal.
            iii--;
            node.removeChild(childNode);
          }
        } else if (childNodeType === Node.ELEMENT_NODE) {
          Forgiving.#walkFragment(
            onBoolean,
            onDefined,
            onAttribute,
            onProperty,
            onContent,
            onText,
            childNode,
            childNodeType,
            [...path, iii],
          );
        }
      }
    }
  }

  // TODO: Replace with Map.prototype.getOrInsert when TC39 proposal lands.
  //  https://github.com/tc39/proposal-upsert
  /**
   * Get the value stored at “key”, first creating it via “callback” if needed.
   *
   * @param {Map<any, any>} map
   * @param {any} key
   * @param {() => any} callback - Produces the (truthy) value to store.
   * @returns {any} The existing or newly-created value.
   */
  static #setIfMissing(map, key, callback) {
    // Values set in this file are ALL truthy, so "get" is used (versus "has").
    let value = map.get(key);
    if (!value) {
      value = callback();
      map.set(key, value);
    }
    return value;
  }

  // Languages.
  static html = 'html';
  static svg = 'svg';

  /**
   * Parse a tagged template “strings” array into a document fragment and
   * report the location of every binding through the given callbacks.
   *
   * @param {ArrayLike<string>} strings - Tagged template “strings” array.
   * @param {(name: string, path: number[]) => void} onBoolean - `?name="${…}"`.
   * @param {(name: string, path: number[]) => void} onDefined - `??name="${…}"`.
   * @param {(name: string, path: number[]) => void} onAttribute - `name="${…}"`.
   * @param {(name: string, path: number[]) => void} onProperty - `.name="${…}"`.
   * @param {(path: number[]) => void} onContent - `<div>${…}</div>`.
   * @param {(path: number[]) => void} onText - `<textarea>${…}</textarea>`.
   * @param {string} language - Either “Forgiving.html” or “Forgiving.svg”.
   * @returns {DocumentFragment} Fragment with all bespoke markers cleaned up.
   * @throws {Error} If the template cannot be parsed.
   */
  static parse(strings, onBoolean, onDefined, onAttribute, onProperty, onContent, onText, language) {
    const fragment = Forgiving.#createFragment(language, strings);
    Forgiving.#walkFragment(onBoolean, onDefined, onAttribute, onProperty, onContent, onText, fragment);
    return fragment;
  }
}

0 commit comments

Comments
 (0)