Skip to content
This repository has been archived by the owner on Nov 1, 2024. It is now read-only.

Commit

Permalink
Fix spans generated for HTML with higher-plane unicode characters (#109)
Browse files Browse the repository at this point in the history
  • Loading branch information
cvolzke4 authored and nshahan committed Sep 19, 2019
1 parent d37f588 commit 2b392a4
Show file tree
Hide file tree
Showing 6 changed files with 103 additions and 33 deletions.
2 changes: 1 addition & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ language: dart

dart:
- dev
- 2.0.0
- 2.3.0

dart_task:
- test: -p vm
Expand Down
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
## 0.14.0+3

- Fix spans generated for HTML with higher-plane unicode characters (e.g. emojis)

## 0.14.0+2

- Support `package:css` `>=0.13.2 <0.17.0`.
Expand Down
57 changes: 42 additions & 15 deletions lib/src/html_input_stream.dart
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ class HtmlInputStream {
List<int> _rawBytes;

/// Raw UTF-16 codes, used if a Dart String is passed in.
Iterable<int> _rawChars;
List<int> _rawChars;

Queue<String> errors;

Expand Down Expand Up @@ -66,7 +66,7 @@ class HtmlInputStream {
this.sourceUrl])
: charEncodingName = codecName(encoding) {
if (source is String) {
_rawChars = source.runes.toList();
_rawChars = source.codeUnits;
charEncodingName = 'utf-8';
charEncodingCertain = true;
} else if (source is List<int>) {
Expand Down Expand Up @@ -96,17 +96,27 @@ class HtmlInputStream {
}

bool skipNewline = false;
for (var c in _rawChars) {
bool wasSurrogatePair = false;
for (int i = 0; i < _rawChars.length; i++) {
int c = _rawChars[i];
if (skipNewline) {
skipNewline = false;
if (c == NEWLINE) continue;
}

if (_invalidUnicode(c)) errors.add('invalid-codepoint');
final isSurrogatePair = _isSurrogatePair(_rawChars, i);
if (!isSurrogatePair && !wasSurrogatePair) {
if (_invalidUnicode(c)) {
errors.add('invalid-codepoint');

if (0xD800 <= c && c <= 0xDFFF) {
c = 0xFFFD;
} else if (c == RETURN) {
if (0xD800 <= c && c <= 0xDFFF) {
c = 0xFFFD;
}
}
}
wasSurrogatePair = isSurrogatePair;

if (c == RETURN) {
skipNewline = true;
c = NEWLINE;
}
Expand Down Expand Up @@ -203,21 +213,38 @@ class HtmlInputStream {
/// EOF when EOF is reached.
String char() {
if (_offset >= _chars.length) return eof;
return String.fromCharCodes([_chars[_offset++]]);
return _isSurrogatePair(_chars, _offset)
? String.fromCharCodes([_chars[_offset++], _chars[_offset++]])
: String.fromCharCodes([_chars[_offset++]]);
}

String peekChar() {
if (_offset >= _chars.length) return eof;
return String.fromCharCodes([_chars[_offset]]);
return _isSurrogatePair(_chars, _offset)
? String.fromCharCodes([_chars[_offset], _chars[_offset + 1]])
: String.fromCharCodes([_chars[_offset]]);
}

// Reports whether the code units at positions `i` and `i + 1` of `chars`
// form a UTF-16 surrogate pair, i.e. a lead surrogate immediately followed
// by a trail surrogate. Returns false when `i` is the last index (or beyond),
// since there is no following unit to pair with.
bool _isSurrogatePair(List<int> chars, int i) {
  final next = i + 1;
  // A pair needs two code units; bail out before indexing past the end.
  if (next >= chars.length) return false;
  if (!_isLeadSurrogate(chars[i])) return false;
  return _isTrailSurrogate(chars[next]);
}

// Whether `code` (a 16-bit unsigned integer) is a UTF-16 lead surrogate,
// i.e. lies in the range 0xD800..0xDBFF.
bool _isLeadSurrogate(int code) {
  // Masking with 0xFC00 keeps bits 10-15, which are identical for every
  // lead surrogate.
  const prefixMask = 0xFC00;
  const leadPrefix = 0xD800;
  return (code & prefixMask) == leadPrefix;
}

// Whether `code` (a 16-bit unsigned integer) is a UTF-16 trail surrogate,
// i.e. lies in the range 0xDC00..0xDFFF.
bool _isTrailSurrogate(int code) {
  // Masking with 0xFC00 keeps bits 10-15, which are identical for every
  // trail surrogate.
  const prefixMask = 0xFC00;
  const trailPrefix = 0xDC00;
  return (code & prefixMask) == trailPrefix;
}

/// Returns a string of characters from the stream up to but not
/// including any character in 'characters' or EOF.
String charsUntil(String characters, [bool opposite = false]) {
int start = _offset;
String c;
while ((c = peekChar()) != null && characters.contains(c) == opposite) {
_offset++;
_offset += c.codeUnits.length;
}

return String.fromCharCodes(_chars.sublist(start, _offset));
Expand All @@ -227,7 +254,7 @@ class HtmlInputStream {
// Only one character is allowed to be ungotten at once - it must
// be consumed again before any further call to unget
if (ch != null) {
_offset--;
_offset -= ch.codeUnits.length;
assert(peekChar() == ch);
}
}
Expand Down Expand Up @@ -304,18 +331,18 @@ bool _hasUtf8Bom(List<int> bytes, [int offset = 0, int length]) {
bytes[offset + 2] == 0xBF;
}

/// Decodes the [bytes] with the provided [encoding] and returns an iterable for
/// Decodes the [bytes] with the provided [encoding] and returns a list for
/// the codepoints. Supports the major unicode encodings as well as ascii and
/// windows-1252 encodings.
Iterable<int> _decodeBytes(String encoding, List<int> bytes) {
List<int> _decodeBytes(String encoding, List<int> bytes) {
switch (encoding) {
case 'ascii':
return ascii.decode(bytes).runes;
return ascii.decode(bytes).codeUnits;

case 'utf-8':
// NOTE: To match the behavior of the other decode functions, we eat the
// UTF-8 BOM here. This is the default behavior of `utf8.decode`.
return utf8.decode(bytes).runes;
return utf8.decode(bytes).codeUnits;

default:
throw ArgumentError('Encoding $encoding not supported');
Expand Down
2 changes: 1 addition & 1 deletion pubspec.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ author: Dart Team <misc@dartlang.org>
homepage: https://github.com/dart-lang/html

environment:
sdk: '>=2.0.0 <3.0.0'
sdk: '>=2.3.0 <3.0.0'

dependencies:
csslib: '>=0.13.2 <0.17.0'
Expand Down
24 changes: 24 additions & 0 deletions test/data/tokenizer/unicodeCharsSurrogates.test
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
{"tests" : [
{"description": "Unicode surrogate (emoji)",
"input": "\uD83D\uDC3C",
"output":[["Character", "\uD83D\uDC3C"]]},

{"description": "Unicode surrogate (emoji) prefixed by characters",
"input": "before\uD83D\uDC3C",
"output":[["Character", "before\uD83D\uDC3C"]]},

{"description": "Unicode surrogate (emoji) suffixed by characters",
"input": "\uD83D\uDC3Cafter",
"output":[["Character", "\uD83D\uDC3Cafter"]]},

{"description":"Quoted attribute with surrogate unicode content",
"generateSpans": true,
"input":"<a href='\uD83D\uDC3C'/>",
"output":[["StartTag","a",{"href":"\uD83D\uDC3C"},true,0,14]]},

{"description":"Surrogate unicode content followed by attribute",
"generateSpans": true,
"input":"\uD83D\uDC3C<a href='b'/>",
"output":[["Character", "\uD83D\uDC3C", 0, 2],["StartTag","a",{"href":"b"},true,2,15]]}
]
}
47 changes: 31 additions & 16 deletions test/tokenizer_test.dart
Original file line number Diff line number Diff line change
Expand Up @@ -16,16 +16,20 @@ import 'support.dart';
class TokenizerTestParser {
final String _state;
final String _lastStartTag;
final bool _generateSpans;
List outputTokens;

TokenizerTestParser(String initialState, [String lastStartTag])
TokenizerTestParser(String initialState,
[String lastStartTag, bool generateSpans = false])
: _state = initialState,
_lastStartTag = lastStartTag;
_lastStartTag = lastStartTag,
_generateSpans = generateSpans;

List parse(String str) {
// Note: we need to pass bytes to the tokenizer if we want it to handle BOM.
var bytes = utf8.encode(str);
var tokenizer = HtmlTokenizer(bytes, encoding: 'utf-8');
var tokenizer =
HtmlTokenizer(bytes, encoding: 'utf-8', generateSpans: _generateSpans);
outputTokens = [];

// Note: we can't get a closure of the state method. However, we can
Expand Down Expand Up @@ -68,28 +72,29 @@ class TokenizerTestParser {
}

void processDoctype(DoctypeToken token) {
outputTokens.add(
addOutputToken(token,
["DOCTYPE", token.name, token.publicId, token.systemId, token.correct]);
}

void processStartTag(StartTagToken token) {
outputTokens.add(["StartTag", token.name, token.data, token.selfClosing]);
addOutputToken(
token, ["StartTag", token.name, token.data, token.selfClosing]);
}

void processEndTag(EndTagToken token) {
outputTokens.add(["EndTag", token.name, token.selfClosing]);
addOutputToken(token, ["EndTag", token.name, token.selfClosing]);
}

void processComment(StringToken token) {
outputTokens.add(["Comment", token.data]);
addOutputToken(token, ["Comment", token.data]);
}

void processSpaceCharacters(StringToken token) {
processCharacters(token);
}

void processCharacters(StringToken token) {
outputTokens.add(["Character", token.data]);
addOutputToken(token, ["Character", token.data]);
}

void processEOF(token) {}
Expand All @@ -98,7 +103,15 @@ class TokenizerTestParser {
// TODO(jmesserly): when debugging test failures it can be useful to add
// logging here like `print('ParseError $token');`. It would be nice to
// use the actual logging library.
outputTokens.add(["ParseError", token.data]);
addOutputToken(token, ["ParseError", token.data]);
}

// Records one tokenizer event in `outputTokens`.
//
// The stored entry is a copy of `array`; when span generation is enabled and
// the token carries a span, the span's start and end offsets are appended
// (in that order) after the token's own fields.
void addOutputToken(Token token, List array) {
  final entry = [...array];
  if (token.span != null && _generateSpans) {
    entry.add(token.span.start.offset);
    entry.add(token.span.end.offset);
  }
  outputTokens.add(entry);
}
}

Expand Down Expand Up @@ -138,16 +151,18 @@ List normalizeTokens(List tokens) {
void expectTokensMatch(
List expectedTokens, List receivedTokens, bool ignoreErrorOrder,
[bool ignoreErrors = false, String message]) {
var checkSelfClosing = false;
// If the 'selfClosing' attribute is not included in the expected test tokens,
// remove it from the received token.
var removeSelfClosing = false;
for (var token in expectedTokens) {
if (token[0] == "StartTag" && token.length == 4 ||
token[0] == "EndTag" && token.length == 3) {
checkSelfClosing = true;
if (token[0] == "StartTag" && token.length == 3 ||
token[0] == "EndTag" && token.length == 2) {
removeSelfClosing = true;
break;
}
}

if (!checkSelfClosing) {
if (removeSelfClosing) {
for (var token in receivedTokens) {
if (token[0] == "StartTag" || token[0] == "EndTag") {
token.removeLast();
Expand Down Expand Up @@ -182,8 +197,8 @@ void runTokenizerTest(Map testInfo) {
if (!testInfo.containsKey('lastStartTag')) {
testInfo['lastStartTag'] = null;
}
var parser =
TokenizerTestParser(testInfo['initialState'], testInfo['lastStartTag']);
var parser = TokenizerTestParser(testInfo['initialState'],
testInfo['lastStartTag'], testInfo['generateSpans'] ?? false);
var tokens = parser.parse(testInfo['input']);
tokens = concatenateCharacterTokens(tokens);
var received = normalizeTokens(tokens);
Expand Down

0 comments on commit 2b392a4

Please sign in to comment.