From 0340cadfb46a11f5aea498f68de2d0c57f2fb1ee Mon Sep 17 00:00:00 2001 From: Jonathan Tang Date: Sun, 9 Nov 2014 15:23:51 -0800 Subject: [PATCH 01/38] Add a token type for CDATA. --- src/parser.c | 2 +- src/token_type.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/src/parser.c b/src/parser.c index 004639dc..0888794b 100644 --- a/src/parser.c +++ b/src/parser.c @@ -345,7 +345,7 @@ typedef struct _TextNodeBufferState { // The source position of the start of this text node. GumboSourcePosition _start_position; - // The type of node that will be inserted (TEXT or WHITESPACE). + // The type of node that will be inserted (TEXT, CDATA, or WHITESPACE). GumboNodeType _type; } TextNodeBufferState; diff --git a/src/token_type.h b/src/token_type.h index 5874d1a2..eeab5078 100644 --- a/src/token_type.h +++ b/src/token_type.h @@ -29,6 +29,7 @@ typedef enum { GUMBO_TOKEN_COMMENT, GUMBO_TOKEN_WHITESPACE, GUMBO_TOKEN_CHARACTER, + GUMBO_TOKEN_CDATA, GUMBO_TOKEN_NULL, GUMBO_TOKEN_EOF } GumboTokenType; From f9a515f5ff0f32143e5599709d751a526f540b82 Mon Sep 17 00:00:00 2001 From: Jonathan Tang Date: Sun, 9 Nov 2014 15:29:48 -0800 Subject: [PATCH 02/38] Add a state flag for whether the tokenizer is in a cdata section, and set it as appropriate. --- src/tokenizer.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/src/tokenizer.c b/src/tokenizer.c index 9dde62cd..297299b3 100644 --- a/src/tokenizer.c +++ b/src/tokenizer.c @@ -136,6 +136,10 @@ typedef struct GumboInternalTokenizerState { // markup declaration state. bool _is_current_node_foreign; + // A flag indicating whether the tokenizer is in a CDATA section. If so, then + // text tokens emitted will be GUMBO_TOKEN_CDATA. + bool _is_in_cdata; + // Certain states (notably character references) may emit two character tokens // at once, but the contract for lex() fills in only one token at a time. The // extra character is buffered here, and then this is checked on entry to @@ -475,7 +479,11 @@ static void finish_doctype_system_id(GumboParser* parser) { // Writes a single specified character to the output token. static void emit_char(GumboParser* parser, int c, GumboToken* output) { - output->type = get_char_token_type(c); + if (parser->_tokenizer_state->_is_in_cdata) { + output->type = GUMBO_TOKEN_CDATA; + } else { + output->type = get_char_token_type(c); + } output->v.character = c; finish_token(parser, output); } @@ -850,6 +858,7 @@ void gumbo_tokenizer_state_init( gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA); tokenizer->_reconsume_current_input = false; tokenizer->_is_current_node_foreign = false; + tokenizer->_is_in_cdata = false; tokenizer->_tag_state._last_start_tag = GUMBO_TAG_LAST; tokenizer->_buffered_emit_char = kGumboNoChar; @@ -2041,6 +2050,7 @@ static StateResult handle_markup_declaration_state( utf8iterator_maybe_consume_match( &tokenizer->_input, "[CDATA[", sizeof("[CDATA[") - 1, true)) { gumbo_tokenizer_set_state(parser, GUMBO_LEX_CDATA); + tokenizer->_is_in_cdata = true; tokenizer->_reconsume_current_input = true; } else { tokenizer_add_parse_error(parser, GUMBO_ERR_DASHES_OR_DOCTYPE); @@ -2814,6 +2824,7 @@ static StateResult handle_cdata_state( tokenizer->_reconsume_current_input = true; reset_token_start_point(tokenizer); gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA); + tokenizer->_is_in_cdata = true; return NEXT_CHAR; } else { return emit_current_char(parser, output); From 58d5fadf2c2c27b5585f3c686095e6a76e504c45 Mon Sep 17 00:00:00 2001 From: Jonathan Tang Date: Sun, 9 Nov 2014 20:41:25 -0800 Subject: [PATCH 03/38] Add CDATA handling to parser, including a test for it. --- src/parser.c | 12 +++++++++--- src/tokenizer.c | 12 ++++++------ tests/parser.cc | 15 +++++++++++++++ 3 files changed, 30 insertions(+), 9 deletions(-) diff --git a/src/parser.c b/src/parser.c index 0888794b..b2c1ad8b 100644 --- a/src/parser.c +++ b/src/parser.c @@ -793,7 +793,8 @@ static void maybe_flush_text_node_buffer(GumboParser* parser) { } assert(buffer_state->_type == GUMBO_NODE_WHITESPACE || - buffer_state->_type == GUMBO_NODE_TEXT); + buffer_state->_type == GUMBO_NODE_TEXT || + buffer_state->_type == GUMBO_NODE_CDATA); GumboNode* text_node = create_node(parser, buffer_state->_type); GumboText* text_node_data = &text_node->v.text; text_node_data->text = gumbo_string_buffer_to_string( @@ -1019,7 +1020,8 @@ static GumboNode* insert_foreign_element( static void insert_text_token(GumboParser* parser, GumboToken* token) { assert(token->type == GUMBO_TOKEN_WHITESPACE || - token->type == GUMBO_TOKEN_CHARACTER); + token->type == GUMBO_TOKEN_CHARACTER || + token->type == GUMBO_TOKEN_CDATA); TextNodeBufferState* buffer_state = &parser->_parser_state->_text_node; if (buffer_state->_buffer.length == 0) { // Initialize position fields. @@ -1030,6 +1032,8 @@ static void insert_text_token(GumboParser* parser, GumboToken* token) { parser, token->v.character, &buffer_state->_buffer); if (token->type == GUMBO_TOKEN_CHARACTER) { buffer_state->_type = GUMBO_NODE_TEXT; + } else if (token->type == GUMBO_TOKEN_CDATA) { + buffer_state->_type = GUMBO_NODE_CDATA; } gumbo_debug("Inserting text token '%c'.\n", token->v.character); } @@ -2207,7 +2211,8 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) { reconstruct_active_formatting_elements(parser); insert_text_token(parser, token); return true; - } else if (token->type == GUMBO_TOKEN_CHARACTER) { + } else if (token->type == GUMBO_TOKEN_CHARACTER || + token->type == GUMBO_TOKEN_CDATA) { reconstruct_active_formatting_elements(parser); insert_text_token(parser, token); set_frameset_not_ok(parser); @@ -3492,6 +3497,7 @@ static bool handle_in_foreign_content(GumboParser* parser, GumboToken* token) { case GUMBO_TOKEN_WHITESPACE: insert_text_token(parser, token); return true; + case GUMBO_TOKEN_CDATA: case GUMBO_TOKEN_CHARACTER: insert_text_token(parser, token); set_frameset_not_ok(parser); diff --git a/src/tokenizer.c b/src/tokenizer.c index 297299b3..7a7ae3c0 100644 --- a/src/tokenizer.c +++ b/src/tokenizer.c @@ -319,7 +319,11 @@ static int ensure_lowercase(int c) { return c >= 'A' && c <= 'Z' ? c + 0x20 : c; } -static GumboTokenType get_char_token_type(int c) { +static GumboTokenType get_char_token_type(bool is_in_cdata, int c) { + if (is_in_cdata && c != -1) { + return GUMBO_TOKEN_CDATA; + } + switch (c) { case '\t': case '\n': @@ -479,11 +483,7 @@ static void finish_doctype_system_id(GumboParser* parser) { // Writes a single specified character to the output token. static void emit_char(GumboParser* parser, int c, GumboToken* output) { - if (parser->_tokenizer_state->_is_in_cdata) { - output->type = GUMBO_TOKEN_CDATA; - } else { - output->type = get_char_token_type(c); - } + output->type = get_char_token_type(parser->_tokenizer_state->_is_in_cdata, c); output->v.character = c; finish_token(parser, output); } diff --git a/tests/parser.cc b/tests/parser.cc index e565a248..c5877591 100644 --- a/tests/parser.cc +++ b/tests/parser.cc @@ -1522,6 +1522,21 @@ TEST_F(GumboParserTest, ImplicitlyCloseLists) { ASSERT_EQ(1, GetChildCount(li2)); } +TEST_F(GumboParserTest, CData) { + Parse("this is text"); + + GumboNode* body; + GetAndAssertBody(root_, &body); + ASSERT_EQ(1, GetChildCount(body)); + + GumboNode* svg = GetChild(body, 0); + ASSERT_EQ(1, GetChildCount(svg)); + + GumboNode* cdata = GetChild(svg, 0); + ASSERT_EQ(GUMBO_NODE_CDATA, cdata->type); + EXPECT_STREQ("this is text", cdata->v.text.text); +} + TEST_F(GumboParserTest, FormattingTagsInHeading) { Parse("

This is old

text"); From fa3a71d45a72b365ccc969b44608f594a59aa26b Mon Sep 17 00:00:00 2001 From: Jonathan Tang Date: Sun, 9 Nov 2014 20:48:32 -0800 Subject: [PATCH 04/38] Add test for CDATA sections not in foreign content. --- tests/parser.cc | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/tests/parser.cc b/tests/parser.cc index c5877591..1942734f 100644 --- a/tests/parser.cc +++ b/tests/parser.cc @@ -1537,6 +1537,21 @@ TEST_F(GumboParserTest, CData) { EXPECT_STREQ("this is text", cdata->v.text.text); } +TEST_F(GumboParserTest, CDataInBody) { + Parse("
"); + + GumboNode* body; + GetAndAssertBody(root_, &body); + ASSERT_EQ(1, GetChildCount(body)); + + GumboNode* div = GetChild(body, 0); + ASSERT_EQ(1, GetChildCount(div)); + + GumboNode* cdata = GetChild(div, 0); + ASSERT_EQ(GUMBO_NODE_COMMENT, cdata->type); + EXPECT_STREQ("[CDATA[this is text]]", cdata->v.text.text); +} + TEST_F(GumboParserTest, FormattingTagsInHeading) { Parse("

This is old

text"); From 2b804faff0e34176d995a2e79a95ae38984409d1 Mon Sep 17 00:00:00 2001 From: Jonathan Tang Date: Mon, 10 Nov 2014 13:09:38 -0800 Subject: [PATCH 05/38] Fix a couple comment issues (line-wrapping, unfinished comments) in utf8.c --- src/utf8.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/src/utf8.c b/src/utf8.c index a5c5b0e2..a6a30376 100644 --- a/src/utf8.c +++ b/src/utf8.c @@ -133,10 +133,10 @@ static void read_char(Utf8Iterator* iter) { decode(&state, &code_point, (uint32_t) (unsigned char) (*c)); if (state == UTF8_ACCEPT) { iter->_width = c - iter->_start + 1; - // This is the special handling for carriage returns that is mandated by the - // HTML5 spec. Since we're looking for particular 7-bit literal characters, - // we operate in terms of chars and only need a check for iter overrun, - // instead of having to read in a full next code point. + // This is the special handling for carriage returns that is mandated by + // the HTML5 spec. Since we're looking for particular 7-bit literal + // characters, we operate in terms of chars and only need a check for iter + // overrun, instead of having to read in a full next code point. // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#preprocessing-the-input-stream if (code_point == '\r') { assert(iter->_width == 1); @@ -165,10 +165,11 @@ static void read_char(Utf8Iterator* iter) { return; } } - // If we got here without exiting early, then we've reached the end of the iterator. - // Add an error for truncated input, set the width to consume the rest of the - // iterator, and emit a replacement character. The next time we enter this method, - // it will detect that there's no input to consume and + // If we got here without exiting early, then we've reached the end of the + // iterator. Add an error for truncated input, set the width to consume the + // rest of the iterator, and emit a replacement character. The next time we + // enter this method, it will detect that there's no input to consume and + // output an EOF. iter->_current = kUtf8ReplacementChar; iter->_width = iter->_end - iter->_start; add_error(iter, GUMBO_ERR_UTF8_TRUNCATED); From 8b867b48e47475b25f69c23d8dae1d47f8af7391 Mon Sep 17 00:00:00 2001 From: Jonathan Tang Date: Mon, 10 Nov 2014 13:10:25 -0800 Subject: [PATCH 06/38] Print the decimal value of the current character in the debug output for lexing, to ease debugging non-printable characters. --- src/tokenizer.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/tokenizer.c b/src/tokenizer.c index 7a7ae3c0..89c22d13 100644 --- a/src/tokenizer.c +++ b/src/tokenizer.c @@ -2941,7 +2941,8 @@ bool gumbo_lex(GumboParser* parser, GumboToken* output) { assert(!tokenizer->_temporary_buffer_emit); assert(tokenizer->_buffered_emit_char == kGumboNoChar); int c = utf8iterator_current(&tokenizer->_input); - gumbo_debug("Lexing character '%c' in state %d.\n", c, tokenizer->_state); + gumbo_debug("Lexing character '%c' (%d) in state %d.\n", + c, c, tokenizer->_state); StateResult result = dispatch_table[tokenizer->_state](parser, tokenizer, c, output); // We need to clear reconsume_current_input before returning to prevent From 3f6012a0c82b5511de04b74b532abedd0f5396d0 Mon Sep 17 00:00:00 2001 From: Jonathan Tang Date: Mon, 15 Dec 2014 11:41:18 -0800 Subject: [PATCH 07/38] Add test for unsafe cdata. --- tests/parser.cc | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/tests/parser.cc b/tests/parser.cc index 1942734f..72b60b41 100644 --- a/tests/parser.cc +++ b/tests/parser.cc @@ -1537,6 +1537,22 @@ TEST_F(GumboParserTest, CData) { EXPECT_STREQ("this is text", cdata->v.text.text); } +TEST_F(GumboParserTest, CDataUnsafe) { + Parse("\0filler\0text\0"); + + GumboNode* body; + GetAndAssertBody(root_, &body); + ASSERT_EQ(1, GetChildCount(body)); + + GumboNode* svg = GetChild(body, 0); + ASSERT_EQ(1, GetChildCount(svg)); + + GumboNode* cdata = GetChild(svg, 0); + ASSERT_EQ(GUMBO_NODE_CDATA, cdata->type); + // \xEF\xBF\xBD = unicode replacement char + EXPECT_STREQ("fillertext", cdata->v.text.text); +} + TEST_F(GumboParserTest, CDataInBody) { Parse("
"); From fe28c1831abc60ed0e00df364f266e5414c699da Mon Sep 17 00:00:00 2001 From: Jonathan Tang Date: Tue, 10 Feb 2015 18:17:06 -0800 Subject: [PATCH 08/38] Fix missing case statement for GUMBO_TOKEN_CDATA in handle_parser_error. (The whole error handling really needs to be redone, it's not very helpful to users.) --- src/error.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/error.c b/src/error.c index 3239a0b6..0cae4639 100644 --- a/src/error.c +++ b/src/error.c @@ -106,6 +106,7 @@ static void handle_parser_error(GumboParser* parser, // But just in case... print_message(parser, output, "Comments aren't legal here"); return; + case GUMBO_TOKEN_CDATA: case GUMBO_TOKEN_WHITESPACE: case GUMBO_TOKEN_CHARACTER: print_message(parser, output, "Character tokens aren't legal here"); From b6c9617f24d323497b2d63e6163f0fe20c5ddb8b Mon Sep 17 00:00:00 2001 From: Jonathan Tang Date: Tue, 10 Feb 2015 18:24:33 -0800 Subject: [PATCH 09/38] Additional debugging instructions. --- DEBUGGING.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/DEBUGGING.md b/DEBUGGING.md index 262ba1f1..8b8a56df 100644 --- a/DEBUGGING.md +++ b/DEBUGGING.md @@ -48,6 +48,9 @@ $ gdb .libs/lt-gumbo_test core The same goes for core dumps in other example binaries. +To run only a single unit test, pass the --gtest_filter='TestName' flag to the +lt-gumbo_test binary. + Assertions ========== From adc4c76daa2b3c481992edea2aea5daafc3bc753 Mon Sep 17 00:00:00 2001 From: Jonathan Tang Date: Mon, 16 Feb 2015 18:06:12 -0800 Subject: [PATCH 10/38] Add a test for utf8iterator_maybe_consume_match followed by a null. --- tests/utf8.cc | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/tests/utf8.cc b/tests/utf8.cc index 479e4bc8..a98c69c2 100644 --- a/tests/utf8.cc +++ b/tests/utf8.cc @@ -556,6 +556,21 @@ TEST_F(Utf8Test, MatchesCaseInsensitive) { EXPECT_EQ(-1, utf8iterator_current(&input_)); } +TEST_F(Utf8Test, MatchFollowedByNullByte) { + // Can't use ResetText, as the implicit strlen will choke on the null. + text_ = "CDATA\0f"; + utf8iterator_init(&parser_, text_, 7, &input_); + + EXPECT_TRUE(utf8iterator_maybe_consume_match( + &input_, "cdata", sizeof("cdata") - 1, false)); + + EXPECT_EQ(0, utf8iterator_current(&input_)); + EXPECT_EQ('\0', *utf8iterator_get_char_pointer(&input_)); + utf8iterator_next(&input_); + EXPECT_EQ('f', utf8iterator_current(&input_)); + EXPECT_EQ('f', *utf8iterator_get_char_pointer(&input_)); +} + TEST_F(Utf8Test, MarkReset) { ResetText("this is a test"); Advance(5); From 29f48f2c4d886a2e934713f01d9fd430cb9aad0a Mon Sep 17 00:00:00 2001 From: Jonathan Tang Date: Mon, 16 Feb 2015 21:56:51 -0800 Subject: [PATCH 11/38] Update parser and tokenizer tests with testcases for null CDATA, and make sure their input mechanisms can accept this without relying on strlen. --- tests/parser.cc | 6 +++++- tests/tokenizer.cc | 18 ++++++++++++++++++ 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/tests/parser.cc b/tests/parser.cc index 72b60b41..b87f60c0 100644 --- a/tests/parser.cc +++ b/tests/parser.cc @@ -1538,7 +1538,11 @@ TEST_F(GumboParserTest, CData) { } TEST_F(GumboParserTest, CDataUnsafe) { - Parse("\0filler\0text\0"); + // Can't use Parse() because of the strlen + output_ = gumbo_parse_with_options( + &options_, "\0filler\0text\0", + sizeof("\0filler\0text\0") - 1); + root_ = output_->document; GumboNode* body; GetAndAssertBody(root_, &body); diff --git a/tests/tokenizer.cc b/tests/tokenizer.cc index 532bad98..2e4b04ac 100644 --- a/tests/tokenizer.cc +++ b/tests/tokenizer.cc @@ -450,6 +450,24 @@ TEST_F(GumboTokenizerTest, ScriptDoubleEscaped) { EXPECT_EQ('>', token_.v.character); } +TEST_F(GumboTokenizerTest, CData) { + // SetInput uses strlen and so can't handle nulls. + text_ = "\0filler\0text\0"; + gumbo_tokenizer_state_destroy(&parser_); + gumbo_tokenizer_state_init( + &parser_, text_, sizeof("\0filler\0text\0") - 1); + gumbo_tokenizer_set_is_current_node_foreign(&parser_, true); + + EXPECT_TRUE(gumbo_lex(&parser_, &token_)); + EXPECT_EQ(GUMBO_TOKEN_CDATA, token_.type); + EXPECT_EQ(0, token_.v.character); + + gumbo_token_destroy(&parser_, &token_); + EXPECT_TRUE(gumbo_lex(&parser_, &token_)); + EXPECT_EQ(GUMBO_TOKEN_CDATA, token_.type); + EXPECT_EQ('f', token_.v.character); +} + TEST_F(GumboTokenizerTest, StyleHasTagEmbedded) { SetInput(" */"); Advance(1); From 7fea4b5c25b6a1c0f2fb22d46e28ae6fd2495139 Mon Sep 17 00:00:00 2001 From: Jonathan Tang Date: Mon, 16 Feb 2015 22:35:41 -0800 Subject: [PATCH 12/38] Fix handling of nulls in CDATA sections. --- src/parser.c | 2 +- src/tokenizer.c | 2 +- tests/parser.cc | 3 ++- tests/tokenizer.cc | 2 +- 4 files changed, 5 insertions(+), 4 deletions(-) diff --git a/src/parser.c b/src/parser.c index b2c1ad8b..9296e5d8 100644 --- a/src/parser.c +++ b/src/parser.c @@ -1021,6 +1021,7 @@ static GumboNode* insert_foreign_element( static void insert_text_token(GumboParser* parser, GumboToken* token) { assert(token->type == GUMBO_TOKEN_WHITESPACE || token->type == GUMBO_TOKEN_CHARACTER || + token->type == GUMBO_TOKEN_NULL || token->type == GUMBO_TOKEN_CDATA); TextNodeBufferState* buffer_state = &parser->_parser_state->_text_node; if (buffer_state->_buffer.length == 0) { @@ -3490,7 +3491,6 @@ static bool handle_in_foreign_content(GumboParser* parser, GumboToken* token) { switch (token->type) { case GUMBO_TOKEN_NULL: parser_add_parse_error(parser, token); - token->type = GUMBO_TOKEN_CHARACTER; token->v.character = kUtf8ReplacementChar; insert_text_token(parser, token); return false; diff --git a/src/tokenizer.c b/src/tokenizer.c index 89c22d13..8c9272c0 100644 --- a/src/tokenizer.c +++ b/src/tokenizer.c @@ -320,7 +320,7 @@ static int ensure_lowercase(int c) { } static GumboTokenType get_char_token_type(bool is_in_cdata, int c) { - if (is_in_cdata && c != -1) { + if (is_in_cdata && c > 0) { return GUMBO_TOKEN_CDATA; } diff --git a/tests/parser.cc b/tests/parser.cc index b87f60c0..590f549a 100644 --- a/tests/parser.cc +++ b/tests/parser.cc @@ -1554,7 +1554,8 @@ TEST_F(GumboParserTest, CDataUnsafe) { GumboNode* cdata = GetChild(svg, 0); ASSERT_EQ(GUMBO_NODE_CDATA, cdata->type); // \xEF\xBF\xBD = unicode replacement char - EXPECT_STREQ("fillertext", cdata->v.text.text); + EXPECT_STREQ("\xEF\xBF\xBD" "filler\xEF\xBF\xBD" "text\xEF\xBF\xBD", + cdata->v.text.text); } TEST_F(GumboParserTest, CDataInBody) { diff --git a/tests/tokenizer.cc b/tests/tokenizer.cc index 2e4b04ac..916494e2 100644 --- a/tests/tokenizer.cc +++ b/tests/tokenizer.cc @@ -459,7 +459,7 @@ TEST_F(GumboTokenizerTest, CData) { gumbo_tokenizer_set_is_current_node_foreign(&parser_, true); EXPECT_TRUE(gumbo_lex(&parser_, &token_)); - EXPECT_EQ(GUMBO_TOKEN_CDATA, token_.type); + EXPECT_EQ(GUMBO_TOKEN_NULL, token_.type); EXPECT_EQ(0, token_.v.character); gumbo_token_destroy(&parser_, &token_); From 4383a40605ee7872a8e2de58553383a13d919153 Mon Sep 17 00:00:00 2001 From: Kevin Hendricks Date: Sat, 14 Feb 2015 14:45:49 -0500 Subject: [PATCH 13/38] First pass at getting template changes on top of new master --- src/gumbo.h | 12 +- src/parser.c | 438 ++++++++++++++++++++++++++++++++++++++++----------- src/vector.c | 2 +- 3 files changed, 353 insertions(+), 99 deletions(-) diff --git a/src/gumbo.h b/src/gumbo.h index a1b9a036..e317acec 100644 --- a/src/gumbo.h +++ b/src/gumbo.h @@ -141,7 +141,7 @@ extern const GumboVector kGumboEmptyVector; * Returns the first index at which an element appears in this vector (testing * by pointer equality), or -1 if it never does. */ -int gumbo_vector_index_of(GumboVector* vector, void* element); +int gumbo_vector_index_of(GumboVector* vector, const void* element); /** @@ -461,10 +461,16 @@ typedef enum { GUMBO_NODE_TEXT, /** CDATA node. v will be a GumboText. */ GUMBO_NODE_CDATA, - /** Comment node. v. will be a GumboText, excluding comment delimiters. */ + /** Comment node. v will be a GumboText, excluding comment delimiters. */ GUMBO_NODE_COMMENT, /** Text node, where all contents is whitespace. v will be a GumboText. */ - GUMBO_NODE_WHITESPACE + GUMBO_NODE_WHITESPACE, + /** Template node. This is separate from GUMBO_NODE_ELEMENT because many + * client libraries will want to ignore the contents of template nodes, as + * the spec suggests. Recursing on GUMBO_NODE_ELEMENT will do the right thing + * here, while clients that want to include template contents should also + * check for GUMBO_NODE_TEMPLATE. v will be a GumboElement. */ + GUMBO_NODE_TEMPLATE } GumboNodeType; /** diff --git a/src/parser.c b/src/parser.c index 9296e5d8..4434aaed 100644 --- a/src/parser.c +++ b/src/parser.c @@ -47,6 +47,13 @@ typedef char gumbo_tagset[GUMBO_TAG_LAST]; tagset[(int)tag] == (1 << (int)namespace)) + +// selected forward declarations as it is getting hard to find +// an appropriate order +static bool node_html_tag_is(const GumboNode*, GumboTag); +static GumboInsertionMode get_current_template_insertion_mode(const GumboParser*); +static bool handle_in_template(GumboParser*, GumboToken*); + static void* malloc_wrapper(void* unused, size_t size) { return malloc(size); } @@ -550,55 +557,74 @@ static void set_insertion_mode(GumboParser* parser, GumboInsertionMode mode) { parser->_parser_state->_insertion_mode = mode; } + // http://www.whatwg.org/specs/web-apps/current-work/complete/parsing.html#reset-the-insertion-mode-appropriately // This is a helper function that returns the appropriate insertion mode instead // of setting it. Returns GUMBO_INSERTION_MODE_INITIAL as a sentinel value to // indicate that there is no appropriate insertion mode, and the loop should // continue. -static GumboInsertionMode get_appropriate_insertion_mode( - const GumboNode* node, bool is_last) { - assert(node->type == GUMBO_NODE_ELEMENT); - - if (node->v.element.tag_namespace == GUMBO_NAMESPACE_HTML) { - switch (node->v.element.tag) { - case GUMBO_TAG_SELECT: - return GUMBO_INSERTION_MODE_IN_SELECT; - case GUMBO_TAG_TD: - case GUMBO_TAG_TH: - return is_last ? - GUMBO_INSERTION_MODE_IN_BODY : GUMBO_INSERTION_MODE_IN_CELL; - case GUMBO_TAG_TR: - return GUMBO_INSERTION_MODE_IN_ROW; - case GUMBO_TAG_TBODY: - case GUMBO_TAG_THEAD: - case GUMBO_TAG_TFOOT: - return GUMBO_INSERTION_MODE_IN_TABLE_BODY; - case GUMBO_TAG_CAPTION: - return GUMBO_INSERTION_MODE_IN_CAPTION; - case GUMBO_TAG_COLGROUP: - return GUMBO_INSERTION_MODE_IN_COLUMN_GROUP; - case GUMBO_TAG_TABLE: - return GUMBO_INSERTION_MODE_IN_TABLE; - case GUMBO_TAG_HEAD: - case GUMBO_TAG_BODY: - return GUMBO_INSERTION_MODE_IN_BODY; - case GUMBO_TAG_FRAMESET: - return GUMBO_INSERTION_MODE_IN_FRAMESET; - case GUMBO_TAG_HTML: - return GUMBO_INSERTION_MODE_BEFORE_HEAD; - default: - break; - } - } - return is_last ? GUMBO_INSERTION_MODE_IN_BODY : GUMBO_INSERTION_MODE_INITIAL; +static GumboInsertionMode get_appropriate_insertion_mode(const GumboParser* parser, int index) { + const GumboVector* open_elements = &parser->_parser_state->_open_elements; + const GumboNode* node = open_elements->data[index]; + bool is_last = index == 0; + assert(node->type == GUMBO_NODE_ELEMENT || node->type == GUMBO_NODE_TEMPLATE); + switch (node->v.element.tag) { + case GUMBO_TAG_SELECT: + if (is_last) { + return GUMBO_INSERTION_MODE_IN_SELECT; + } + int i = index; + for (const GumboNode* ancestor = open_elements->data[i]; + i > 0; --i) { + if (node_html_tag_is(ancestor, GUMBO_TAG_TEMPLATE)) { + return GUMBO_INSERTION_MODE_IN_SELECT; + } + if (node_html_tag_is(ancestor, GUMBO_TAG_TABLE)) { + return GUMBO_INSERTION_MODE_IN_SELECT_IN_TABLE; + } + } + return GUMBO_INSERTION_MODE_IN_SELECT; + case GUMBO_TAG_TD: + case GUMBO_TAG_TH: + return is_last ? + GUMBO_INSERTION_MODE_INITIAL : GUMBO_INSERTION_MODE_IN_CELL; + case GUMBO_TAG_TR: + return GUMBO_INSERTION_MODE_IN_ROW; + case GUMBO_TAG_TBODY: + case GUMBO_TAG_THEAD: + case GUMBO_TAG_TFOOT: + return GUMBO_INSERTION_MODE_IN_TABLE_BODY; + case GUMBO_TAG_CAPTION: + return GUMBO_INSERTION_MODE_IN_CAPTION; + case GUMBO_TAG_COLGROUP: + return GUMBO_INSERTION_MODE_IN_COLUMN_GROUP; + case GUMBO_TAG_TABLE: + return GUMBO_INSERTION_MODE_IN_TABLE; + case GUMBO_TAG_TEMPLATE: + return get_current_template_insertion_mode(parser); + case GUMBO_TAG_HEAD: + return is_last ? + GUMBO_INSERTION_MODE_INITIAL : GUMBO_INSERTION_MODE_IN_HEAD; + case GUMBO_TAG_BODY: + return GUMBO_INSERTION_MODE_IN_BODY; + case GUMBO_TAG_FRAMESET: + return GUMBO_INSERTION_MODE_IN_FRAMESET; + case GUMBO_TAG_HTML: + return parser->_parser_state->_head_element ? + GUMBO_INSERTION_MODE_AFTER_HEAD : GUMBO_INSERTION_MODE_BEFORE_HEAD; + default: + return is_last ? + GUMBO_INSERTION_MODE_IN_BODY : GUMBO_INSERTION_MODE_INITIAL; + } } + // This performs the actual "reset the insertion mode" loop. static void reset_insertion_mode_appropriately(GumboParser* parser) { const GumboVector* open_elements = &parser->_parser_state->_open_elements; for (int i = open_elements->length; --i >= 0; ) { GumboInsertionMode mode = - get_appropriate_insertion_mode(open_elements->data[i], i == 0); + get_appropriate_insertion_mode(parser, i); if (mode != GUMBO_INSERTION_MODE_INITIAL) { set_insertion_mode(parser, mode); return; @@ -632,7 +658,7 @@ static GumboError* parser_add_parse_error(GumboParser* parser, const GumboToken* &extra_data->tag_stack); for (int i = 0; i < state->_open_elements.length; ++i) { const GumboNode* node = state->_open_elements.data[i]; - assert(node->type == GUMBO_NODE_ELEMENT); + assert(node->type == GUMBO_NODE_ELEMENT || node->type == GUMBO_NODE_TEMPLATE); gumbo_vector_add(parser, (void*) node->v.element.tag, &extra_data->tag_stack); } @@ -669,7 +695,7 @@ static bool tag_is(const GumboToken* token, bool is_start, GumboTag tag) { // Like tag_in, but checks for the tag of a node, rather than a token. static bool node_tag_in_set(const GumboNode* node, const gumbo_tagset tags) { assert(node != NULL); - if (node->type != GUMBO_NODE_ELEMENT) { + if (node->type != GUMBO_NODE_ELEMENT && node->type != GUMBO_NODE_TEMPLATE) { return false; } return TAGSET_INCLUDES(tags, node->v.element.tag_namespace, node->v.element.tag); @@ -678,7 +704,7 @@ static bool node_tag_in_set(const GumboNode* node, const gumbo_tagset tags) { // Like node_tag_in, but for the single-tag case. static bool node_qualified_tag_is(const GumboNode* node, GumboNamespaceEnum ns, GumboTag tag) { - return node->type == GUMBO_NODE_ELEMENT && + return (node->type == GUMBO_NODE_ELEMENT || node->type == GUMBO_NODE_TEMPLATE) && node->v.element.tag == tag && node->v.element.tag_namespace == ns; } @@ -689,6 +715,23 @@ static bool node_html_tag_is(const GumboNode* node, GumboTag tag) return node_qualified_tag_is(node, GUMBO_NAMESPACE_HTML, tag); } +static void push_template_insertion_mode(GumboParser* parser, GumboInsertionMode mode) { + gumbo_vector_add(parser, (void*) mode, &parser->_parser_state->_template_insertion_modes); +} + +static void pop_template_insertion_mode(GumboParser* parser) { + gumbo_vector_pop(parser, &parser->_parser_state->_template_insertion_modes); +} + +// Returns the current template insertion mode. If the stack of template +// insertion modes is empty, this returns GUMBO_INSERTION_MODE_INITIAL. +static GumboInsertionMode get_current_template_insertion_mode(const GumboParser* parser) { + GumboVector* template_insertion_modes = &parser->_parser_state->_template_insertion_modes; + if (template_insertion_modes->length == 0) { + return GUMBO_INSERTION_MODE_INITIAL; + } + return (GumboInsertionMode) template_insertion_modes->data[template_insertion_modes->length - 1]; +} // http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#mathml-text-integration-point static bool is_mathml_integration_point(const GumboNode* node) { @@ -706,6 +749,63 @@ static bool is_html_integration_point(const GumboNode* node) { "encoding", "application/xhtml+xml"))); } + +// This represents a place to insert a node, consisting of a target parent and a +// child index within that parent. If the node should be inserted at the end of +// the parent's child, index will be -1. +typedef struct { + GumboNode* target; + int index; +} InsertionLocation; + +InsertionLocation get_appropriate_insertion_location(GumboParser* parser, GumboNode* override_target) { + InsertionLocation retval = { override_target, -1 }; + if (retval.target == NULL) { + // No override target; default to the current node, but special-case the + // root node since get_current_node() assumes the stack of open elements is + // non-empty. + retval.target = parser->_output->root != NULL ? + get_current_node(parser) : get_document_node(parser); + } + if (!parser->_parser_state->_foster_parent_insertions || + !node_tag_in_set(retval.target, (gumbo_tagset) { TAG(TABLE), TAG(TBODY), + TAG(TFOOT), TAG(THEAD), TAG(TR) })) { + return retval; + } + + // Foster-parenting case. + int last_template_index = -1; + int last_table_index = -1; + GumboVector* open_elements = &parser->_parser_state->_open_elements; + for (int i = 0; i < open_elements->length; ++i) { + if (node_html_tag_is(open_elements->data[i], GUMBO_TAG_TEMPLATE)) { + last_template_index = i; + } + if (node_html_tag_is(open_elements->data[i], GUMBO_TAG_TABLE)) { + last_table_index = i; + } + } + if (last_template_index != -1 && + (last_table_index == -1 || last_template_index > last_table_index)) { + retval.target = open_elements->data[last_template_index]; + return retval; + } + if (last_table_index == -1) { + retval.target = open_elements->data[0]; + return retval; + } + GumboNode* last_table = open_elements->data[last_table_index]; + if (last_table->parent != NULL) { + retval.target = last_table->parent; + retval.index = last_table->index_within_parent; + return retval; + } + + retval.target = open_elements->data[last_table_index - 1]; + return retval; +} + + // Appends a node to the end of its parent, setting the "parent" and // "index_within_parent" fields appropriately. static void append_node( @@ -713,7 +813,7 @@ static void append_node( assert(node->parent == NULL); assert(node->index_within_parent == -1); GumboVector* children; - if (parent->type == GUMBO_NODE_ELEMENT) { + if (parent->type == GUMBO_NODE_ELEMENT || parent->type == GUMBO_NODE_TEMPLATE) { children = &parent->v.element.children; } else { assert(parent->type == GUMBO_NODE_DOCUMENT); @@ -731,7 +831,7 @@ static void insert_node( GumboParser* parser, GumboNode* parent, int index, GumboNode* node) { assert(node->parent == NULL); assert(node->index_within_parent == -1); - assert(parent->type == GUMBO_NODE_ELEMENT); + assert(parent->type == GUMBO_NODE_ELEMENT || parent->type == GUMBO_NODE_TEMPLATE); GumboVector* children = &parent->v.element.children; assert(index >= 0); assert(index < children->length); @@ -844,7 +944,7 @@ static GumboNode* pop_current_node(GumboParser* parser) { assert(state->_open_elements.length == 0); return NULL; } - assert(current_node->type == GUMBO_NODE_ELEMENT); + assert(current_node->type == GUMBO_NODE_ELEMENT || current_node->type == GUMBO_NODE_TEMPLATE); bool is_closed_body_or_html_tag = (node_html_tag_is(current_node, GUMBO_TAG_BODY) && state->_closed_body_tag) || (node_html_tag_is(current_node, GUMBO_TAG_HTML) && state->_closed_html_tag); @@ -873,14 +973,14 @@ static void append_comment_node( // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#clear-the-stack-back-to-a-table-row-context static void clear_stack_to_table_row_context(GumboParser* parser) { - while (!node_tag_in_set(get_current_node(parser), (gumbo_tagset) { TAG(HTML), TAG(TR) })) { + while (!node_tag_in_set(get_current_node(parser), (gumbo_tagset) { TAG(HTML), TAG(TR), TAG(TEMPLATE)})) { pop_current_node(parser); } } // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#clear-the-stack-back-to-a-table-context static void clear_stack_to_table_context(GumboParser* parser) { - while (!node_tag_in_set(get_current_node(parser), (gumbo_tagset) { TAG(HTML), TAG(TABLE) } )) { + while (!node_tag_in_set(get_current_node(parser), (gumbo_tagset) { TAG(HTML), TAG(TABLE), TAG(TEMPLATE) } )) { pop_current_node(parser); } } @@ -888,7 +988,7 @@ static void clear_stack_to_table_context(GumboParser* parser) { // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#clear-the-stack-back-to-a-table-body-context void clear_stack_to_table_body_context(GumboParser* parser) { while (!node_tag_in_set(get_current_node(parser), (gumbo_tagset) { TAG(HTML), TAG(TBODY), - TAG(TFOOT), TAG(THEAD) })) { + TAG(TFOOT), TAG(THEAD), TAG(TEMPLATE) })) { pop_current_node(parser); } } @@ -914,7 +1014,12 @@ static GumboNode* create_element_from_token( assert(token->type == GUMBO_TOKEN_START_TAG); GumboTokenStartTag* start_tag = &token->v.start_tag; - GumboNode* node = create_node(parser, GUMBO_NODE_ELEMENT); + GumboNodeType type = ( + tag_namespace == GUMBO_NAMESPACE_HTML && + start_tag->tag == GUMBO_TAG_TEMPLATE) + ? GUMBO_NODE_TEMPLATE : GUMBO_NODE_ELEMENT; + + GumboNode* node = create_node(parser, type); GumboElement* element = &node->v.element; gumbo_vector_init(parser, 1, &element->children); element->attributes = start_tag->attributes; @@ -1137,7 +1242,7 @@ static bool is_open_element(GumboParser* parser, const GumboNode* node) { // values are fresh copies. GumboNode* clone_node( GumboParser* parser, const GumboNode* node, GumboParseFlags reason) { - assert(node->type == GUMBO_NODE_ELEMENT); + assert(node->type == GUMBO_NODE_ELEMENT || node->type == GUMBO_NODE_TEMPLATE); GumboNode* new_node = gumbo_parser_allocate(parser, sizeof(GumboNode)); *new_node = *node; new_node->parent = NULL; @@ -1267,7 +1372,7 @@ static bool has_an_element_in_specific_scope(GumboParser* parser, gumbo_tagset e bool result = false; for (int i = open_elements->length; --i >= 0; ) { const GumboNode* node = open_elements->data[i]; - if (node->type != GUMBO_NODE_ELEMENT) { + if (node->type != GUMBO_NODE_ELEMENT && node->type != GUMBO_NODE_TEMPLATE) { continue; } if (TAGSET_INCLUDES(expected, node->v.element.tag_namespace, node->v.element.tag)) { @@ -1285,6 +1390,13 @@ static bool has_an_element_in_specific_scope(GumboParser* parser, gumbo_tagset e return result; } +// Checks for the presence of an open element of the specified tag type. +static bool has_open_element(GumboParser* parser, GumboTag tag) { + gumbo_tagset qualset = {0}; + qualset[(int) tag] = (1 << (int) GUMBO_NAMESPACE_HTML); + return has_an_element_in_specific_scope(parser, qualset, false, (gumbo_tagset) { TAG(HTML) } ); +} + // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#has-an-element-in-scope static bool has_an_element_in_scope(GumboParser* parser, GumboTag tag) { gumbo_tagset qualset = {0}; @@ -1309,7 +1421,7 @@ static bool has_node_in_scope(GumboParser* parser, const GumboNode* node) { if (current == node) { return true; } - if (current->type != GUMBO_NODE_ELEMENT) { + if (current->type != GUMBO_NODE_ELEMENT && current->type != GUMBO_NODE_TEMPLATE) { continue; } if (node_tag_in_set(current, (gumbo_tagset) { TAG(APPLET), TAG(CAPTION), TAG(HTML), @@ -1329,7 +1441,7 @@ static bool has_node_in_scope(GumboParser* parser, const GumboNode* node) { static bool has_an_element_in_scope_with_tagname(GumboParser* parser, gumbo_tagset qualset) { return has_an_element_in_specific_scope(parser, qualset, false, (gumbo_tagset) { TAG(APPLET), TAG(CAPTION), TAG(HTML), TAG(TABLE), TAG(TD), TAG(TH), TAG(MARQUEE), - TAG(OBJECT), TAG_MATHML(MI), TAG_MATHML(MO), TAG_MATHML(MN), + TAG(OBJECT), TAG(TEMPLATE), TAG_MATHML(MI), TAG_MATHML(MO), TAG_MATHML(MN), TAG_MATHML(MS), TAG_MATHML(MTEXT), TAG_MATHML(ANNOTATION_XML), TAG_SVG(FOREIGNOBJECT), TAG_SVG(DESC), TAG_SVG(TITLE) }); } @@ -1340,7 +1452,7 @@ static bool has_an_element_in_list_scope(GumboParser* parser, GumboTag tag) { qualset[(int)tag] = (1 << (int)(GUMBO_NAMESPACE_HTML)); return has_an_element_in_specific_scope(parser, qualset, false, (gumbo_tagset) { TAG(APPLET), TAG(CAPTION), TAG(HTML), TAG(TABLE), TAG(TD), TAG(TH), TAG(MARQUEE), - TAG(OBJECT), TAG_MATHML(MI), TAG_MATHML(MO), TAG_MATHML(MN), + TAG(OBJECT), TAG(TEMPLATE), TAG_MATHML(MI), TAG_MATHML(MO), TAG_MATHML(MN), TAG_MATHML(MS), TAG_MATHML(MTEXT), TAG_MATHML(ANNOTATION_XML), TAG_SVG(FOREIGNOBJECT), TAG_SVG(DESC), TAG_SVG(TITLE), TAG(OL), TAG(UL) }); @@ -1352,7 +1464,7 @@ static bool has_an_element_in_button_scope(GumboParser* parser, GumboTag tag) { qualset[(int) tag] = (1 << (int)(GUMBO_NAMESPACE_HTML)); return has_an_element_in_specific_scope(parser, qualset, false, (gumbo_tagset) { TAG(APPLET), TAG(CAPTION), TAG(HTML), TAG(TABLE), TAG(TD), TAG(TH), TAG(MARQUEE), - TAG(OBJECT), TAG_MATHML(MI), TAG_MATHML(MO), TAG_MATHML(MN), + TAG(OBJECT), TAG(TEMPLATE), TAG_MATHML(MI), TAG_MATHML(MO), TAG_MATHML(MN), TAG_MATHML(MS), TAG_MATHML(MTEXT), TAG_MATHML(ANNOTATION_XML), TAG_SVG(FOREIGNOBJECT), TAG_SVG(DESC), TAG_SVG(TITLE), TAG(BUTTON) }); } @@ -1361,7 +1473,8 @@ static bool has_an_element_in_button_scope(GumboParser* parser, GumboTag tag) { static bool has_an_element_in_table_scope(GumboParser* parser, GumboTag tag) { gumbo_tagset qualset = {0}; qualset[(int) tag] = (1 << (int)(GUMBO_NAMESPACE_HTML)); - return has_an_element_in_specific_scope(parser, qualset, false, (gumbo_tagset) { TAG(HTML), TAG(TABLE) }); + return has_an_element_in_specific_scope(parser, qualset, false, (gumbo_tagset) { TAG(HTML), + TAG(TABLE), TAG(TEMPLATE) }); } // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#has-an-element-in-select-scope @@ -1382,6 +1495,16 @@ static void generate_implied_end_tags(GumboParser* parser, GumboTag exception) { pop_current_node(parser)); } +// This is the "generate all implied end tags thoroughly" clause of the spec. +static void generate_all_implied_end_tags_thoroughly(GumboParser* parser) { + for (; + node_tag_in_set(get_current_node(parser), (gumbo_tagset) { TAG(CAPTION), + TAG(COLGROUP), TAG(DD), TAG(DT), TAG(LI), TAG(OPTION), TAG(OPTGROUP), + TAG(P), TAG(RP), TAG(RT), TAG(TBODY), TAG(TD), TAG(TFOOT), + TAG(TH), TAG(HEAD), TAG(TR) }); + pop_current_node(parser)); +} + // This factors out the clauses relating to "act as if an end tag token with tag // name "table" had been seen. Returns true if there's a table element in table // scope which was successfully closed, false if not and the token should be @@ -1446,7 +1569,7 @@ static void close_current_select(GumboParser* parser) { // The list of nodes in the "special" category: // http://www.whatwg.org/specs/web-apps/current-work/complete/parsing.html#special static bool is_special_node(const GumboNode* node) { - assert(node->type == GUMBO_NODE_ELEMENT); + assert(node->type == GUMBO_NODE_ELEMENT || node->type == GUMBO_NODE_TEMPLATE); return node_tag_in_set(node, (gumbo_tagset) { TAG(ADDRESS), TAG(APPLET), TAG(AREA), TAG(ARTICLE), TAG(ASIDE), TAG(BASE), TAG(BASEFONT), TAG(BGSOUND), TAG(BLOCKQUOTE), TAG(BODY), TAG(BR), TAG(BUTTON), TAG(CAPTION), TAG(CENTER), TAG(COL), @@ -1458,8 +1581,8 @@ static bool is_special_node(const GumboNode* node) { TAG(LISTING), TAG(MARQUEE), TAG(MENU), TAG(META), TAG(NAV), TAG(NOEMBED), TAG(NOFRAMES), TAG(NOSCRIPT), TAG(OBJECT), TAG(OL), TAG(P), TAG(PARAM), TAG(PLAINTEXT), TAG(PRE), TAG(SCRIPT), TAG(SECTION), TAG(SELECT), TAG(STYLE), - TAG(SUMMARY), TAG(TABLE), TAG(TBODY), TAG(TD), TAG(TEXTAREA), TAG(TFOOT), - TAG(TH), TAG(THEAD), TAG(TITLE), TAG(TR), TAG(UL), TAG(WBR), TAG(XMP), + TAG(SUMMARY), TAG(TABLE), TAG(TBODY), TAG(TD), TAG(TEMPLATE), TAG(TEXTAREA), + TAG(TFOOT), TAG(TH), TAG(THEAD), TAG(TITLE), TAG(TR), TAG(UL), TAG(WBR), TAG(XMP), TAG_MATHML(MI), TAG_MATHML(MO), TAG_MATHML(MN), TAG_MATHML(MS), TAG_MATHML(MTEXT), TAG_MATHML(ANNOTATION_XML), @@ -2054,6 +2177,30 @@ static bool handle_in_head(GumboParser* parser, GumboToken* token) { assert(node_html_tag_is(head, GUMBO_TAG_HEAD)); set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_HEAD); return true; + } else if (tag_is(token, kStartTag, GUMBO_TAG_TEMPLATE)) { + insert_element_from_token(parser, token); + add_formatting_element(parser, &kActiveFormattingScopeMarker); + parser->_parser_state->_frameset_ok = false; + set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TEMPLATE); + push_template_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TEMPLATE); + return true; + } else if (tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE)) { + if (!has_open_element(parser, GUMBO_TAG_TEMPLATE)) { + parser_add_parse_error(parser, token); + ignore_token(parser); + return false; + } + generate_all_implied_end_tags_thoroughly(parser); + bool success = true; + if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_TEMPLATE)) { + parser_add_parse_error(parser, token); + success = false; + } + while (!node_html_tag_is(pop_current_node(parser), GUMBO_TAG_TEMPLATE)); + clear_active_formatting_elements(parser); + pop_template_insertion_mode(parser); + reset_insertion_mode_appropriately(parser); + return success; } else if (tag_is(token, kStartTag, GUMBO_TAG_HEAD)) { parser_add_parse_error(parser, token); ignore_token(parser); @@ -2070,7 +2217,7 @@ static bool handle_in_head(GumboParser* parser, GumboToken* token) { return false; } else { const GumboNode* node = pop_current_node(parser); - assert(node_html_tag_is(node, GUMBO_TAG_HEAD)); + assert(node_tag_in_set(node, (gumbo_tagset) { TAG(HEAD), TAG(TEMPLATE) })); AVOID_UNUSED_VARIABLE_WARNING(node); set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_HEAD); parser->_parser_state->_reprocess_current_token = true; @@ -2142,7 +2289,7 @@ static bool handle_after_head(GumboParser* parser, GumboToken* token) { } else if (tag_in(token, kStartTag, (gumbo_tagset) { TAG(BASE), TAG(BASEFONT), TAG(BGSOUND), TAG(LINK), TAG(META), TAG(NOFRAMES), TAG(SCRIPT), TAG(STYLE), - TAG(TITLE) })) { + TAG(TEMPLATE), TAG(TITLE) })) { parser_add_parse_error(parser, token); assert(state->_head_element != NULL); // This must be flushed before we push the head element on, as there may be @@ -2152,6 +2299,8 @@ static bool handle_after_head(GumboParser* parser, GumboToken* token) { bool result = handle_in_head(parser, token); gumbo_vector_remove(parser, state->_head_element, &state->_open_elements); return result; + } else if (tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE)) { + return handle_in_head(parser, token); } else if (tag_is(token, kStartTag, GUMBO_TAG_HEAD) || (token->type == GUMBO_TOKEN_END_TAG && !tag_in(token, kEndTag, (gumbo_tagset) { TAG(BODY), TAG(HTML), TAG(BR) }))) { @@ -2180,6 +2329,7 @@ static void destroy_node(GumboParser* parser, GumboNode* node) { gumbo_parser_deallocate(parser, (void*) doc->system_identifier); } break; + case GUMBO_NODE_TEMPLATE: case GUMBO_NODE_ELEMENT: for (int i = 0; i < node->v.element.attributes.length; ++i) { gumbo_destroy_attribute(parser, node->v.element.attributes.data[i]); @@ -2234,12 +2384,12 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) { } else if (tag_in(token, kStartTag, (gumbo_tagset) { TAG(BASE), TAG(BASEFONT), TAG(BGSOUND), TAG(MENUITEM), TAG(LINK), TAG(META), TAG(NOFRAMES), TAG(SCRIPT), - TAG(STYLE), TAG(TITLE) } )) { + TAG(STYLE), TAG(TEMPLATE), TAG(TITLE) } ) || tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE)) { return handle_in_head(parser, token); } else if (tag_is(token, kStartTag, GUMBO_TAG_BODY)) { parser_add_parse_error(parser, token); if (state->_open_elements.length < 2 || - !node_html_tag_is(state->_open_elements.data[1], GUMBO_TAG_BODY)) { + !node_html_tag_is(state->_open_elements.data[1], GUMBO_TAG_BODY) || has_open_element(parser, GUMBO_TAG_TEMPLATE)) { ignore_token(parser); return false; } @@ -2286,6 +2436,10 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) { set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_FRAMESET); return true; } else if (token->type == GUMBO_TOKEN_EOF) { + if (get_current_template_insertion_mode(parser) != + GUMBO_INSERTION_MODE_INITIAL) { + return handle_in_template(parser, token); + } for (int i = 0; i < state->_open_elements.length; ++i) { if (!node_tag_in_set(state->_open_elements.data[i], (gumbo_tagset) { TAG(DD), TAG(DT), TAG(LI), TAG(P), TAG(TBODY), TAG(TD), TAG(TFOOT), TAG(TH), @@ -2347,15 +2501,17 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) { state->_frameset_ok = false; return result; } else if (tag_is(token, kStartTag, GUMBO_TAG_FORM)) { - if (state->_form_element != NULL) { + if (state->_form_element != NULL && !has_open_element(parser, GUMBO_TAG_TEMPLATE)) { gumbo_debug("Ignoring nested form.\n"); parser_add_parse_error(parser, token); ignore_token(parser); return false; } bool result = maybe_implicitly_close_p_tag(parser, token); - state->_form_element = - insert_element_from_token(parser, token); + GumboNode* form_element = insert_element_from_token(parser, token); + if (!has_open_element(parser, GUMBO_TAG_TEMPLATE)) { + state->_form_element = form_element; + } return result; } else if (tag_is(token, kStartTag, GUMBO_TAG_LI)) { maybe_implicitly_close_list_tag(parser, token, true); @@ -2398,30 +2554,45 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) { implicitly_close_tags(parser, token, GUMBO_NAMESPACE_HTML, token->v.end_tag); return true; } else if (tag_is(token, kEndTag, GUMBO_TAG_FORM)) { - bool result = true; - const GumboNode* node = state->_form_element; - assert(!node || node->type == GUMBO_NODE_ELEMENT); - state->_form_element = NULL; - if (!node || !has_node_in_scope(parser, node)) { - gumbo_debug("Closing an unopened form.\n"); - parser_add_parse_error(parser, token); - ignore_token(parser); - return false; - } - // This differs from implicitly_close_tags because we remove *only* the - //
element; other nodes are left in scope. - generate_implied_end_tags(parser, GUMBO_TAG_LAST); - if (get_current_node(parser) != node) { - parser_add_parse_error(parser, token); - result = false; - } + if (has_open_element(parser, GUMBO_TAG_TEMPLATE)) { + if (!has_an_element_in_scope(parser, GUMBO_TAG_FORM)) { + parser_add_parse_error(parser, token); + ignore_token(parser); + return false; + } + bool success = true; + generate_implied_end_tags(parser, GUMBO_TAG_LAST); + if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_FORM)) { + parser_add_parse_error(parser, token); + return false; + } + while(!node_html_tag_is(pop_current_node(parser), GUMBO_TAG_FORM)); + return success; + } else { + bool result = true; + const GumboNode* node = state->_form_element; + assert(!node || node->type == GUMBO_NODE_ELEMENT); + state->_form_element = NULL; + if (!node || !has_node_in_scope(parser, node)) { + gumbo_debug("Closing an unopened form.\n"); + parser_add_parse_error(parser, token); + ignore_token(parser); + return false; + } + // This differs from implicitly_close_tags because we remove *only* the + // element; other nodes are left in scope. + generate_implied_end_tags(parser, GUMBO_TAG_LAST); + if (get_current_node(parser) != node) { + parser_add_parse_error(parser, token); + result = false; + } - GumboVector* open_elements = &state->_open_elements; - int index = open_elements->length - 1; - for (; index >= 0 && open_elements->data[index] != node; --index); - assert(index >= 0); - gumbo_vector_remove_at(parser, index, open_elements); - return result; + GumboVector* open_elements = &state->_open_elements; + int index = gumbo_vector_index_of(open_elements, node); + assert(index >= 0); + gumbo_vector_remove_at(parser, index, open_elements); + return result; + } } else if (tag_is(token, kEndTag, GUMBO_TAG_P)) { if (!has_an_element_in_button_scope(parser, GUMBO_TAG_P)) { parser_add_parse_error(parser, token); @@ -2592,7 +2763,8 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) { return result; } else if (tag_is(token, kStartTag, GUMBO_TAG_ISINDEX)) { parser_add_parse_error(parser, token); - if (parser->_parser_state->_form_element != NULL) { + if (parser->_parser_state->_form_element != NULL && + !has_open_element(parser, GUMBO_TAG_TEMPLATE)) { ignore_token(parser); return false; } @@ -2607,6 +2779,9 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) { GumboNode* form = insert_element_of_tag_type( parser, GUMBO_TAG_FORM, GUMBO_INSERTION_FROM_ISINDEX); + if (!has_open_element(parser, GUMBO_TAG_TEMPLATE)) { + parser->_parser_state->_form_element = form; + } if (action_attr) { gumbo_vector_add(parser, action_attr, &form->v.element.attributes); } @@ -2670,6 +2845,9 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) { parser, GUMBO_TAG_HR, GUMBO_INSERTION_FROM_ISINDEX); pop_current_node(parser); //
pop_current_node(parser); // + if (!has_open_element(parser, GUMBO_TAG_TEMPLATE)) { + parser->_parser_state->_form_element = NULL; + } return false; } else if (tag_is(token, kStartTag, GUMBO_TAG_TEXTAREA)) { run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RCDATA); @@ -2887,7 +3065,8 @@ static bool handle_in_table(GumboParser* parser, GumboToken* token) { parser_add_parse_error(parser, token); ignore_token(parser); return false; - } else if (tag_in(token, kStartTag, (gumbo_tagset) { TAG(STYLE), TAG(SCRIPT) })) { + } else if (tag_in(token, kStartTag, (gumbo_tagset) { TAG(STYLE), TAG(SCRIPT) }) || + tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE)) { return handle_in_head(parser, token); } else if (tag_is(token, kStartTag, GUMBO_TAG_INPUT) && attribute_matches(&token->v.start_tag.attributes, @@ -2898,7 +3077,7 @@ static bool handle_in_table(GumboParser* parser, GumboToken* token) { return false; } else if (tag_is(token, kStartTag, GUMBO_TAG_FORM)) { parser_add_parse_error(parser, token); - if (state->_form_element) { + if (state->_form_element || has_open_element(parser, GUMBO_TAG_TEMPLATE)) { ignore_token(parser); return false; } @@ -3015,6 +3194,9 @@ static bool handle_in_column_group(GumboParser* parser, GumboToken* token) { parser_add_parse_error(parser, token); ignore_token(parser); return false; + } else if (tag_is(token, kStartTag, GUMBO_TAG_TEMPLATE) || + tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE)) { + return handle_in_head(parser, token); } else if (token->type == GUMBO_TOKEN_EOF && get_current_node(parser) == parser->_output->root) { return true; @@ -3023,7 +3205,7 @@ static bool handle_in_column_group(GumboParser* parser, GumboToken* token) { parser_add_parse_error(parser, token); return false; } - assert(node_html_tag_is(get_current_node(parser), GUMBO_TAG_COLGROUP)); + assert(node_tag_in_set(get_current_node(parser), (gumbo_tagset) { TAG(COLGROUP), TAG(TEMPLATE) })); pop_current_node(parser); set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE); if (!tag_is(token, kEndTag, GUMBO_TAG_COLGROUP)) { @@ -3253,7 +3435,8 @@ static bool handle_in_select(GumboParser* parser, GumboToken* token) { parser->_parser_state->_reprocess_current_token = true; } return false; - } else if (tag_is(token, kStartTag, GUMBO_TAG_SCRIPT)) { + } else if (tag_in(token, kStartTag, (gumbo_tagset) { TAG(SCRIPT) , TAG(TEMPLATE) }) || + tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE)) { return handle_in_head(parser, token); } else if (token->type == GUMBO_TOKEN_EOF) { if (get_current_node(parser) != parser->_output->root) { @@ -3294,8 +3477,71 @@ static bool handle_in_select_in_table(GumboParser* parser, GumboToken* token) { // http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#parsing-main-intemplate static bool handle_in_template(GumboParser* parser, GumboToken* token) { - // TODO(jdtang): Implement this. - return true; + GumboParserState* state = parser->_parser_state; + if (token->type == GUMBO_TOKEN_WHITESPACE || + token->type == GUMBO_TOKEN_CHARACTER || + token->type == GUMBO_TOKEN_COMMENT || + token->type == GUMBO_TOKEN_DOCTYPE) { + return handle_in_body(parser, token); + } else if (tag_in(token, kStartTag, (gumbo_tagset) { TAG(BASE), TAG(BASEFONT), + TAG(LINK), TAG(META), TAG(NOFRAMES), TAG(SCRIPT), TAG(STYLE), + TAG(TEMPLATE), TAG(TITLE) }) || + tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE)) { + return handle_in_head(parser, token); + } else if (tag_in(token, kStartTag, (gumbo_tagset) { TAG(CAPTION), TAG(COLGROUP), + TAG(TBODY), TAG(TFOOT), TAG(THEAD) })) { + pop_template_insertion_mode(parser); + push_template_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE); + set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE); + state->_reprocess_current_token = true; + return true; + } else if (tag_is(token, kStartTag, GUMBO_TAG_COL)) { + pop_template_insertion_mode(parser); + push_template_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_COLUMN_GROUP); + set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_COLUMN_GROUP); + state->_reprocess_current_token = true; + return true; + } else if (tag_is(token, kStartTag, GUMBO_TAG_TR)) { + pop_template_insertion_mode(parser); + push_template_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY); + set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY); + state->_reprocess_current_token = true; + return true; + } else if (tag_in(token, kEndTag, (gumbo_tagset) { TAG(TD), TAG(TH) })) { + pop_template_insertion_mode(parser); + push_template_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_ROW); + set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_ROW); + state->_reprocess_current_token = true; + return true; + } else if (token->type == GUMBO_TOKEN_START_TAG) { + pop_template_insertion_mode(parser); + push_template_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_BODY); + set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_BODY); + state->_reprocess_current_token = true; + return true; + } else if (token->type == GUMBO_TOKEN_END_TAG) { + parser_add_parse_error(parser, token); + ignore_token(parser); + return false; + } else if (token->type == GUMBO_TOKEN_EOF) { + if (!has_open_element(parser, GUMBO_TAG_TEMPLATE)) { + // Stop parsing. + return true; + } + parser_add_parse_error(parser, token); + for (GumboNode* popped = pop_current_node(parser); + popped->v.element.tag_namespace != GUMBO_NAMESPACE_HTML || + !node_html_tag_is(popped, GUMBO_TAG_TEMPLATE); + popped = pop_current_node(parser)); + clear_active_formatting_elements(parser); + pop_template_insertion_mode(parser); + reset_insertion_mode_appropriately(parser); + state->_reprocess_current_token = true; + return false; + } else { + assert(0); + return false; + } } // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-afterbody @@ -3631,7 +3877,9 @@ static bool handle_token(GumboParser* parser, GumboToken* token) { } const GumboNode* current_node = get_current_node(parser); - assert(!current_node || current_node->type == GUMBO_NODE_ELEMENT); + assert(!current_node || + current_node->type == GUMBO_NODE_ELEMENT || + current_node->type == GUMBO_NODE_TEMPLATE); if (current_node) { gumbo_debug("Current node: <%s>.\n", gumbo_normalized_tagname(current_node->v.element.tag)); diff --git a/src/vector.c b/src/vector.c index f6b7d88d..91867a77 100644 --- a/src/vector.c +++ b/src/vector.c @@ -81,7 +81,7 @@ void* gumbo_vector_pop( return vector->data[--vector->length]; } -int gumbo_vector_index_of(GumboVector* vector, void* element) { +int gumbo_vector_index_of(GumboVector* vector, const void* element) { for (int i = 0; i < vector->length; ++i) { if (vector->data[i] == element) { return i; From d8f369d5dec63481f81fb71aa749bc27e31de008 Mon Sep 17 00:00:00 2001 From: Kevin Hendricks Date: Sat, 14 Feb 2015 15:17:05 -0500 Subject: [PATCH 14/38] Update python interface for template changes --- python/gumbo/gumboc.py | 5 +++-- python/gumbo/html5lib_adapter.py | 4 ++-- python/gumbo/html5lib_adapter_test.py | 22 ++++++++++++++++++++++ python/gumbo/soup_adapter.py | 1 + 4 files changed, 28 insertions(+), 4 deletions(-) diff --git a/python/gumbo/gumboc.py b/python/gumbo/gumboc.py index d377d58c..db6a94b6 100644 --- a/python/gumbo/gumboc.py +++ b/python/gumbo/gumboc.py @@ -444,7 +444,8 @@ def __repr__(self): class NodeType(Enum): - _values_ = ['DOCUMENT', 'ELEMENT', 'TEXT', 'CDATA', 'COMMENT', 'WHITESPACE'] + _values_ = ['DOCUMENT', 'ELEMENT', 'TEXT', 'CDATA', + 'COMMENT', 'WHITESPACE', 'TEMPLATE'] class NodeUnion(ctypes.Union): @@ -463,7 +464,7 @@ def _contents(self): # __getattr__, so we factor it out to a helper. if self.type == NodeType.DOCUMENT: return self.v.document - elif self.type == NodeType.ELEMENT: + elif self.type in (NodeType.ELEMENT, NodeType.TEMPLATE): return self.v.element else: return self.v.text diff --git a/python/gumbo/html5lib_adapter.py b/python/gumbo/html5lib_adapter.py index 2a968640..7615814a 100644 --- a/python/gumbo/html5lib_adapter.py +++ b/python/gumbo/html5lib_adapter.py @@ -58,7 +58,7 @@ def maybe_namespace(attr): def _convert_element(source_node): - if source_node.type != gumboc.NodeType.ELEMENT: + if source_node.type not in ( gumboc.NodeType.ELEMENT, gumboc.NodeType.TEMPLATE): # If-statement instead of assert so it runs with -O raise AssertionError( '_convert_element only works with elements; found %r' % @@ -110,7 +110,7 @@ def parse(self, text_or_file, **kwargs): if node.type == gumboc.NodeType.COMMENT: self.tree.insertComment({'data': node.v.text.text.decode('utf-8')}, self.tree.document) - elif node.type == gumboc.NodeType.ELEMENT: + elif node.type in (gumboc.NodeType.ELEMENT, gumboc.NodeType.TEMPLATE): _insert_root(self.tree, output.contents.root.contents) else: assert 'Only comments and nodes allowed at the root' diff --git a/python/gumbo/html5lib_adapter_test.py b/python/gumbo/html5lib_adapter_test.py index 2ab8c619..b1d8bc81 100644 --- a/python/gumbo/html5lib_adapter_test.py +++ b/python/gumbo/html5lib_adapter_test.py @@ -91,6 +91,22 @@ def convertExpected(data, stripChars): rv.append(line) return "\n".join(rv) +def reformatTemplateContents(expected): + lines = expected.split('\n') + retval = [] + template_indents = [] + for line in lines: + indent = len(line) - len(line.strip()) + if 'content' in line: + template_indents.append(indent) + continue + elif template_indents and indent <= template_indents[-1]: + template_indents.pop() + elif template_indents: + line = line[2 * len(template_indents):] + retval.append(line) + return '\n'.join(retval) + class Html5libAdapterTest(unittest.TestCase): """Adapter between Gumbo and the html5lib tests. @@ -106,6 +122,7 @@ class Html5libAdapterTest(unittest.TestCase): def impl(self, inner_html, input, expected, errors): p = html5lib_adapter.HTMLParser( tree=TREEBUILDER(namespaceHTMLElements=True)) + if not inner_html: # TODO(jdtang): Need to implement fragment parsing. document = p.parse(StringIO.StringIO(input)) @@ -120,6 +137,11 @@ def impl(self, inner_html, input, expected, errors): expected = re.compile(r'^(\s*)<(\S+)>', re.M).sub( r'\1', convertExpected(expected, 2)) + # html5lib doesn't yet support the template tag, but it appears in the + # tests with the expectation that the template contents will be under the + # word 'contents', so we need to reformat that string a bit. + expected = reformatTemplateContents(expected) + error_msg = '\n'.join(['\n\nInput:', input, '\nExpected:', expected, '\nReceived:', output]) self.assertEquals(expected, output, diff --git a/python/gumbo/soup_adapter.py b/python/gumbo/soup_adapter.py index 089f8918..9bfaed66 100644 --- a/python/gumbo/soup_adapter.py +++ b/python/gumbo/soup_adapter.py @@ -80,6 +80,7 @@ def add_text_internal(soup, element): _add_text(BeautifulSoup.CData), _add_text(BeautifulSoup.Comment), _add_text(BeautifulSoup.NavigableString), + _add_element, ] From 975cfcf62c94cb8256010c5e9592cbb34d145e9a Mon Sep 17 00:00:00 2001 From: Kevin Hendricks Date: Sat, 14 Feb 2015 15:18:12 -0500 Subject: [PATCH 15/38] Add in template parser.cc tests and fixes for parser.c --- src/parser.c | 6 ++---- tests/parser.cc | 35 +++++++++++++++++++++++++++++++++++ 2 files changed, 37 insertions(+), 4 deletions(-) diff --git a/src/parser.c b/src/parser.c index 4434aaed..55091f56 100644 --- a/src/parser.c +++ b/src/parser.c @@ -1369,7 +1369,6 @@ static GumboQuirksModeEnum compute_quirks_mode( // from the rest of the document. static bool has_an_element_in_specific_scope(GumboParser* parser, gumbo_tagset expected, bool negate, const gumbo_tagset tags) { GumboVector* open_elements = &parser->_parser_state->_open_elements; - bool result = false; for (int i = open_elements->length; --i >= 0; ) { const GumboNode* node = open_elements->data[i]; if (node->type != GUMBO_NODE_ELEMENT && node->type != GUMBO_NODE_TEMPLATE) { @@ -1383,11 +1382,10 @@ static bool has_an_element_in_specific_scope(GumboParser* parser, gumbo_tagset e found_qualname = true; } if (negate != found_qualname) { - result = false; - return result; + return false; } } - return result; + return false; } // Checks for the presence of an open element of the specified tag type. diff --git a/tests/parser.cc b/tests/parser.cc index 590f549a..6f00bc7f 100644 --- a/tests/parser.cc +++ b/tests/parser.cc @@ -1491,6 +1491,10 @@ TEST_F(GumboParserTest, AdoptionAgency2) { EXPECT_STREQ("3", text3->v.text.text); } +TEST_F(GumboParserTest, AdoptionAgency3) { + Parse("
"); +} + TEST_F(GumboParserTest, ImplicitlyCloseLists) { Parse("
    \n" "
  • First\n" @@ -1853,4 +1857,35 @@ TEST_F(GumboParserTest, TdInMathml) { ASSERT_EQ(0, GetChildCount(td)); } +TEST_F(GumboParserTest, TestTemplateInForeignContent) { + Parse("