From 87a41da8cda294e401e572f79c92bf62800b3a99 Mon Sep 17 00:00:00 2001 From: Ivan Nikulin Date: Mon, 5 Aug 2013 13:43:32 +0400 Subject: [PATCH] Readme updated. Parser refactored. Version updated to 0.6.0 --- README.md | 77 ++++++++- lib/parser.js | 455 +++++++++++++++++++++++++------------------------- package.json | 2 +- 3 files changed, 304 insertions(+), 230 deletions(-) diff --git a/README.md b/README.md index 8611433fb..a28dbea35 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,77 @@ parse5 ====== +Fast full-featured HTML parser for Node. Based on WHATWG HTML5 specification. +To build [TestCafé](http://testcafe.devexpress.com/) we needed a fast, production-ready HTML parser for node.js that parses HTML the way a modern browser's parser does. +Existing solutions were either too slow or their output was too inaccurate. So, this is how parse5 was born. -Fast full-featured HTML parser for Node. Based on WHATWG HTML5 specification. -Still in BETA. +Install +------- +``` +$ npm install parse5 +``` -Basic functionality is accomplished and tested, but requires some more -optimizations. API IS UNSTABLE. +Usage and API +------------- +```js +var Parser = require('parse5').Parser; + +//Instantiate parser +var parser = new Parser(); + +//Then feed it with an HTML document +var document = parser.parse('Hi there!') + +//Now let's parse HTML-snippet +var fragment = parser.parseFragment('Parse5 is fucking awesome!

42

'); + +``` + +Is it fast? +----------- +Check out [this benchmark](https://github.com/inikulin/node-html-parser-bench). + +``` +Starting benchmark. Fasten your seatbelts... +html5 (https://github.com/aredridel/html5) x 0.18 ops/sec ±5.92% (5 runs sampled) +htmlparser (https://github.com/tautologistics/node-htmlparser/) x 3.83 ops/sec ±42.43% (14 runs sampled) +htmlparser2 (https://github.com/fb55/htmlparser2) x 4.05 ops/sec ±39.27% (15 runs sampled) +parse5 (https://github.com/inikulin/parse5) x 3.04 ops/sec ±51.81% (13 runs sampled) +Fastest is htmlparser2 (https://github.com/fb55/htmlparser2),parse5 (https://github.com/inikulin/parse5) +``` + +So, parse5 is as fast as the simple, specification-incompatible parsers and ~15 times(!) faster than the existing specification-compatible parser available for Node. + +Testing +------- +Test data is adapted from the [html5lib project](https://github.com/html5lib). The parser is covered by more than 8000 test cases. +To run tests: +``` +$ node test/run_tests.js +``` + +Custom tree adapter +------------------- +You can create a custom tree adapter so parse5 can work with your own DOM-tree implementation. +Just pass your adapter implementation to the parser's constructor as an argument: + +```js +var Parser = require('parse5').Parser; + +var myTreeAdapter = { + //Adapter methods... +}; + +//Instantiate parser +var parser = new Parser(myTreeAdapter); +``` + +A sample implementation can be found [here](https://github.com/inikulin/parse5/blob/master/lib/default_tree_adapter.js). +The custom tree adapter should implement all methods exposed via `exports` in the sample implementation. + +Questions or suggestions? +------------------------- +If you have any questions, please feel free to create an issue [here on GitHub](https://github.com/inikulin/parse5/issues). 
+ +Author +------ +[Ivan Nikulin](https://github.com/inikulin) (ifaaan@gmail.com) diff --git a/lib/parser.js b/lib/parser.js index a8f30ddc1..313974dae 100644 --- a/lib/parser.js +++ b/lib/parser.js @@ -416,228 +416,217 @@ INSERTION_MODE_RESET_MAP[$.FRAMESET] = IN_FRAMESET_MODE; INSERTION_MODE_RESET_MAP[$.HTML] = BEFORE_HEAD_MODE; //Token handlers map for insertion modes -var IM_ = {}; - -IM_[INITIAL_MODE] = {}; -IM_[INITIAL_MODE][Tokenizer.CHARACTER_TOKEN] = -IM_[INITIAL_MODE][Tokenizer.NULL_CHARACTER_TOKEN] = tokenInInitialMode; -IM_[INITIAL_MODE][Tokenizer.WHITESPACE_CHARACTER_TOKEN] = ignoreToken; -IM_[INITIAL_MODE][Tokenizer.COMMENT_TOKEN] = appendComment; -IM_[INITIAL_MODE][Tokenizer.DOCTYPE_TOKEN] = doctypeInInitialMode; -IM_[INITIAL_MODE][Tokenizer.START_TAG_TOKEN] = -IM_[INITIAL_MODE][Tokenizer.END_TAG_TOKEN] = -IM_[INITIAL_MODE][Tokenizer.EOF_TOKEN] = tokenInInitialMode; - -IM_[BEFORE_HTML_MODE] = {}; -IM_[BEFORE_HTML_MODE][Tokenizer.CHARACTER_TOKEN] = -IM_[BEFORE_HTML_MODE][Tokenizer.NULL_CHARACTER_TOKEN] = tokenBeforeHtml; -IM_[BEFORE_HTML_MODE][Tokenizer.WHITESPACE_CHARACTER_TOKEN] = ignoreToken; -IM_[BEFORE_HTML_MODE][Tokenizer.COMMENT_TOKEN] = appendComment; -IM_[BEFORE_HTML_MODE][Tokenizer.DOCTYPE_TOKEN] = ignoreToken; -IM_[BEFORE_HTML_MODE][Tokenizer.START_TAG_TOKEN] = startTagBeforeHtml; -IM_[BEFORE_HTML_MODE][Tokenizer.END_TAG_TOKEN] = endTagBeforeHtml; -IM_[BEFORE_HTML_MODE][Tokenizer.EOF_TOKEN] = tokenBeforeHtml; - -IM_[BEFORE_HEAD_MODE] = {}; -IM_[BEFORE_HEAD_MODE][Tokenizer.CHARACTER_TOKEN] = -IM_[BEFORE_HEAD_MODE][Tokenizer.NULL_CHARACTER_TOKEN] = tokenBeforeHead; -IM_[BEFORE_HEAD_MODE][Tokenizer.WHITESPACE_CHARACTER_TOKEN] = ignoreToken; -IM_[BEFORE_HEAD_MODE][Tokenizer.COMMENT_TOKEN] = appendComment; -IM_[BEFORE_HEAD_MODE][Tokenizer.DOCTYPE_TOKEN] = ignoreToken; -IM_[BEFORE_HEAD_MODE][Tokenizer.START_TAG_TOKEN] = startTagBeforeHead; -IM_[BEFORE_HEAD_MODE][Tokenizer.END_TAG_TOKEN] = endTagBeforeHead; 
-IM_[BEFORE_HEAD_MODE][Tokenizer.EOF_TOKEN] = tokenBeforeHead; - -IM_[IN_HEAD_MODE] = {}; -IM_[IN_HEAD_MODE][Tokenizer.CHARACTER_TOKEN] = -IM_[IN_HEAD_MODE][Tokenizer.NULL_CHARACTER_TOKEN] = tokenInHead; -IM_[IN_HEAD_MODE][Tokenizer.WHITESPACE_CHARACTER_TOKEN] = insertCharacters; -IM_[IN_HEAD_MODE][Tokenizer.COMMENT_TOKEN] = appendComment; -IM_[IN_HEAD_MODE][Tokenizer.DOCTYPE_TOKEN] = ignoreToken; -IM_[IN_HEAD_MODE][Tokenizer.START_TAG_TOKEN] = startTagInHead; -IM_[IN_HEAD_MODE][Tokenizer.END_TAG_TOKEN] = endTagInHead; -IM_[IN_HEAD_MODE][Tokenizer.EOF_TOKEN] = tokenInHead; - -IM_[AFTER_HEAD_MODE] = {}; -IM_[AFTER_HEAD_MODE][Tokenizer.CHARACTER_TOKEN] = -IM_[AFTER_HEAD_MODE][Tokenizer.NULL_CHARACTER_TOKEN] = tokenAfterHead; -IM_[AFTER_HEAD_MODE][Tokenizer.WHITESPACE_CHARACTER_TOKEN] = insertCharacters; -IM_[AFTER_HEAD_MODE][Tokenizer.COMMENT_TOKEN] = appendComment; -IM_[AFTER_HEAD_MODE][Tokenizer.DOCTYPE_TOKEN] = ignoreToken; -IM_[AFTER_HEAD_MODE][Tokenizer.START_TAG_TOKEN] = startTagAfterHead; -IM_[AFTER_HEAD_MODE][Tokenizer.END_TAG_TOKEN] = endTagAfterHead; -IM_[AFTER_HEAD_MODE][Tokenizer.EOF_TOKEN] = tokenAfterHead; - -IM_[IN_BODY_MODE] = {}; -IM_[IN_BODY_MODE][Tokenizer.CHARACTER_TOKEN] = characterInBody; -IM_[IN_BODY_MODE][Tokenizer.NULL_CHARACTER_TOKEN] = ignoreToken; -IM_[IN_BODY_MODE][Tokenizer.WHITESPACE_CHARACTER_TOKEN] = whitespaceCharacterInBody; -IM_[IN_BODY_MODE][Tokenizer.COMMENT_TOKEN] = appendComment; -IM_[IN_BODY_MODE][Tokenizer.DOCTYPE_TOKEN] = ignoreToken; -IM_[IN_BODY_MODE][Tokenizer.START_TAG_TOKEN] = startTagInBody; -IM_[IN_BODY_MODE][Tokenizer.END_TAG_TOKEN] = endTagInBody; -IM_[IN_BODY_MODE][Tokenizer.EOF_TOKEN] = stopParsing; - -IM_[TEXT_MODE] = {}; -IM_[TEXT_MODE][Tokenizer.CHARACTER_TOKEN] = -IM_[TEXT_MODE][Tokenizer.NULL_CHARACTER_TOKEN] = -IM_[TEXT_MODE][Tokenizer.WHITESPACE_CHARACTER_TOKEN] = insertCharacters; -IM_[TEXT_MODE][Tokenizer.COMMENT_TOKEN] = -IM_[TEXT_MODE][Tokenizer.DOCTYPE_TOKEN] = 
-IM_[TEXT_MODE][Tokenizer.START_TAG_TOKEN] = ignoreToken; -IM_[TEXT_MODE][Tokenizer.END_TAG_TOKEN] = endTagInText; -IM_[TEXT_MODE][Tokenizer.EOF_TOKEN] = eofInText; - -IM_[IN_TABLE_MODE] = {}; -IM_[IN_TABLE_MODE][Tokenizer.CHARACTER_TOKEN] = -IM_[IN_TABLE_MODE][Tokenizer.NULL_CHARACTER_TOKEN] = -IM_[IN_TABLE_MODE][Tokenizer.WHITESPACE_CHARACTER_TOKEN] = characterInTable; -IM_[IN_TABLE_MODE][Tokenizer.COMMENT_TOKEN] = appendComment; -IM_[IN_TABLE_MODE][Tokenizer.DOCTYPE_TOKEN] = ignoreToken; -IM_[IN_TABLE_MODE][Tokenizer.START_TAG_TOKEN] = startTagInTable; -IM_[IN_TABLE_MODE][Tokenizer.END_TAG_TOKEN] = endTagInTable; -IM_[IN_TABLE_MODE][Tokenizer.EOF_TOKEN] = stopParsing; - -IM_[IN_TABLE_TEXT_MODE] = {}; -IM_[IN_TABLE_TEXT_MODE][Tokenizer.CHARACTER_TOKEN] = characterInTableText; -IM_[IN_TABLE_TEXT_MODE][Tokenizer.NULL_CHARACTER_TOKEN] = ignoreToken; -IM_[IN_TABLE_TEXT_MODE][Tokenizer.WHITESPACE_CHARACTER_TOKEN] = whitespaceCharacterInTableText; -IM_[IN_TABLE_TEXT_MODE][Tokenizer.COMMENT_TOKEN] = -IM_[IN_TABLE_TEXT_MODE][Tokenizer.DOCTYPE_TOKEN] = -IM_[IN_TABLE_TEXT_MODE][Tokenizer.START_TAG_TOKEN] = -IM_[IN_TABLE_TEXT_MODE][Tokenizer.END_TAG_TOKEN] = -IM_[IN_TABLE_TEXT_MODE][Tokenizer.EOF_TOKEN] = tokenInTableText; - -IM_[IN_CAPTION_MODE] = {}; -IM_[IN_CAPTION_MODE][Tokenizer.CHARACTER_TOKEN] = characterInBody; -IM_[IN_CAPTION_MODE][Tokenizer.NULL_CHARACTER_TOKEN] = ignoreToken; -IM_[IN_CAPTION_MODE][Tokenizer.WHITESPACE_CHARACTER_TOKEN] = whitespaceCharacterInBody; -IM_[IN_CAPTION_MODE][Tokenizer.COMMENT_TOKEN] = appendComment; -IM_[IN_CAPTION_MODE][Tokenizer.DOCTYPE_TOKEN] = ignoreToken; -IM_[IN_CAPTION_MODE][Tokenizer.START_TAG_TOKEN] = startTagInCaption; -IM_[IN_CAPTION_MODE][Tokenizer.END_TAG_TOKEN] = endTagInCaption; -IM_[IN_CAPTION_MODE][Tokenizer.EOF_TOKEN] = stopParsing; - -IM_[IN_COLUMN_GROUP_MODE] = {}; -IM_[IN_COLUMN_GROUP_MODE][Tokenizer.CHARACTER_TOKEN] = -IM_[IN_COLUMN_GROUP_MODE][Tokenizer.NULL_CHARACTER_TOKEN] = tokenInColumnGroup; 
-IM_[IN_COLUMN_GROUP_MODE][Tokenizer.WHITESPACE_CHARACTER_TOKEN] = insertCharacters; -IM_[IN_COLUMN_GROUP_MODE][Tokenizer.COMMENT_TOKEN] = appendComment; -IM_[IN_COLUMN_GROUP_MODE][Tokenizer.DOCTYPE_TOKEN] = ignoreToken; -IM_[IN_COLUMN_GROUP_MODE][Tokenizer.START_TAG_TOKEN] = startTagInColumnGroup; -IM_[IN_COLUMN_GROUP_MODE][Tokenizer.END_TAG_TOKEN] = endTagInColumnGroup; -IM_[IN_COLUMN_GROUP_MODE][Tokenizer.EOF_TOKEN] = stopParsing; - -IM_[IN_TABLE_BODY_MODE] = {}; -IM_[IN_TABLE_BODY_MODE][Tokenizer.CHARACTER_TOKEN] = -IM_[IN_TABLE_BODY_MODE][Tokenizer.NULL_CHARACTER_TOKEN] = -IM_[IN_TABLE_BODY_MODE][Tokenizer.WHITESPACE_CHARACTER_TOKEN] = characterInTable; -IM_[IN_TABLE_BODY_MODE][Tokenizer.COMMENT_TOKEN] = appendComment; -IM_[IN_TABLE_BODY_MODE][Tokenizer.DOCTYPE_TOKEN] = ignoreToken; -IM_[IN_TABLE_BODY_MODE][Tokenizer.START_TAG_TOKEN] = startTagInTableBody; -IM_[IN_TABLE_BODY_MODE][Tokenizer.END_TAG_TOKEN] = endTagInTableBody; -IM_[IN_TABLE_BODY_MODE][Tokenizer.EOF_TOKEN] = stopParsing; - -IM_[IN_ROW_MODE] = {}; -IM_[IN_ROW_MODE][Tokenizer.CHARACTER_TOKEN] = -IM_[IN_ROW_MODE][Tokenizer.NULL_CHARACTER_TOKEN] = -IM_[IN_ROW_MODE][Tokenizer.WHITESPACE_CHARACTER_TOKEN] = characterInTable; -IM_[IN_ROW_MODE][Tokenizer.COMMENT_TOKEN] = appendComment; -IM_[IN_ROW_MODE][Tokenizer.DOCTYPE_TOKEN] = ignoreToken; -IM_[IN_ROW_MODE][Tokenizer.START_TAG_TOKEN] = startTagInRow; -IM_[IN_ROW_MODE][Tokenizer.END_TAG_TOKEN] = endTagInRow; -IM_[IN_ROW_MODE][Tokenizer.EOF_TOKEN] = stopParsing; - -IM_[IN_CELL_MODE] = {}; -IM_[IN_CELL_MODE][Tokenizer.CHARACTER_TOKEN] = characterInBody; -IM_[IN_CELL_MODE][Tokenizer.NULL_CHARACTER_TOKEN] = ignoreToken; -IM_[IN_CELL_MODE][Tokenizer.WHITESPACE_CHARACTER_TOKEN] = whitespaceCharacterInBody; -IM_[IN_CELL_MODE][Tokenizer.COMMENT_TOKEN] = appendComment; -IM_[IN_CELL_MODE][Tokenizer.DOCTYPE_TOKEN] = ignoreToken; -IM_[IN_CELL_MODE][Tokenizer.START_TAG_TOKEN] = startTagInCell; -IM_[IN_CELL_MODE][Tokenizer.END_TAG_TOKEN] = endTagInCell; 
-IM_[IN_CELL_MODE][Tokenizer.EOF_TOKEN] = stopParsing; - -IM_[IN_SELECT_MODE] = {}; -IM_[IN_SELECT_MODE][Tokenizer.CHARACTER_TOKEN] = insertCharacters; -IM_[IN_SELECT_MODE][Tokenizer.NULL_CHARACTER_TOKEN] = ignoreToken; -IM_[IN_SELECT_MODE][Tokenizer.WHITESPACE_CHARACTER_TOKEN] = insertCharacters; -IM_[IN_SELECT_MODE][Tokenizer.COMMENT_TOKEN] = appendComment; -IM_[IN_SELECT_MODE][Tokenizer.DOCTYPE_TOKEN] = ignoreToken; -IM_[IN_SELECT_MODE][Tokenizer.START_TAG_TOKEN] = startTagInSelect; -IM_[IN_SELECT_MODE][Tokenizer.END_TAG_TOKEN] = endTagInSelect; -IM_[IN_SELECT_MODE][Tokenizer.EOF_TOKEN] = stopParsing; - -IM_[IN_SELECT_IN_TABLE_MODE] = {}; -IM_[IN_SELECT_IN_TABLE_MODE][Tokenizer.CHARACTER_TOKEN] = insertCharacters; -IM_[IN_SELECT_IN_TABLE_MODE][Tokenizer.NULL_CHARACTER_TOKEN] = ignoreToken; -IM_[IN_SELECT_IN_TABLE_MODE][Tokenizer.WHITESPACE_CHARACTER_TOKEN] = insertCharacters; -IM_[IN_SELECT_IN_TABLE_MODE][Tokenizer.COMMENT_TOKEN] = appendComment; -IM_[IN_SELECT_IN_TABLE_MODE][Tokenizer.DOCTYPE_TOKEN] = ignoreToken; -IM_[IN_SELECT_IN_TABLE_MODE][Tokenizer.START_TAG_TOKEN] = startTagInSelectInTable; -IM_[IN_SELECT_IN_TABLE_MODE][Tokenizer.END_TAG_TOKEN] = endTagInSelectInTable; -IM_[IN_SELECT_IN_TABLE_MODE][Tokenizer.EOF_TOKEN] = stopParsing; - -IM_[AFTER_BODY_MODE] = {}; -IM_[AFTER_BODY_MODE][Tokenizer.CHARACTER_TOKEN] = -IM_[AFTER_BODY_MODE][Tokenizer.NULL_CHARACTER_TOKEN] = tokenAfterBody; -IM_[AFTER_BODY_MODE][Tokenizer.WHITESPACE_CHARACTER_TOKEN] = whitespaceCharacterInBody; -IM_[AFTER_BODY_MODE][Tokenizer.COMMENT_TOKEN] = appendCommentToRootHtmlElement; -IM_[AFTER_BODY_MODE][Tokenizer.DOCTYPE_TOKEN] = ignoreToken; -IM_[AFTER_BODY_MODE][Tokenizer.START_TAG_TOKEN] = startTagAfterBody; -IM_[AFTER_BODY_MODE][Tokenizer.END_TAG_TOKEN] = endTagAfterBody; -IM_[AFTER_BODY_MODE][Tokenizer.EOF_TOKEN] = stopParsing; - -IM_[IN_FRAMESET_MODE] = {}; -IM_[IN_FRAMESET_MODE][Tokenizer.CHARACTER_TOKEN] = -IM_[IN_FRAMESET_MODE][Tokenizer.NULL_CHARACTER_TOKEN] = ignoreToken; 
-IM_[IN_FRAMESET_MODE][Tokenizer.WHITESPACE_CHARACTER_TOKEN] = insertCharacters; -IM_[IN_FRAMESET_MODE][Tokenizer.COMMENT_TOKEN] = appendComment; -IM_[IN_FRAMESET_MODE][Tokenizer.DOCTYPE_TOKEN] = ignoreToken; -IM_[IN_FRAMESET_MODE][Tokenizer.START_TAG_TOKEN] = startTagInFrameset; -IM_[IN_FRAMESET_MODE][Tokenizer.END_TAG_TOKEN] = endTagInFrameset; -IM_[IN_FRAMESET_MODE][Tokenizer.EOF_TOKEN] = stopParsing; - -IM_[AFTER_FRAMESET_MODE] = {}; -IM_[AFTER_FRAMESET_MODE][Tokenizer.CHARACTER_TOKEN] = -IM_[AFTER_FRAMESET_MODE][Tokenizer.NULL_CHARACTER_TOKEN] = ignoreToken; -IM_[AFTER_FRAMESET_MODE][Tokenizer.WHITESPACE_CHARACTER_TOKEN] = insertCharacters; -IM_[AFTER_FRAMESET_MODE][Tokenizer.COMMENT_TOKEN] = appendComment; -IM_[AFTER_FRAMESET_MODE][Tokenizer.DOCTYPE_TOKEN] = ignoreToken; -IM_[AFTER_FRAMESET_MODE][Tokenizer.START_TAG_TOKEN] = startTagAfterFrameset; -IM_[AFTER_FRAMESET_MODE][Tokenizer.END_TAG_TOKEN] = endTagAfterFrameset; -IM_[AFTER_FRAMESET_MODE][Tokenizer.EOF_TOKEN] = stopParsing; - -IM_[AFTER_AFTER_BODY_MODE] = {}; -IM_[AFTER_AFTER_BODY_MODE][Tokenizer.CHARACTER_TOKEN] = tokenAfterAfterBody; -IM_[AFTER_AFTER_BODY_MODE][Tokenizer.NULL_CHARACTER_TOKEN] = tokenAfterAfterBody; -IM_[AFTER_AFTER_BODY_MODE][Tokenizer.WHITESPACE_CHARACTER_TOKEN] = whitespaceCharacterInBody; -IM_[AFTER_AFTER_BODY_MODE][Tokenizer.COMMENT_TOKEN] = appendCommentToDocument; -IM_[AFTER_AFTER_BODY_MODE][Tokenizer.DOCTYPE_TOKEN] = ignoreToken; -IM_[AFTER_AFTER_BODY_MODE][Tokenizer.START_TAG_TOKEN] = startTagAfterAfterBody; -IM_[AFTER_AFTER_BODY_MODE][Tokenizer.END_TAG_TOKEN] = tokenAfterAfterBody; -IM_[AFTER_AFTER_BODY_MODE][Tokenizer.EOF_TOKEN] = stopParsing; - -IM_[AFTER_AFTER_FRAMESET_MODE] = {}; -IM_[AFTER_AFTER_FRAMESET_MODE][Tokenizer.CHARACTER_TOKEN] = -IM_[AFTER_AFTER_FRAMESET_MODE][Tokenizer.NULL_CHARACTER_TOKEN] = ignoreToken; -IM_[AFTER_AFTER_FRAMESET_MODE][Tokenizer.WHITESPACE_CHARACTER_TOKEN] = whitespaceCharacterInBody; -IM_[AFTER_AFTER_FRAMESET_MODE][Tokenizer.COMMENT_TOKEN] 
= appendCommentToDocument; -IM_[AFTER_AFTER_FRAMESET_MODE][Tokenizer.DOCTYPE_TOKEN] = ignoreToken; -IM_[AFTER_AFTER_FRAMESET_MODE][Tokenizer.START_TAG_TOKEN] = startTagAfterAfterFrameset; -IM_[AFTER_AFTER_FRAMESET_MODE][Tokenizer.END_TAG_TOKEN] = ignoreToken; -IM_[AFTER_AFTER_FRAMESET_MODE][Tokenizer.EOF_TOKEN] = stopParsing; - -//Token handler for the foreign content -var FC_ = {}; - -FC_[Tokenizer.CHARACTER_TOKEN] = characterInForeignContent; -FC_[Tokenizer.NULL_CHARACTER_TOKEN] = nullCharacterInForeignContent; -FC_[Tokenizer.WHITESPACE_CHARACTER_TOKEN] = insertCharacters; -FC_[Tokenizer.COMMENT_TOKEN] = appendComment; -FC_[Tokenizer.DOCTYPE_TOKEN] = ignoreToken; -FC_[Tokenizer.START_TAG_TOKEN] = startTagInForeignContent; -FC_[Tokenizer.END_TAG_TOKEN] = endTagInForeignContent; +var _ = {}; + +_[INITIAL_MODE] = {}; +_[INITIAL_MODE][Tokenizer.CHARACTER_TOKEN] = +_[INITIAL_MODE][Tokenizer.NULL_CHARACTER_TOKEN] = tokenInInitialMode; +_[INITIAL_MODE][Tokenizer.WHITESPACE_CHARACTER_TOKEN] = ignoreToken; +_[INITIAL_MODE][Tokenizer.COMMENT_TOKEN] = appendComment; +_[INITIAL_MODE][Tokenizer.DOCTYPE_TOKEN] = doctypeInInitialMode; +_[INITIAL_MODE][Tokenizer.START_TAG_TOKEN] = +_[INITIAL_MODE][Tokenizer.END_TAG_TOKEN] = +_[INITIAL_MODE][Tokenizer.EOF_TOKEN] = tokenInInitialMode; + +_[BEFORE_HTML_MODE] = {}; +_[BEFORE_HTML_MODE][Tokenizer.CHARACTER_TOKEN] = +_[BEFORE_HTML_MODE][Tokenizer.NULL_CHARACTER_TOKEN] = tokenBeforeHtml; +_[BEFORE_HTML_MODE][Tokenizer.WHITESPACE_CHARACTER_TOKEN] = ignoreToken; +_[BEFORE_HTML_MODE][Tokenizer.COMMENT_TOKEN] = appendComment; +_[BEFORE_HTML_MODE][Tokenizer.DOCTYPE_TOKEN] = ignoreToken; +_[BEFORE_HTML_MODE][Tokenizer.START_TAG_TOKEN] = startTagBeforeHtml; +_[BEFORE_HTML_MODE][Tokenizer.END_TAG_TOKEN] = endTagBeforeHtml; +_[BEFORE_HTML_MODE][Tokenizer.EOF_TOKEN] = tokenBeforeHtml; + +_[BEFORE_HEAD_MODE] = {}; +_[BEFORE_HEAD_MODE][Tokenizer.CHARACTER_TOKEN] = +_[BEFORE_HEAD_MODE][Tokenizer.NULL_CHARACTER_TOKEN] = tokenBeforeHead; 
+_[BEFORE_HEAD_MODE][Tokenizer.WHITESPACE_CHARACTER_TOKEN] = ignoreToken; +_[BEFORE_HEAD_MODE][Tokenizer.COMMENT_TOKEN] = appendComment; +_[BEFORE_HEAD_MODE][Tokenizer.DOCTYPE_TOKEN] = ignoreToken; +_[BEFORE_HEAD_MODE][Tokenizer.START_TAG_TOKEN] = startTagBeforeHead; +_[BEFORE_HEAD_MODE][Tokenizer.END_TAG_TOKEN] = endTagBeforeHead; +_[BEFORE_HEAD_MODE][Tokenizer.EOF_TOKEN] = tokenBeforeHead; + +_[IN_HEAD_MODE] = {}; +_[IN_HEAD_MODE][Tokenizer.CHARACTER_TOKEN] = +_[IN_HEAD_MODE][Tokenizer.NULL_CHARACTER_TOKEN] = tokenInHead; +_[IN_HEAD_MODE][Tokenizer.WHITESPACE_CHARACTER_TOKEN] = insertCharacters; +_[IN_HEAD_MODE][Tokenizer.COMMENT_TOKEN] = appendComment; +_[IN_HEAD_MODE][Tokenizer.DOCTYPE_TOKEN] = ignoreToken; +_[IN_HEAD_MODE][Tokenizer.START_TAG_TOKEN] = startTagInHead; +_[IN_HEAD_MODE][Tokenizer.END_TAG_TOKEN] = endTagInHead; +_[IN_HEAD_MODE][Tokenizer.EOF_TOKEN] = tokenInHead; + +_[AFTER_HEAD_MODE] = {}; +_[AFTER_HEAD_MODE][Tokenizer.CHARACTER_TOKEN] = +_[AFTER_HEAD_MODE][Tokenizer.NULL_CHARACTER_TOKEN] = tokenAfterHead; +_[AFTER_HEAD_MODE][Tokenizer.WHITESPACE_CHARACTER_TOKEN] = insertCharacters; +_[AFTER_HEAD_MODE][Tokenizer.COMMENT_TOKEN] = appendComment; +_[AFTER_HEAD_MODE][Tokenizer.DOCTYPE_TOKEN] = ignoreToken; +_[AFTER_HEAD_MODE][Tokenizer.START_TAG_TOKEN] = startTagAfterHead; +_[AFTER_HEAD_MODE][Tokenizer.END_TAG_TOKEN] = endTagAfterHead; +_[AFTER_HEAD_MODE][Tokenizer.EOF_TOKEN] = tokenAfterHead; + +_[IN_BODY_MODE] = {}; +_[IN_BODY_MODE][Tokenizer.CHARACTER_TOKEN] = characterInBody; +_[IN_BODY_MODE][Tokenizer.NULL_CHARACTER_TOKEN] = ignoreToken; +_[IN_BODY_MODE][Tokenizer.WHITESPACE_CHARACTER_TOKEN] = whitespaceCharacterInBody; +_[IN_BODY_MODE][Tokenizer.COMMENT_TOKEN] = appendComment; +_[IN_BODY_MODE][Tokenizer.DOCTYPE_TOKEN] = ignoreToken; +_[IN_BODY_MODE][Tokenizer.START_TAG_TOKEN] = startTagInBody; +_[IN_BODY_MODE][Tokenizer.END_TAG_TOKEN] = endTagInBody; +_[IN_BODY_MODE][Tokenizer.EOF_TOKEN] = stopParsing; + +_[TEXT_MODE] = {}; 
+_[TEXT_MODE][Tokenizer.CHARACTER_TOKEN] = +_[TEXT_MODE][Tokenizer.NULL_CHARACTER_TOKEN] = +_[TEXT_MODE][Tokenizer.WHITESPACE_CHARACTER_TOKEN] = insertCharacters; +_[TEXT_MODE][Tokenizer.COMMENT_TOKEN] = +_[TEXT_MODE][Tokenizer.DOCTYPE_TOKEN] = +_[TEXT_MODE][Tokenizer.START_TAG_TOKEN] = ignoreToken; +_[TEXT_MODE][Tokenizer.END_TAG_TOKEN] = endTagInText; +_[TEXT_MODE][Tokenizer.EOF_TOKEN] = eofInText; + +_[IN_TABLE_MODE] = {}; +_[IN_TABLE_MODE][Tokenizer.CHARACTER_TOKEN] = +_[IN_TABLE_MODE][Tokenizer.NULL_CHARACTER_TOKEN] = +_[IN_TABLE_MODE][Tokenizer.WHITESPACE_CHARACTER_TOKEN] = characterInTable; +_[IN_TABLE_MODE][Tokenizer.COMMENT_TOKEN] = appendComment; +_[IN_TABLE_MODE][Tokenizer.DOCTYPE_TOKEN] = ignoreToken; +_[IN_TABLE_MODE][Tokenizer.START_TAG_TOKEN] = startTagInTable; +_[IN_TABLE_MODE][Tokenizer.END_TAG_TOKEN] = endTagInTable; +_[IN_TABLE_MODE][Tokenizer.EOF_TOKEN] = stopParsing; + +_[IN_TABLE_TEXT_MODE] = {}; +_[IN_TABLE_TEXT_MODE][Tokenizer.CHARACTER_TOKEN] = characterInTableText; +_[IN_TABLE_TEXT_MODE][Tokenizer.NULL_CHARACTER_TOKEN] = ignoreToken; +_[IN_TABLE_TEXT_MODE][Tokenizer.WHITESPACE_CHARACTER_TOKEN] = whitespaceCharacterInTableText; +_[IN_TABLE_TEXT_MODE][Tokenizer.COMMENT_TOKEN] = +_[IN_TABLE_TEXT_MODE][Tokenizer.DOCTYPE_TOKEN] = +_[IN_TABLE_TEXT_MODE][Tokenizer.START_TAG_TOKEN] = +_[IN_TABLE_TEXT_MODE][Tokenizer.END_TAG_TOKEN] = +_[IN_TABLE_TEXT_MODE][Tokenizer.EOF_TOKEN] = tokenInTableText; + +_[IN_CAPTION_MODE] = {}; +_[IN_CAPTION_MODE][Tokenizer.CHARACTER_TOKEN] = characterInBody; +_[IN_CAPTION_MODE][Tokenizer.NULL_CHARACTER_TOKEN] = ignoreToken; +_[IN_CAPTION_MODE][Tokenizer.WHITESPACE_CHARACTER_TOKEN] = whitespaceCharacterInBody; +_[IN_CAPTION_MODE][Tokenizer.COMMENT_TOKEN] = appendComment; +_[IN_CAPTION_MODE][Tokenizer.DOCTYPE_TOKEN] = ignoreToken; +_[IN_CAPTION_MODE][Tokenizer.START_TAG_TOKEN] = startTagInCaption; +_[IN_CAPTION_MODE][Tokenizer.END_TAG_TOKEN] = endTagInCaption; +_[IN_CAPTION_MODE][Tokenizer.EOF_TOKEN] = stopParsing; + 
+_[IN_COLUMN_GROUP_MODE] = {}; +_[IN_COLUMN_GROUP_MODE][Tokenizer.CHARACTER_TOKEN] = +_[IN_COLUMN_GROUP_MODE][Tokenizer.NULL_CHARACTER_TOKEN] = tokenInColumnGroup; +_[IN_COLUMN_GROUP_MODE][Tokenizer.WHITESPACE_CHARACTER_TOKEN] = insertCharacters; +_[IN_COLUMN_GROUP_MODE][Tokenizer.COMMENT_TOKEN] = appendComment; +_[IN_COLUMN_GROUP_MODE][Tokenizer.DOCTYPE_TOKEN] = ignoreToken; +_[IN_COLUMN_GROUP_MODE][Tokenizer.START_TAG_TOKEN] = startTagInColumnGroup; +_[IN_COLUMN_GROUP_MODE][Tokenizer.END_TAG_TOKEN] = endTagInColumnGroup; +_[IN_COLUMN_GROUP_MODE][Tokenizer.EOF_TOKEN] = stopParsing; + +_[IN_TABLE_BODY_MODE] = {}; +_[IN_TABLE_BODY_MODE][Tokenizer.CHARACTER_TOKEN] = +_[IN_TABLE_BODY_MODE][Tokenizer.NULL_CHARACTER_TOKEN] = +_[IN_TABLE_BODY_MODE][Tokenizer.WHITESPACE_CHARACTER_TOKEN] = characterInTable; +_[IN_TABLE_BODY_MODE][Tokenizer.COMMENT_TOKEN] = appendComment; +_[IN_TABLE_BODY_MODE][Tokenizer.DOCTYPE_TOKEN] = ignoreToken; +_[IN_TABLE_BODY_MODE][Tokenizer.START_TAG_TOKEN] = startTagInTableBody; +_[IN_TABLE_BODY_MODE][Tokenizer.END_TAG_TOKEN] = endTagInTableBody; +_[IN_TABLE_BODY_MODE][Tokenizer.EOF_TOKEN] = stopParsing; + +_[IN_ROW_MODE] = {}; +_[IN_ROW_MODE][Tokenizer.CHARACTER_TOKEN] = +_[IN_ROW_MODE][Tokenizer.NULL_CHARACTER_TOKEN] = +_[IN_ROW_MODE][Tokenizer.WHITESPACE_CHARACTER_TOKEN] = characterInTable; +_[IN_ROW_MODE][Tokenizer.COMMENT_TOKEN] = appendComment; +_[IN_ROW_MODE][Tokenizer.DOCTYPE_TOKEN] = ignoreToken; +_[IN_ROW_MODE][Tokenizer.START_TAG_TOKEN] = startTagInRow; +_[IN_ROW_MODE][Tokenizer.END_TAG_TOKEN] = endTagInRow; +_[IN_ROW_MODE][Tokenizer.EOF_TOKEN] = stopParsing; + +_[IN_CELL_MODE] = {}; +_[IN_CELL_MODE][Tokenizer.CHARACTER_TOKEN] = characterInBody; +_[IN_CELL_MODE][Tokenizer.NULL_CHARACTER_TOKEN] = ignoreToken; +_[IN_CELL_MODE][Tokenizer.WHITESPACE_CHARACTER_TOKEN] = whitespaceCharacterInBody; +_[IN_CELL_MODE][Tokenizer.COMMENT_TOKEN] = appendComment; +_[IN_CELL_MODE][Tokenizer.DOCTYPE_TOKEN] = ignoreToken; 
+_[IN_CELL_MODE][Tokenizer.START_TAG_TOKEN] = startTagInCell; +_[IN_CELL_MODE][Tokenizer.END_TAG_TOKEN] = endTagInCell; +_[IN_CELL_MODE][Tokenizer.EOF_TOKEN] = stopParsing; + +_[IN_SELECT_MODE] = {}; +_[IN_SELECT_MODE][Tokenizer.CHARACTER_TOKEN] = insertCharacters; +_[IN_SELECT_MODE][Tokenizer.NULL_CHARACTER_TOKEN] = ignoreToken; +_[IN_SELECT_MODE][Tokenizer.WHITESPACE_CHARACTER_TOKEN] = insertCharacters; +_[IN_SELECT_MODE][Tokenizer.COMMENT_TOKEN] = appendComment; +_[IN_SELECT_MODE][Tokenizer.DOCTYPE_TOKEN] = ignoreToken; +_[IN_SELECT_MODE][Tokenizer.START_TAG_TOKEN] = startTagInSelect; +_[IN_SELECT_MODE][Tokenizer.END_TAG_TOKEN] = endTagInSelect; +_[IN_SELECT_MODE][Tokenizer.EOF_TOKEN] = stopParsing; + +_[IN_SELECT_IN_TABLE_MODE] = {}; +_[IN_SELECT_IN_TABLE_MODE][Tokenizer.CHARACTER_TOKEN] = insertCharacters; +_[IN_SELECT_IN_TABLE_MODE][Tokenizer.NULL_CHARACTER_TOKEN] = ignoreToken; +_[IN_SELECT_IN_TABLE_MODE][Tokenizer.WHITESPACE_CHARACTER_TOKEN] = insertCharacters; +_[IN_SELECT_IN_TABLE_MODE][Tokenizer.COMMENT_TOKEN] = appendComment; +_[IN_SELECT_IN_TABLE_MODE][Tokenizer.DOCTYPE_TOKEN] = ignoreToken; +_[IN_SELECT_IN_TABLE_MODE][Tokenizer.START_TAG_TOKEN] = startTagInSelectInTable; +_[IN_SELECT_IN_TABLE_MODE][Tokenizer.END_TAG_TOKEN] = endTagInSelectInTable; +_[IN_SELECT_IN_TABLE_MODE][Tokenizer.EOF_TOKEN] = stopParsing; + +_[AFTER_BODY_MODE] = {}; +_[AFTER_BODY_MODE][Tokenizer.CHARACTER_TOKEN] = +_[AFTER_BODY_MODE][Tokenizer.NULL_CHARACTER_TOKEN] = tokenAfterBody; +_[AFTER_BODY_MODE][Tokenizer.WHITESPACE_CHARACTER_TOKEN] = whitespaceCharacterInBody; +_[AFTER_BODY_MODE][Tokenizer.COMMENT_TOKEN] = appendCommentToRootHtmlElement; +_[AFTER_BODY_MODE][Tokenizer.DOCTYPE_TOKEN] = ignoreToken; +_[AFTER_BODY_MODE][Tokenizer.START_TAG_TOKEN] = startTagAfterBody; +_[AFTER_BODY_MODE][Tokenizer.END_TAG_TOKEN] = endTagAfterBody; +_[AFTER_BODY_MODE][Tokenizer.EOF_TOKEN] = stopParsing; + +_[IN_FRAMESET_MODE] = {}; +_[IN_FRAMESET_MODE][Tokenizer.CHARACTER_TOKEN] = 
+_[IN_FRAMESET_MODE][Tokenizer.NULL_CHARACTER_TOKEN] = ignoreToken; +_[IN_FRAMESET_MODE][Tokenizer.WHITESPACE_CHARACTER_TOKEN] = insertCharacters; +_[IN_FRAMESET_MODE][Tokenizer.COMMENT_TOKEN] = appendComment; +_[IN_FRAMESET_MODE][Tokenizer.DOCTYPE_TOKEN] = ignoreToken; +_[IN_FRAMESET_MODE][Tokenizer.START_TAG_TOKEN] = startTagInFrameset; +_[IN_FRAMESET_MODE][Tokenizer.END_TAG_TOKEN] = endTagInFrameset; +_[IN_FRAMESET_MODE][Tokenizer.EOF_TOKEN] = stopParsing; + +_[AFTER_FRAMESET_MODE] = {}; +_[AFTER_FRAMESET_MODE][Tokenizer.CHARACTER_TOKEN] = +_[AFTER_FRAMESET_MODE][Tokenizer.NULL_CHARACTER_TOKEN] = ignoreToken; +_[AFTER_FRAMESET_MODE][Tokenizer.WHITESPACE_CHARACTER_TOKEN] = insertCharacters; +_[AFTER_FRAMESET_MODE][Tokenizer.COMMENT_TOKEN] = appendComment; +_[AFTER_FRAMESET_MODE][Tokenizer.DOCTYPE_TOKEN] = ignoreToken; +_[AFTER_FRAMESET_MODE][Tokenizer.START_TAG_TOKEN] = startTagAfterFrameset; +_[AFTER_FRAMESET_MODE][Tokenizer.END_TAG_TOKEN] = endTagAfterFrameset; +_[AFTER_FRAMESET_MODE][Tokenizer.EOF_TOKEN] = stopParsing; + +_[AFTER_AFTER_BODY_MODE] = {}; +_[AFTER_AFTER_BODY_MODE][Tokenizer.CHARACTER_TOKEN] = tokenAfterAfterBody; +_[AFTER_AFTER_BODY_MODE][Tokenizer.NULL_CHARACTER_TOKEN] = tokenAfterAfterBody; +_[AFTER_AFTER_BODY_MODE][Tokenizer.WHITESPACE_CHARACTER_TOKEN] = whitespaceCharacterInBody; +_[AFTER_AFTER_BODY_MODE][Tokenizer.COMMENT_TOKEN] = appendCommentToDocument; +_[AFTER_AFTER_BODY_MODE][Tokenizer.DOCTYPE_TOKEN] = ignoreToken; +_[AFTER_AFTER_BODY_MODE][Tokenizer.START_TAG_TOKEN] = startTagAfterAfterBody; +_[AFTER_AFTER_BODY_MODE][Tokenizer.END_TAG_TOKEN] = tokenAfterAfterBody; +_[AFTER_AFTER_BODY_MODE][Tokenizer.EOF_TOKEN] = stopParsing; + +_[AFTER_AFTER_FRAMESET_MODE] = {}; +_[AFTER_AFTER_FRAMESET_MODE][Tokenizer.CHARACTER_TOKEN] = +_[AFTER_AFTER_FRAMESET_MODE][Tokenizer.NULL_CHARACTER_TOKEN] = ignoreToken; +_[AFTER_AFTER_FRAMESET_MODE][Tokenizer.WHITESPACE_CHARACTER_TOKEN] = whitespaceCharacterInBody; 
+_[AFTER_AFTER_FRAMESET_MODE][Tokenizer.COMMENT_TOKEN] = appendCommentToDocument; +_[AFTER_AFTER_FRAMESET_MODE][Tokenizer.DOCTYPE_TOKEN] = ignoreToken; +_[AFTER_AFTER_FRAMESET_MODE][Tokenizer.START_TAG_TOKEN] = startTagAfterAfterFrameset; +_[AFTER_AFTER_FRAMESET_MODE][Tokenizer.END_TAG_TOKEN] = ignoreToken; +_[AFTER_AFTER_FRAMESET_MODE][Tokenizer.EOF_TOKEN] = stopParsing; //Token utils function getTokenAttr(token, attrName) { @@ -1002,15 +991,31 @@ Parser.prototype._shouldProcessTokenInForeignContent = function (token) { }; Parser.prototype._processToken = function (token) { - IM_[this.insertionMode][token.type](this, token); + _[this.insertionMode][token.type](this, token); }; Parser.prototype._processTokenInBodyMode = function (token) { - IM_[IN_BODY_MODE][token.type](this, token); + _[IN_BODY_MODE][token.type](this, token); }; Parser.prototype._processTokenInForeignContent = function (token) { - FC_[token.type](this, token); + if (token.type === Tokenizer.CHARACTER_TOKEN) + characterInForeignContent(this, token); + + else if (token.type === Tokenizer.NULL_CHARACTER_TOKEN) + nullCharacterInForeignContent(this, token); + + else if (token.type === Tokenizer.WHITESPACE_CHARACTER_TOKEN) + insertCharacters(this, token); + + else if (token.type === Tokenizer.COMMENT_TOKEN) + appendComment(this, token); + + else if (token.type === Tokenizer.START_TAG_TOKEN) + startTagInForeignContent(this, token); + + else if (token.type === Tokenizer.END_TAG_TOKEN) + endTagInForeignContent(this, token); }; Parser.prototype._processFakeStartTagWithAttrs = function (tagName, attrs) { diff --git a/package.json b/package.json index e05558409..006a860ac 100644 --- a/package.json +++ b/package.json @@ -1,7 +1,7 @@ { "name": "parse5", "description": "Fast full-featured HTML parser for Node. Based on WHATWG HTML5 specification.", - "version": "0.5.4", + "version": "0.6.0", "author": "Ivan Nikulin (ifaaan@gmail.com, https://github.com/inikulin)", "keywords": [ "html",