Skip to content
This repository was archived by the owner on Feb 15, 2023. It is now read-only.

Commit 56f2cf9

Browse files
committed
parser: Implement fragment parsing
The HTML5 fragment parsing algorithm has been implemented and is exposed through a new API, `gumbo_parse_fragment`. The old APIs are maintained for backwards compatibility. Passing `GUMBO_TAG_LAST` as the fragment (inner_html) context to `gumbo_parse_fragment` causes it to parse the buffer as a full document, which is the same functionality as `gumbo_parse_with_options`. The HTML5lib adapter code has been modified to support the fragment parsing tests, all of which now pass.
1 parent b8e2f45 commit 56f2cf9

File tree

5 files changed

+172
-45
lines changed

5 files changed

+172
-45
lines changed

python/gumbo/gumboc.py

Lines changed: 16 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -246,6 +246,11 @@ def to_url(self):
246246

247247

248248
class Tag(Enum):
249+
@staticmethod
250+
def from_str(tagname):
251+
text_ptr = ctypes.c_char_p(tagname.encode('utf-8'))
252+
return _tag_enum(text_ptr)
253+
249254
_values_ = [
250255
'HTML',
251256
'HEAD',
@@ -398,6 +403,7 @@ class Tag(Enum):
398403
'SPACER',
399404
'TT',
400405
'UNKNOWN',
406+
'LAST'
401407
]
402408

403409

@@ -498,11 +504,6 @@ def __repr__(self):
498504

499505
class Options(ctypes.Structure):
500506
_fields_ = [
501-
# TODO(jdtang): Allow the Python API to set the allocator/deallocator
502-
# function. Right now these are treated as opaque void pointers.
503-
('allocator', ctypes.c_void_p),
504-
('deallocator', ctypes.c_void_p),
505-
('userdata', ctypes.c_void_p),
506507
('tab_stop', ctypes.c_int),
507508
('stop_on_first_error', ctypes.c_bool),
508509
('max_errors', ctypes.c_int),
@@ -517,9 +518,8 @@ class Output(ctypes.Structure):
517518
('errors', Vector),
518519
]
519520

520-
521521
@contextlib.contextmanager
522-
def parse(text, **kwargs):
522+
def parse(text, container, **kwargs):
523523
options = Options()
524524
for field_name, _ in Options._fields_:
525525
try:
@@ -531,7 +531,7 @@ def parse(text, **kwargs):
531531
# call, it creates a temporary buffer which is destroyed when the call
532532
# completes, and then the original_text pointers point into invalid memory.
533533
text_ptr = ctypes.c_char_p(text.encode('utf-8'))
534-
output = _parse_with_options(ctypes.byref(options), text_ptr, len(text))
534+
output = _parse_fragment(ctypes.byref(options), text_ptr, len(text), container)
535535
try:
536536
yield output
537537
finally:
@@ -543,6 +543,10 @@ def parse(text, **kwargs):
543543
_parse_with_options.argtypes = [_Ptr(Options), ctypes.c_char_p, ctypes.c_size_t]
544544
_parse_with_options.restype = _Ptr(Output)
545545

546+
_parse_fragment = _dll.gumbo_parse_fragment
547+
_parse_fragment.argtypes = [_Ptr(Options), ctypes.c_char_p, ctypes.c_size_t, Tag]
548+
_parse_fragment.restype = _Ptr(Output)
549+
546550
_tag_from_original_text = _dll.gumbo_tag_from_original_text
547551
_tag_from_original_text.argtypes = [_Ptr(StringPiece)]
548552
_tag_from_original_text.restype = None
@@ -559,6 +563,10 @@ def parse(text, **kwargs):
559563
_tagname.argtypes = [Tag]
560564
_tagname.restype = ctypes.c_char_p
561565

566+
_tag_enum = _dll.gumbo_tag_enum
567+
_tag_enum.argtypes = [ctypes.c_char_p]
568+
_tag_enum.restype = Tag
569+
562570
__all__ = ['StringPiece', 'SourcePosition', 'AttributeNamespace', 'Attribute',
563571
'Vector', 'AttributeVector', 'NodeVector', 'QuirksMode', 'Document',
564572
'Namespace', 'Tag', 'Element', 'Text', 'NodeType', 'Node',

python/gumbo/html5lib_adapter.py

Lines changed: 19 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -70,12 +70,12 @@ def _convert_element(source_node):
7070
}
7171

7272

73-
def _insert_root(treebuilder, source_node):
73+
def _insert_root(treebuilder, source_node, is_fragment = False):
7474
treebuilder.insertRoot(_convert_element(source_node))
7575
for child_node in source_node.children:
7676
_insert_node(treebuilder, child_node)
77-
treebuilder.openElements.pop()
78-
77+
if not is_fragment:
78+
treebuilder.openElements.pop()
7979

8080
def _insert_node(treebuilder, source_node):
8181
assert source_node.type != gumboc.NodeType.DOCUMENT
@@ -104,7 +104,7 @@ def parse(self, text_or_file, **kwargs):
104104
# Assume a string.
105105
text = text_or_file
106106

107-
with gumboc.parse(text, **kwargs) as output:
107+
with gumboc.parse(text, gumboc.Tag.LAST, **kwargs) as output:
108108
_convert_doctype(self.tree, output.contents.document.contents)
109109
for node in output.contents.document.contents.children:
110110
if node.type == gumboc.NodeType.COMMENT:
@@ -115,3 +115,18 @@ def parse(self, text_or_file, **kwargs):
115115
else:
116116
assert 'Only comments and <html> nodes allowed at the root'
117117
return self.tree.getDocument()
118+
119+
def parseFragment(self, text_or_file, container, **kwargs):
120+
try:
121+
text = text_or_file.read()
122+
except AttributeError:
123+
# Assume a string.
124+
text = text_or_file
125+
126+
with gumboc.parse(text, gumboc.Tag.from_str(container), **kwargs) as output:
127+
for node in output.contents.document.contents.children:
128+
if node.type in (gumboc.NodeType.ELEMENT, gumboc.NodeType.TEMPLATE):
129+
_insert_root(self.tree, output.contents.root.contents, True)
130+
else:
131+
assert 'Malformed fragment parse (??)'
132+
return self.tree.getFragment()

python/gumbo/html5lib_adapter_test.py

Lines changed: 3 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -123,11 +123,10 @@ def impl(self, inner_html, input, expected, errors):
123123
p = html5lib_adapter.HTMLParser(
124124
tree=TREEBUILDER(namespaceHTMLElements=True))
125125

126-
if not inner_html:
127-
# TODO(jdtang): Need to implement fragment parsing.
128-
document = p.parse(StringIO.StringIO(input))
126+
if inner_html:
127+
document = p.parseFragment(StringIO.StringIO(input), inner_html)
129128
else:
130-
return
129+
document = p.parse(StringIO.StringIO(input))
131130

132131
with warnings.catch_warnings():
133132
# Etree serializer in html5lib uses a deprecated getchildren() API.
@@ -137,11 +136,6 @@ def impl(self, inner_html, input, expected, errors):
137136
expected = re.compile(r'^(\s*)<(\S+)>', re.M).sub(
138137
r'\1<html \2>', convertExpected(expected, 2))
139138

140-
# html5lib doesn't yet support the template tag, but it appears in the
141-
# tests with the expectation that the template contents will be under the
142-
# word 'contents', so we need to reformat that string a bit.
143-
expected = reformatTemplateContents(expected)
144-
145139
error_msg = '\n'.join(['\n\nInput:', input, '\nExpected:', expected,
146140
'\nReceived:', output])
147141
self.assertEquals(expected, output,

src/gumbo.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -791,6 +791,14 @@ GumboOutput* gumbo_parse(const char* buffer);
791791
GumboOutput* gumbo_parse_with_options(
792792
const GumboOptions* options, const char* buffer, size_t buffer_length);
793793

794+
/**
795+
* Parse a chunk of HTML with the given fragment context. If `fragment_ctx`
796+
* is `GUMBO_TAG_LAST`, the fragment will be parsed as a full document.
797+
*/
798+
GumboOutput* gumbo_parse_fragment(
799+
const GumboOptions* options, const char* buffer, size_t length,
800+
const GumboTag fragment_ctx);
801+
794802
/** Release the memory used for the parse tree & parse errors. */
795803
void gumbo_destroy_output(GumboOutput* output);
796804

0 commit comments

Comments
 (0)