-
Notifications
You must be signed in to change notification settings - Fork 125
/
Copy patha_lxml_create_html.py
executable file
·49 lines (38 loc) · 1.23 KB
/
a_lxml_create_html.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
# -*- coding: utf-8 -*-
from lxml import etree, html
class SafeXMLParser(etree.XMLParser):
    """XML parser whose ``resolve_entities`` option defaults to ``False``.

    All positional and keyword arguments are forwarded to
    ``etree.XMLParser``; a caller-supplied ``resolve_entities`` value is
    left untouched.
    """

    def __init__(self, *args, **kwargs):
        # Only fill in the default when the caller did not choose a value.
        # NOTE(review): presumably meant to avoid expanding entities from
        # untrusted documents -- confirm against the intended threat model.
        if 'resolve_entities' not in kwargs:
            kwargs['resolve_entities'] = False
        super(SafeXMLParser, self).__init__(*args, **kwargs)
# Per-document-type settings keyed by type name: the parser class used to
# build a tree and the name of the matching serialization method.
_ctgroup = dict(
    html=dict(_parser=html.HTMLParser, _tostring_method='html'),
    xml=dict(_parser=SafeXMLParser, _tostring_method='xml'),
)

# Type name that create_root_node falls back to when its doc_type is falsy.
_default_type = None
def _st(st):
    """Validate a document-type name and return it.

    ``None`` is treated as ``'html'``; any other value must be a key of
    ``_ctgroup`` and is returned unchanged.

    Raises:
        ValueError: if ``st`` is neither ``None`` nor a key of ``_ctgroup``.
    """
    if st is None:
        return 'html'
    if st in _ctgroup:
        return st
    # Wrap in a 1-tuple so a tuple-valued ``st`` is formatted as one value
    # rather than being unpacked as the %-format arguments (which would
    # raise TypeError instead of the intended ValueError).
    raise ValueError('Invalid type: %s' % (st,))
def create_root_node(text, base_url=None, doc_type='html'):
    """Parse ``text`` and return the root element of the resulting tree.

    The parser class is looked up in ``_ctgroup`` from ``doc_type`` (or
    ``_default_type`` when ``doc_type`` is falsy) and is created with
    ``recover=True`` and ``encoding='utf8'``. ``text`` is stripped, NUL
    characters are removed, and the result is UTF-8 encoded before parsing.
    If nothing is left, or the parser yields no root, ``<html/>`` is parsed
    instead. ``base_url`` is passed through to ``etree.fromstring``.

    Raises:
        ValueError: if the document type is not a key of ``_ctgroup``.
    """
    kind = _st(doc_type or _default_type)
    make_parser = _ctgroup[kind]['_parser']
    placeholder = b'<html/>'

    cleaned = text.strip().replace('\x00', '')
    payload = cleaned.encode('utf8')
    if not payload:
        # Nothing left to parse after cleaning; use the placeholder document.
        payload = placeholder

    parser = make_parser(recover=True, encoding='utf8')
    tree = etree.fromstring(payload, parser=parser, base_url=base_url)
    if tree is not None:
        return tree
    # The parser produced no root; fall back to the placeholder document.
    return etree.fromstring(placeholder, parser=parser, base_url=base_url)
def ele_to_string(ele):
    """Serialize ``ele`` with ``etree.tostring`` and return it as text."""
    # encoding='unicode' makes tostring return a text string, not bytes.
    return etree.tostring(ele, encoding='unicode')