-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathClientForm.py
3401 lines (2795 loc) · 123 KB
/
ClientForm.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
"""HTML form handling for web clients.
ClientForm is a Python module for handling HTML forms on the client
side, useful for parsing HTML forms, filling them in and returning the
completed forms to the server. It has developed from a port of Gisle
Aas' Perl module HTML::Form, from the libwww-perl library, but the
interface is not the same.
The most useful docstring is the one for HTMLForm.
RFC 1866: HTML 2.0
RFC 1867: Form-based File Upload in HTML
RFC 2388: Returning Values from Forms: multipart/form-data
HTML 3.2 Specification, W3C Recommendation 14 January 1997 (for ISINDEX)
HTML 4.01 Specification, W3C Recommendation 24 December 1999
Copyright 2002-2007 John J. Lee <jjl@pobox.com>
Copyright 2005 Gary Poster
Copyright 2005 Zope Corporation
Copyright 1998-2000 Gisle Aas.
This code is free software; you can redistribute it and/or modify it
under the terms of the BSD or ZPL 2.1 licenses (see the file
COPYING.txt included with the distribution).
"""
# XXX
# Remove parser testing hack
# safeUrl()-ize action
# Switch to unicode throughout (would be 0.3.x)
# See Wichert Akkerman's 2004-01-22 message to c.l.py.
# Add charset parameter to Content-type headers? How to find value??
# Add some more functional tests
# Especially single and multiple file upload on the internet.
# Does file upload work when name is missing? Sourceforge tracker form
# doesn't like it. Check standards, and test with Apache. Test
# binary upload with Apache.
# mailto submission & enctype text/plain
# I'm not going to fix this unless somebody tells me what real servers
# that want this encoding actually expect: If enctype is
# application/x-www-form-urlencoded and there's a FILE control present.
# Strictly, it should be 'name=data' (see HTML 4.01 spec., section
# 17.13.2), but I send "name=" ATM. What about multiple file upload??
# Would be nice, but I'm not going to do it myself:
# -------------------------------------------------
# Maybe a 0.4.x?
# Replace by_label etc. with moniker / selector concept. Allows, eg.,
# a choice between selection by value / id / label / element
# contents. Or choice between matching labels exactly or by
# substring. Etc.
# Remove deprecated methods.
# ...what else?
# Work on DOMForm.
# XForms? Don't know if there's a need here.
__all__ = ['AmbiguityError', 'CheckboxControl', 'Control',
'ControlNotFoundError', 'FileControl', 'FormParser', 'HTMLForm',
'HiddenControl', 'IgnoreControl', 'ImageControl', 'IsindexControl',
'Item', 'ItemCountError', 'ItemNotFoundError', 'Label',
'ListControl', 'LocateError', 'Missing', 'ParseError', 'ParseFile',
'ParseFileEx', 'ParseResponse', 'ParseResponseEx','PasswordControl',
'RadioControl', 'ScalarControl', 'SelectControl',
'SubmitButtonControl', 'SubmitControl', 'TextControl',
'TextareaControl', 'XHTMLCompatibleFormParser']
try: True
except NameError:
True = 1
False = 0
try: bool
except NameError:
def bool(expr):
if expr: return True
else: return False
try:
import logging
import inspect
except ImportError:
def debug(msg, *args, **kwds):
pass
else:
_logger = logging.getLogger("ClientForm")
OPTIMIZATION_HACK = True
def debug(msg, *args, **kwds):
if OPTIMIZATION_HACK:
return
caller_name = inspect.stack()[1][3]
extended_msg = '%%s %s' % msg
extended_args = (caller_name,)+args
debug = _logger.debug(extended_msg, *extended_args, **kwds)
def _show_debug_messages():
global OPTIMIZATION_HACK
OPTIMIZATION_HACK = False
_logger.setLevel(logging.DEBUG)
handler = logging.StreamHandler(sys.stdout)
handler.setLevel(logging.DEBUG)
_logger.addHandler(handler)
import sys, urllib, urllib2, types, mimetools, copy, urlparse, \
htmlentitydefs, re, random
from cStringIO import StringIO
import sgmllib
# monkeypatch to fix http://www.python.org/sf/803422 :-(
sgmllib.charref = re.compile("&#(x?[0-9a-fA-F]+)[^0-9a-fA-F]")
# HTMLParser.HTMLParser is recent, so live without it if it's not available
# (also, sgmllib.SGMLParser is much more tolerant of bad HTML)
try:
import HTMLParser
except ImportError:
HAVE_MODULE_HTMLPARSER = False
else:
HAVE_MODULE_HTMLPARSER = True
try:
import warnings
except ImportError:
def deprecation(message, stack_offset=0):
pass
else:
def deprecation(message, stack_offset=0):
warnings.warn(message, DeprecationWarning, stacklevel=3+stack_offset)
VERSION = "0.2.10"
CHUNK = 1024 # size of chunks fed to parser, in bytes
DEFAULT_ENCODING = "latin-1"
class Missing: pass
_compress_re = re.compile(r"\s+")
def compress_text(text): return _compress_re.sub(" ", text.strip())
def normalize_line_endings(text):
return re.sub(r"(?:(?<!\r)\n)|(?:\r(?!\n))", "\r\n", text)
# This version of urlencode is from my Python 1.5.2 back-port of the
# Python 2.1 CVS maintenance branch of urllib. It will accept a sequence
# of pairs instead of a mapping -- the 2.0 version only accepts a mapping.
def urlencode(query,doseq=False,):
"""Encode a sequence of two-element tuples or dictionary into a URL query \
string.
If any values in the query arg are sequences and doseq is true, each
sequence element is converted to a separate parameter.
If the query arg is a sequence of two-element tuples, the order of the
parameters in the output will match the order of parameters in the
input.
"""
if hasattr(query,"items"):
# mapping objects
query = query.items()
else:
# it's a bother at times that strings and string-like objects are
# sequences...
try:
# non-sequence items should not work with len()
x = len(query)
# non-empty strings will fail this
if len(query) and type(query[0]) != types.TupleType:
raise TypeError()
# zero-length sequences of all types will get here and succeed,
# but that's a minor nit - since the original implementation
# allowed empty dicts that type of behavior probably should be
# preserved for consistency
except TypeError:
ty,va,tb = sys.exc_info()
raise TypeError("not a valid non-string sequence or mapping "
"object", tb)
l = []
if not doseq:
# preserve old behavior
for k, v in query:
k = urllib.quote_plus(str(k))
v = urllib.quote_plus(str(v))
l.append(k + '=' + v)
else:
for k, v in query:
k = urllib.quote_plus(str(k))
if type(v) == types.StringType:
v = urllib.quote_plus(v)
l.append(k + '=' + v)
elif type(v) == types.UnicodeType:
# is there a reasonable way to convert to ASCII?
# encode generates a string, but "replace" or "ignore"
# lose information and "strict" can raise UnicodeError
v = urllib.quote_plus(v.encode("ASCII","replace"))
l.append(k + '=' + v)
else:
try:
# is this a sufficient test for sequence-ness?
x = len(v)
except TypeError:
# not a sequence
v = urllib.quote_plus(str(v))
l.append(k + '=' + v)
else:
# loop over the sequence
for elt in v:
l.append(k + '=' + urllib.quote_plus(str(elt)))
return '&'.join(l)
def unescape(data, entities, encoding=DEFAULT_ENCODING):
if data is None or "&" not in data:
return data
def replace_entities(match, entities=entities, encoding=encoding):
ent = match.group()
if ent[1] == "#":
return unescape_charref(ent[2:-1], encoding)
repl = entities.get(ent)
if repl is not None:
if type(repl) != type(""):
try:
repl = repl.encode(encoding)
except UnicodeError:
repl = ent
else:
repl = ent
return repl
return re.sub(r"&#?[A-Za-z0-9]+?;", replace_entities, data)
def unescape_charref(data, encoding):
name, base = data, 10
if name.startswith("x"):
name, base= name[1:], 16
uc = unichr(int(name, base))
if encoding is None:
return uc
else:
try:
repl = uc.encode(encoding)
except UnicodeError:
repl = "&#%s;" % data
return repl
def get_entitydefs():
import htmlentitydefs
from codecs import latin_1_decode
entitydefs = {}
try:
htmlentitydefs.name2codepoint
except AttributeError:
entitydefs = {}
for name, char in htmlentitydefs.entitydefs.items():
uc = latin_1_decode(char)[0]
if uc.startswith("&#") and uc.endswith(";"):
uc = unescape_charref(uc[2:-1], None)
entitydefs["&%s;" % name] = uc
else:
for name, codepoint in htmlentitydefs.name2codepoint.items():
entitydefs["&%s;" % name] = unichr(codepoint)
return entitydefs
def issequence(x):
try:
x[0]
except (TypeError, KeyError):
return False
except IndexError:
pass
return True
def isstringlike(x):
try: x+""
except: return False
else: return True
def choose_boundary():
"""Return a string usable as a multipart boundary."""
# follow IE and firefox
nonce = "".join([str(random.randint(0, sys.maxint-1)) for i in 0,1,2])
return "-"*27 + nonce
# This cut-n-pasted MimeWriter from standard library is here so can add
# to HTTP headers rather than message body when appropriate. It also uses
# \r\n in place of \n. This is a bit nasty.
class MimeWriter:
"""Generic MIME writer.
Methods:
__init__()
addheader()
flushheaders()
startbody()
startmultipartbody()
nextpart()
lastpart()
A MIME writer is much more primitive than a MIME parser. It
doesn't seek around on the output file, and it doesn't use large
amounts of buffer space, so you have to write the parts in the
order they should occur on the output file. It does buffer the
headers you add, allowing you to rearrange their order.
General usage is:
f = <open the output file>
w = MimeWriter(f)
...call w.addheader(key, value) 0 or more times...
followed by either:
f = w.startbody(content_type)
...call f.write(data) for body data...
or:
w.startmultipartbody(subtype)
for each part:
subwriter = w.nextpart()
...use the subwriter's methods to create the subpart...
w.lastpart()
The subwriter is another MimeWriter instance, and should be
treated in the same way as the toplevel MimeWriter. This way,
writing recursive body parts is easy.
Warning: don't forget to call lastpart()!
XXX There should be more state so calls made in the wrong order
are detected.
Some special cases:
- startbody() just returns the file passed to the constructor;
but don't use this knowledge, as it may be changed.
- startmultipartbody() actually returns a file as well;
this can be used to write the initial 'if you can read this your
mailer is not MIME-aware' message.
- If you call flushheaders(), the headers accumulated so far are
written out (and forgotten); this is useful if you don't need a
body part at all, e.g. for a subpart of type message/rfc822
that's (mis)used to store some header-like information.
- Passing a keyword argument 'prefix=<flag>' to addheader(),
start*body() affects where the header is inserted; 0 means
append at the end, 1 means insert at the start; default is
append for addheader(), but insert for start*body(), which use
it to determine where the Content-type header goes.
"""
def __init__(self, fp, http_hdrs=None):
self._http_hdrs = http_hdrs
self._fp = fp
self._headers = []
self._boundary = []
self._first_part = True
def addheader(self, key, value, prefix=0,
add_to_http_hdrs=0):
"""
prefix is ignored if add_to_http_hdrs is true.
"""
lines = value.split("\r\n")
while lines and not lines[-1]: del lines[-1]
while lines and not lines[0]: del lines[0]
if add_to_http_hdrs:
value = "".join(lines)
# 2.2 urllib2 doesn't normalize header case
self._http_hdrs.append((key.capitalize(), value))
else:
for i in range(1, len(lines)):
lines[i] = " " + lines[i].strip()
value = "\r\n".join(lines) + "\r\n"
line = key.title() + ": " + value
if prefix:
self._headers.insert(0, line)
else:
self._headers.append(line)
def flushheaders(self):
self._fp.writelines(self._headers)
self._headers = []
def startbody(self, ctype=None, plist=[], prefix=1,
add_to_http_hdrs=0, content_type=1):
"""
prefix is ignored if add_to_http_hdrs is true.
"""
if content_type and ctype:
for name, value in plist:
ctype = ctype + ';\r\n %s=%s' % (name, value)
self.addheader("Content-Type", ctype, prefix=prefix,
add_to_http_hdrs=add_to_http_hdrs)
self.flushheaders()
if not add_to_http_hdrs: self._fp.write("\r\n")
self._first_part = True
return self._fp
def startmultipartbody(self, subtype, boundary=None, plist=[], prefix=1,
add_to_http_hdrs=0, content_type=1):
boundary = boundary or choose_boundary()
self._boundary.append(boundary)
return self.startbody("multipart/" + subtype,
[("boundary", boundary)] + plist,
prefix=prefix,
add_to_http_hdrs=add_to_http_hdrs,
content_type=content_type)
def nextpart(self):
boundary = self._boundary[-1]
if self._first_part:
self._first_part = False
else:
self._fp.write("\r\n")
self._fp.write("--" + boundary + "\r\n")
return self.__class__(self._fp)
def lastpart(self):
if self._first_part:
self.nextpart()
boundary = self._boundary.pop()
self._fp.write("\r\n--" + boundary + "--\r\n")
class LocateError(ValueError): pass
class AmbiguityError(LocateError): pass
class ControlNotFoundError(LocateError): pass
class ItemNotFoundError(LocateError): pass
class ItemCountError(ValueError): pass
# for backwards compatibility, ParseError derives from exceptions that were
# raised by versions of ClientForm <= 0.2.5
if HAVE_MODULE_HTMLPARSER:
SGMLLIB_PARSEERROR = sgmllib.SGMLParseError
class ParseError(sgmllib.SGMLParseError,
HTMLParser.HTMLParseError,
):
pass
else:
if hasattr(sgmllib, "SGMLParseError"):
SGMLLIB_PARSEERROR = sgmllib.SGMLParseError
class ParseError(sgmllib.SGMLParseError):
pass
else:
SGMLLIB_PARSEERROR = RuntimeError
class ParseError(RuntimeError):
pass
class _AbstractFormParser:
"""forms attribute contains HTMLForm instances on completion."""
# thanks to Moshe Zadka for an example of sgmllib/htmllib usage
def __init__(self, entitydefs=None, encoding=DEFAULT_ENCODING):
if entitydefs is None:
entitydefs = get_entitydefs()
self._entitydefs = entitydefs
self._encoding = encoding
self.base = None
self.forms = []
self.labels = []
self._current_label = None
self._current_form = None
self._select = None
self._optgroup = None
self._option = None
self._textarea = None
# forms[0] will contain all controls that are outside of any form
# self._global_form is an alias for self.forms[0]
self._global_form = None
self.start_form([])
self.end_form()
self._current_form = self._global_form = self.forms[0]
def do_base(self, attrs):
debug("%s", attrs)
for key, value in attrs:
if key == "href":
self.base = self.unescape_attr_if_required(value)
def end_body(self):
debug("")
if self._current_label is not None:
self.end_label()
if self._current_form is not self._global_form:
self.end_form()
def start_form(self, attrs):
debug("%s", attrs)
if self._current_form is not self._global_form:
raise ParseError("nested FORMs")
name = None
action = None
enctype = "application/x-www-form-urlencoded"
method = "GET"
d = {}
for key, value in attrs:
if key == "name":
name = self.unescape_attr_if_required(value)
elif key == "action":
action = self.unescape_attr_if_required(value)
elif key == "method":
method = self.unescape_attr_if_required(value.upper())
elif key == "enctype":
enctype = self.unescape_attr_if_required(value.lower())
d[key] = self.unescape_attr_if_required(value)
controls = []
self._current_form = (name, action, method, enctype), d, controls
def end_form(self):
debug("")
if self._current_label is not None:
self.end_label()
if self._current_form is self._global_form:
raise ParseError("end of FORM before start")
self.forms.append(self._current_form)
self._current_form = self._global_form
def start_select(self, attrs):
debug("%s", attrs)
if self._select is not None:
raise ParseError("nested SELECTs")
if self._textarea is not None:
raise ParseError("SELECT inside TEXTAREA")
d = {}
for key, val in attrs:
d[key] = self.unescape_attr_if_required(val)
self._select = d
self._add_label(d)
self._append_select_control({"__select": d})
def end_select(self):
debug("")
if self._select is None:
raise ParseError("end of SELECT before start")
if self._option is not None:
self._end_option()
self._select = None
def start_optgroup(self, attrs):
debug("%s", attrs)
if self._select is None:
raise ParseError("OPTGROUP outside of SELECT")
d = {}
for key, val in attrs:
d[key] = self.unescape_attr_if_required(val)
self._optgroup = d
def end_optgroup(self):
debug("")
if self._optgroup is None:
raise ParseError("end of OPTGROUP before start")
self._optgroup = None
def _start_option(self, attrs):
debug("%s", attrs)
if self._select is None:
raise ParseError("OPTION outside of SELECT")
if self._option is not None:
self._end_option()
d = {}
for key, val in attrs:
d[key] = self.unescape_attr_if_required(val)
self._option = {}
self._option.update(d)
if (self._optgroup and self._optgroup.has_key("disabled") and
not self._option.has_key("disabled")):
self._option["disabled"] = None
def _end_option(self):
debug("")
if self._option is None:
raise ParseError("end of OPTION before start")
contents = self._option.get("contents", "").strip()
self._option["contents"] = contents
if not self._option.has_key("value"):
self._option["value"] = contents
if not self._option.has_key("label"):
self._option["label"] = contents
# stuff dict of SELECT HTML attrs into a special private key
# (gets deleted again later)
self._option["__select"] = self._select
self._append_select_control(self._option)
self._option = None
def _append_select_control(self, attrs):
debug("%s", attrs)
controls = self._current_form[2]
name = self._select.get("name")
controls.append(("select", name, attrs))
def start_textarea(self, attrs):
debug("%s", attrs)
if self._textarea is not None:
raise ParseError("nested TEXTAREAs")
if self._select is not None:
raise ParseError("TEXTAREA inside SELECT")
d = {}
for key, val in attrs:
d[key] = self.unescape_attr_if_required(val)
self._add_label(d)
self._textarea = d
def end_textarea(self):
debug("")
if self._textarea is None:
raise ParseError("end of TEXTAREA before start")
controls = self._current_form[2]
name = self._textarea.get("name")
controls.append(("textarea", name, self._textarea))
self._textarea = None
def start_label(self, attrs):
debug("%s", attrs)
if self._current_label:
self.end_label()
d = {}
for key, val in attrs:
d[key] = self.unescape_attr_if_required(val)
taken = bool(d.get("for")) # empty id is invalid
d["__text"] = ""
d["__taken"] = taken
if taken:
self.labels.append(d)
self._current_label = d
def end_label(self):
debug("")
label = self._current_label
if label is None:
# something is ugly in the HTML, but we're ignoring it
return
self._current_label = None
# if it is staying around, it is True in all cases
del label["__taken"]
def _add_label(self, d):
#debug("%s", d)
if self._current_label is not None:
if not self._current_label["__taken"]:
self._current_label["__taken"] = True
d["__label"] = self._current_label
def handle_data(self, data):
debug("%s", data)
if self._option is not None:
# self._option is a dictionary of the OPTION element's HTML
# attributes, but it has two special keys, one of which is the
# special "contents" key contains text between OPTION tags (the
# other is the "__select" key: see the end_option method)
map = self._option
key = "contents"
elif self._textarea is not None:
map = self._textarea
key = "value"
data = normalize_line_endings(data)
# not if within option or textarea
elif self._current_label is not None:
map = self._current_label
key = "__text"
else:
return
if data and not map.has_key(key):
# according to
# http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.1 line break
# immediately after start tags or immediately before end tags must
# be ignored, but real browsers only ignore a line break after a
# start tag, so we'll do that.
if data[0:2] == "\r\n":
data = data[2:]
elif data[0:1] in ["\n", "\r"]:
data = data[1:]
map[key] = data
else:
map[key] = map[key] + data
def do_button(self, attrs):
debug("%s", attrs)
d = {}
d["type"] = "submit" # default
for key, val in attrs:
d[key] = self.unescape_attr_if_required(val)
controls = self._current_form[2]
type = d["type"]
name = d.get("name")
# we don't want to lose information, so use a type string that
# doesn't clash with INPUT TYPE={SUBMIT,RESET,BUTTON}
# e.g. type for BUTTON/RESET is "resetbutton"
# (type for INPUT/RESET is "reset")
type = type+"button"
self._add_label(d)
controls.append((type, name, d))
def do_input(self, attrs):
debug("%s", attrs)
d = {}
d["type"] = "text" # default
for key, val in attrs:
d[key] = self.unescape_attr_if_required(val)
controls = self._current_form[2]
type = d["type"]
name = d.get("name")
self._add_label(d)
controls.append((type, name, d))
def do_isindex(self, attrs):
debug("%s", attrs)
d = {}
for key, val in attrs:
d[key] = self.unescape_attr_if_required(val)
controls = self._current_form[2]
self._add_label(d)
# isindex doesn't have type or name HTML attributes
controls.append(("isindex", None, d))
def handle_entityref(self, name):
#debug("%s", name)
self.handle_data(unescape(
'&%s;' % name, self._entitydefs, self._encoding))
def handle_charref(self, name):
#debug("%s", name)
self.handle_data(unescape_charref(name, self._encoding))
def unescape_attr(self, name):
#debug("%s", name)
return unescape(name, self._entitydefs, self._encoding)
def unescape_attrs(self, attrs):
#debug("%s", attrs)
escaped_attrs = {}
for key, val in attrs.items():
try:
val.items
except AttributeError:
escaped_attrs[key] = self.unescape_attr(val)
else:
# e.g. "__select" -- yuck!
escaped_attrs[key] = self.unescape_attrs(val)
return escaped_attrs
def unknown_entityref(self, ref): self.handle_data("&%s;" % ref)
def unknown_charref(self, ref): self.handle_data("&#%s;" % ref)
if not HAVE_MODULE_HTMLPARSER:
class XHTMLCompatibleFormParser:
def __init__(self, entitydefs=None, encoding=DEFAULT_ENCODING):
raise ValueError("HTMLParser could not be imported")
else:
class XHTMLCompatibleFormParser(_AbstractFormParser, HTMLParser.HTMLParser):
"""Good for XHTML, bad for tolerance of incorrect HTML."""
# thanks to Michael Howitz for this!
def __init__(self, entitydefs=None, encoding=DEFAULT_ENCODING):
HTMLParser.HTMLParser.__init__(self)
_AbstractFormParser.__init__(self, entitydefs, encoding)
def feed(self, data):
try:
HTMLParser.HTMLParser.feed(self, data)
except HTMLParser.HTMLParseError, exc:
raise ParseError(exc)
def start_option(self, attrs):
_AbstractFormParser._start_option(self, attrs)
def end_option(self):
_AbstractFormParser._end_option(self)
def handle_starttag(self, tag, attrs):
try:
method = getattr(self, "start_" + tag)
except AttributeError:
try:
method = getattr(self, "do_" + tag)
except AttributeError:
pass # unknown tag
else:
method(attrs)
else:
method(attrs)
def handle_endtag(self, tag):
try:
method = getattr(self, "end_" + tag)
except AttributeError:
pass # unknown tag
else:
method()
def unescape(self, name):
# Use the entitydefs passed into constructor, not
# HTMLParser.HTMLParser's entitydefs.
return self.unescape_attr(name)
def unescape_attr_if_required(self, name):
return name # HTMLParser.HTMLParser already did it
def unescape_attrs_if_required(self, attrs):
return attrs # ditto
def close(self):
HTMLParser.HTMLParser.close(self)
self.end_body()
class _AbstractSgmllibParser(_AbstractFormParser):
def do_option(self, attrs):
_AbstractFormParser._start_option(self, attrs)
if sys.version_info[:2] >= (2,5):
# we override this attr to decode hex charrefs
entity_or_charref = re.compile(
'&(?:([a-zA-Z][-.a-zA-Z0-9]*)|#(x?[0-9a-fA-F]+))(;?)')
def convert_entityref(self, name):
return unescape("&%s;" % name, self._entitydefs, self._encoding)
def convert_charref(self, name):
return unescape_charref("%s" % name, self._encoding)
def unescape_attr_if_required(self, name):
return name # sgmllib already did it
def unescape_attrs_if_required(self, attrs):
return attrs # ditto
else:
def unescape_attr_if_required(self, name):
return self.unescape_attr(name)
def unescape_attrs_if_required(self, attrs):
return self.unescape_attrs(attrs)
class FormParser(_AbstractSgmllibParser, sgmllib.SGMLParser):
"""Good for tolerance of incorrect HTML, bad for XHTML."""
def __init__(self, entitydefs=None, encoding=DEFAULT_ENCODING):
sgmllib.SGMLParser.__init__(self)
_AbstractFormParser.__init__(self, entitydefs, encoding)
def feed(self, data):
try:
sgmllib.SGMLParser.feed(self, data)
except SGMLLIB_PARSEERROR, exc:
raise ParseError(exc)
def close(self):
sgmllib.SGMLParser.close(self)
self.end_body()
# sigh, must support mechanize by allowing dynamic creation of classes based on
# its bundled copy of BeautifulSoup (which was necessary because of dependency
# problems)
def _create_bs_classes(bs,
icbinbs,
):
class _AbstractBSFormParser(_AbstractSgmllibParser):
bs_base_class = None
def __init__(self, entitydefs=None, encoding=DEFAULT_ENCODING):
_AbstractFormParser.__init__(self, entitydefs, encoding)
self.bs_base_class.__init__(self)
def handle_data(self, data):
_AbstractFormParser.handle_data(self, data)
self.bs_base_class.handle_data(self, data)
def feed(self, data):
try:
self.bs_base_class.feed(self, data)
except SGMLLIB_PARSEERROR, exc:
raise ParseError(exc)
def close(self):
self.bs_base_class.close(self)
self.end_body()
class RobustFormParser(_AbstractBSFormParser, bs):
"""Tries to be highly tolerant of incorrect HTML."""
pass
RobustFormParser.bs_base_class = bs
class NestingRobustFormParser(_AbstractBSFormParser, icbinbs):
"""Tries to be highly tolerant of incorrect HTML.
Different from RobustFormParser in that it more often guesses nesting
above missing end tags (see BeautifulSoup docs).
"""
pass
NestingRobustFormParser.bs_base_class = icbinbs
return RobustFormParser, NestingRobustFormParser
try:
if sys.version_info[:2] < (2, 2):
raise ImportError # BeautifulSoup uses generators
import BeautifulSoup
except ImportError:
pass
else:
RobustFormParser, NestingRobustFormParser = _create_bs_classes(
BeautifulSoup.BeautifulSoup, BeautifulSoup.ICantBelieveItsBeautifulSoup
)
__all__ += ['RobustFormParser', 'NestingRobustFormParser']
#FormParser = XHTMLCompatibleFormParser # testing hack
#FormParser = RobustFormParser # testing hack
def ParseResponseEx(response,
select_default=False,
form_parser_class=FormParser,
request_class=urllib2.Request,
entitydefs=None,
encoding=DEFAULT_ENCODING,
# private
_urljoin=urlparse.urljoin,
_urlparse=urlparse.urlparse,
_urlunparse=urlparse.urlunparse,
):
"""Identical to ParseResponse, except that:
1. The returned list contains an extra item. The first form in the list
contains all controls not contained in any FORM element.
2. The arguments ignore_errors and backwards_compat have been removed.
3. Backwards-compatibility mode (backwards_compat=True) is not available.
"""
return _ParseFileEx(response, response.geturl(),
select_default,
False,
form_parser_class,
request_class,
entitydefs,
False,
encoding,
_urljoin=_urljoin,
_urlparse=_urlparse,
_urlunparse=_urlunparse,
)
def ParseFileEx(file, base_uri,
select_default=False,
form_parser_class=FormParser,
request_class=urllib2.Request,
entitydefs=None,
encoding=DEFAULT_ENCODING,
# private
_urljoin=urlparse.urljoin,
_urlparse=urlparse.urlparse,
_urlunparse=urlparse.urlunparse,
):
"""Identical to ParseFile, except that:
1. The returned list contains an extra item. The first form in the list
contains all controls not contained in any FORM element.
2. The arguments ignore_errors and backwards_compat have been removed.
3. Backwards-compatibility mode (backwards_compat=True) is not available.
"""
return _ParseFileEx(file, base_uri,
select_default,
False,
form_parser_class,
request_class,
entitydefs,
False,
encoding,
_urljoin=_urljoin,
_urlparse=_urlparse,