-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathparse.py
70 lines (59 loc) · 2.01 KB
/
parse.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
import json
import sys
import types
from pprint import pprint
from xml import sax
def addElem(subelemStack, holder, content):
    """Append ``content`` to the text stored under a nested key path.

    ``subelemStack`` is a non-empty list of keys. Every key except the last
    names a nested dict inside ``holder`` (created on demand); the last key
    names the string value that ``content`` is concatenated onto.

    An intermediate value that is not a dict is replaced by an empty dict,
    discarding whatever was stored there. If the final key already maps to a
    dict, ``content`` is dropped and nothing changes.

    ``holder`` is mutated in place; nothing is returned.
    """
    leaf = subelemStack[-1]

    # Walk down to the dict that owns the leaf key.
    node = holder
    for key in subelemStack[:-1]:
        child = node.get(key)
        if not isinstance(child, dict):
            # Missing, or previously stored text: it becomes a container.
            child = {}
            node[key] = child
        node = child

    # A leaf that already acts as a container takes no text of its own.
    if isinstance(node.setdefault(leaf, ''), dict):
        return
    node[leaf] += content
class WikiContentHandler(sax.ContentHandler):
    """SAX handler that prints one JSON object per ``<revision>`` of a ``<page>``.

    Character data found inside a ``<page>`` but outside any ``<revision>`` is
    collected into ``pageElems``; character data inside a ``<revision>`` is
    collected into ``revisionElems``. Content of ``<text>`` elements is
    skipped. When a ``<page>`` closes, each of its revisions is merged over
    the page-level fields and written to standard output as one JSON line.

    NOTE(review): the element names suggest a MediaWiki XML export -- confirm
    against the actual input.
    """

    def startDocument(self):
        """Reset all parser state at the beginning of a document."""
        self.elemStack = []   # names of the currently open elements
        self.ignore = False   # True while inside a <text> element
        # Initialised here so that a <revision> closing before any <page>
        # was opened does not fail with AttributeError.
        self.pageElems = {}
        self.revisions = []
        self.revisionElems = {}

    def startElement(self, name, attrs):
        """Push ``name`` on the element stack and open page/revision state.

        ``attrs`` is accepted for the SAX interface but not used.
        """
        self.elemStack.append(name)
        if name == 'page':
            self.pageElems = {}
            self.revisions = []
        if name == 'revision':
            self.revisionElems = {}
        if name == 'text':
            self.ignore = True

    def characters(self, content):
        """Store ``content`` under the path of the currently open elements.

        Raises:
            ValueError: if a ``revision`` element appears on the stack above
                the ``page`` element that should contain it.
        """
        stack = self.elemStack
        if self.ignore or 'page' not in stack:
            return

        innermost = len(stack) - 1
        page_at = stack.index('page')
        if page_at == innermost:
            # Text directly inside <page> itself is not recorded.
            return

        if 'revision' not in stack:
            addElem(stack[page_at + 1:], self.pageElems, content)
            return

        revision_at = stack.index('revision')
        if page_at > revision_at:
            # The original raised an undefined name ``Error`` here, which
            # surfaced as NameError instead of this message.
            raise ValueError('Bad XML stack %s' % str(stack))
        if revision_at == innermost:
            # Text directly inside <revision> itself is not recorded.
            return
        addElem(stack[revision_at + 1:], self.revisionElems, content)

    def endElement(self, name):
        """Pop the element stack and flush completed pages/revisions."""
        self.elemStack.pop()
        if name == 'text':
            self.ignore = False
        if name == 'page':
            for revision in self.revisions:
                # Revision fields win over page fields with the same key.
                merged = dict(self.pageElems)
                merged.update(revision)
                json.dump(merged, sys.stdout)
                sys.stdout.write("\n")
            self.pageElems = {}
            self.revisions = []
        if name == 'revision':
            self.revisions.append(self.revisionElems)
            self.revisionElems = {}
# Script body: stream the XML document from standard input through the
# handler, which writes its JSON lines to standard output as pages close.
handler = WikiContentHandler()
sax.parse(sys.stdin, handler)