-
Notifications
You must be signed in to change notification settings - Fork 29
/
Copy pathrow_processor.py
40 lines (33 loc) · 1.27 KB
/
row_processor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
from lxml import etree
from itertools import islice, chain
import six
# Efficient parsing of large XML files from
# http://stackoverflow.com/a/9814580/987185
def parse(fp):
    """Efficiently parse an XML file from the StackExchange data dump.

    Returns a generator which yields one row at a time. Each yielded item is
    a plain ``dict`` snapshot of the attributes of a ``row`` element, so it
    stays valid after the element itself has been cleared and released.

    :param fp: the XML source, handed unchanged to ``etree.iterparse``.
    :raises AssertionError: if a ``row`` element carries text content.
    """
    context = etree.iterparse(fp, events=("end",))

    for _event, elem in context:
        if elem.tag == "row":
            # Raised explicitly instead of via ``assert`` so the check is
            # not stripped when Python runs with -O.
            if elem.text is not None:
                raise AssertionError("The row wasn't empty")
            # Copy the attributes: ``elem.attrib`` is a live view of the
            # element, and ``elem.clear()`` below would empty it under any
            # consumer that kept the yielded row around. Copying also keeps
            # references to Element objects from escaping this loop.
            yield dict(elem.attrib)

        # cleanup
        # first empty children from current element
        # This is not absolutely necessary if you are also deleting
        # siblings, but it will allow you to free memory earlier.
        elem.clear()
        # second, delete previous siblings (records)
        while elem.getprevious() is not None:
            del elem.getparent()[0]
def batch(iterable, size):
    """Lazily split `iterable` into consecutive batches of up to `size` items.

    Each batch is itself a lazy iterator drawing from the same underlying
    source, so a batch must be fully consumed before the next one is
    requested; items left unread in a batch end up in the following one.

    :param iterable: any iterable; it is consumed exactly once.
    :param size: maximum number of items per batch. A size of 0 produces
        no batches at all.
    :returns: a generator of iterators; the last batch may be shorter
        than `size`. Nothing is yielded for an empty `iterable`.
    """
    sourceiter = iter(iterable)
    while True:
        batchiter = islice(sourceiter, size)
        # Pull the first item eagerly: it is the only way to find out
        # whether the source is exhausted without yielding an empty batch.
        # Only this call sits inside the ``try`` so that a StopIteration
        # arriving from anywhere else is never mistaken for exhaustion.
        try:
            first = next(batchiter)
        except StopIteration:
            return
        yield chain([first], batchiter)