-
Notifications
You must be signed in to change notification settings - Fork 2
/
parse_docx.py
34 lines (28 loc) · 895 Bytes
/
parse_docx.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
# -*- coding: utf-8 -*-
"""
Created on Tue Aug 13 07:11:15 2019
@author: rayde
"""
try:
from xml.etree.cElementTree import XML
except ImportError:
from xml.etree.ElementTree import XML
import zipfile
WORD_NAMESPACE = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'
PARA = WORD_NAMESPACE + 'p'
TEXT = WORD_NAMESPACE + 't'
def parse_docx(path, paragraphs):
"""
Take the path of a docx file as argument and list 'paragraphs', return the text in unicode.
"""
document = zipfile.ZipFile(path)
xml_content = document.read('word/document.xml')
document.close()
tree = XML(xml_content)
for paragraph in tree.getiterator(PARA):
texts = [node.text
for node in paragraph.getiterator(TEXT)
if node.text]
if texts:
paragraphs.append(''.join(texts))
return '\n\n'.join(paragraphs)