-
Notifications
You must be signed in to change notification settings - Fork 0
/
csv2xml.py
113 lines (95 loc) · 3.63 KB
/
csv2xml.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
import csv
import codecs
import re
from lxml import etree
import sys
from collections import defaultdict
def _normalize_sense(raw):
    """Return *raw* as a colon-separated sense label with an upper-cased head.

    Surrounding whitespace is stripped, every '.' is replaced by ':', and the
    text before the first ':' is upper-cased.  A label that contains no
    separator at all therefore ends in a trailing ':'.
    """
    head, _, tail = raw.strip().replace('.', ':').partition(':')
    return head.upper() + ':' + tail


def _make_orth(surface, onr, canonical, discont_pattern):
    """Build one <orth> element, with its single <part> child, for *surface*.

    *discont_pattern* is the regex whose presence in the (unstripped) surface
    form marks the orth as 'discont' instead of 'cont'.
    """
    orth = etree.Element('orth')
    orth.attrib['type'] = 'discont' if re.search(discont_pattern, surface) else 'cont'
    orth.attrib['canonical'] = canonical
    orth.attrib['onr'] = onr
    part = etree.Element('part')
    # NOTE(review): the part type is decided by a carriage-return test, as in
    # the original code; this may have been meant to be r'\s' -- confirm
    # against the intended lexicon schema before changing it.
    part.attrib['type'] = 'phrasal' if re.search(r'\r', surface) else 'single'
    part.text = surface.strip()
    orth.append(part)
    return orth


def create_dict(csvf, outfile='chinese_dimlex.xml', encoding=None):
    """Convert a connective CSV file into an XML lexicon rooted at <dimlex>.

    Every non-empty CSV row must have six columns: entry, variation,
    relation, POS tag, English translation and example.  Rows that share the
    same entry text are grouped into a single <entry> element containing:

    * <orths>: one canonical <orth> for the entry itself plus one
      non-canonical <orth> for each non-empty variation;
    * <syn>: a <cat> holding the POS tag of the entry's last row, followed by
      one <sem> per row (relation sense, example, English equivalent).

    Entries are numbered 'c1', 'c2', ... in order of first appearance.

    Args:
        csvf: Path of the comma-delimited input file.
        outfile: Path of the XML file to write.  Defaults to the file name
            that used to be hard-coded.
        encoding: Text encoding used to read *csvf*.  ``None`` keeps the
            previous behaviour (platform default encoding).

    Raises:
        ValueError: If a non-empty row does not have exactly six columns.
    """
    entries = defaultdict(list)
    # The context manager guarantees the input handle is closed, which the
    # bare codecs.open() call never did.
    with open(csvf, encoding=encoding) as handle:
        reader = csv.reader(handle, delimiter=',')
        for row in reader:
            if not row:
                # csv.reader yields [] for blank lines; nothing to convert.
                continue
            if len(row) != 6:
                raise ValueError(
                    '%s, line %d: expected 6 columns, got %d'
                    % (csvf, reader.line_num, len(row)))
            entry, variation, rel, postag, en_trans, example = row
            entries[entry].append([variation, rel, postag, en_trans, example])

    root = etree.Element('dimlex')
    doc = etree.ElementTree(root)
    for nodeid, (entry, instances) in enumerate(entries.items(), start=1):
        node = etree.Element('entry')
        node.set('id', 'c%d' % nodeid)
        node.set('word', entry.strip())

        orths = etree.Element('orths')
        # The canonical spelling always gets orth number 1.
        orths.append(_make_orth(entry, '%do1' % nodeid, '1', r'\s'))
        orthid = 2
        for variation, _rel, _postag, _en_trans, _example in instances:
            if variation:
                # NOTE(review): variations use a carriage-return test for
                # cont/discont while the canonical form uses whitespace;
                # kept as in the original -- confirm whether r'\s' was meant.
                orths.append(_make_orth(
                    variation, '%do%d' % (nodeid, orthid), '0', r'\r'))
                orthid += 1
        node.append(orths)

        syn = etree.Element('syn')
        catnode = etree.Element('cat')
        # The original read the leaked loop variable here, i.e. the POS tag
        # of the last row of this entry; that choice is now explicit.
        catnode.text = instances[-1][2]
        syn.append(catnode)
        for _variation, rel, _postag, en_trans, example in instances:
            semnode = etree.Element('sem')
            relnode = etree.Element('pdtb3_relation')
            relnode.set('sense', _normalize_sense(rel))
            exnode = etree.Element('example')
            exnode.text = example.strip()
            en_eqnode = etree.Element('english_equivalent')
            en_eqnode.text = en_trans.strip()
            semnode.append(relnode)
            semnode.append(exnode)
            semnode.append(en_eqnode)
            syn.append(semnode)
        node.append(syn)
        root.append(node)

    doc.write(outfile, xml_declaration=True, encoding='utf-8', pretty_print=True)
def merge_dicts(xmlone, xmltwo, outfile='merged_chinese_dimlex.xml'):
    """Merge the <entry> elements of two XML lexicons into one file.

    All <entry> elements found anywhere in *xmlone*, followed by all those
    found in *xmltwo*, are moved under a fresh <dimlex> root.  Their 'id'
    attributes are overwritten with consecutive values 'c1', 'c2', ... so
    that ids stay unique across both inputs.  Entries with the same word in
    both files are not combined; both are kept.

    Args:
        xmlone: Path (or file object) of the first lexicon.
        xmltwo: Path (or file object) of the second lexicon.
        outfile: Path of the merged XML file to write.  Defaults to the file
            name that used to be hard-coded.
    """
    # Keep CDATA sections as they are and do not expand entity references.
    xmlp = etree.XMLParser(strip_cdata=False, resolve_entities=False, encoding='utf-8')
    # Parse both inputs up front so a broken second file fails before any
    # entry has been moved out of the first tree.
    source_roots = [etree.parse(path, parser=xmlp).getroot()
                    for path in (xmlone, xmltwo)]

    root = etree.Element('dimlex')
    doc = etree.ElementTree(root)
    nodeid = 1
    for source_root in source_roots:
        # findall() returns a list, so moving entries out of the source tree
        # while looping over it is safe.
        for entry in source_root.findall('.//entry'):
            entry.set('id', 'c%i' % nodeid)
            nodeid += 1
            root.append(entry)

    doc.write(outfile, xml_declaration=True, encoding='utf-8', pretty_print=True)
def main():
    """Merge the two lexicon XML files named on the command line.

    Expects ``sys.argv[1]`` and ``sys.argv[2]`` to be the two input files and
    passes them to ``merge_dicts``.  When fewer than two file arguments are
    given, the script exits with a usage message instead of an IndexError.
    """
    if len(sys.argv) < 3:
        sys.exit('usage: %s FIRST.xml SECOND.xml' % sys.argv[0])
    # To build a lexicon from a CSV file instead of merging two XML files,
    # call create_dict(sys.argv[1]) here.
    merge_dicts(sys.argv[1], sys.argv[2])
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == '__main__':
    main()