parse_datasets.py · 134 lines (95 loc) · 4.03 KB
import random
import re
from glob import iglob

from lxml.etree import iterparse, tostring

# bn2wn (the BabelNet-to-WordNet mapping), process_text and
# get_longest_lemma_from_anchor are used below but are not defined in this
# file; they are expected to be provided elsewhere in the project.


def parse_eurosense(xml_path):
    sentences = []
    context = iterparse(xml_path, events=('start', 'end'))
    for idx, (_, element) in enumerate(context):
        # only process valid sentence elements
        if element.tag == 'sentence' and 'id' in element.attrib:
            # get the English text
            eng = element.xpath('text[@lang="en"]')
            if not eng or not eng[0].text:
                continue
            sentence = process_text(eng[0].text)
            # get the English annotations
            annotations = element.xpath('annotations/annotation[@lang="en"]')
            anchor2lemma, lemma2synset = dict(), dict()
            # extract lemma/synset pairs
            for child in annotations:
                bn = child.text
                anchor = process_text(child.get('anchor').lower())
                if bn in bn2wn and anchor:
                    lemma = '_'.join(child.get('lemma').split()).lower()
                    anchor2lemma[anchor] = lemma
                    lemma2synset[lemma] = bn2wn[bn]
            # replace annotated anchors with their lemma_synset pair, longest anchors first
            sorted_anchors = sorted(anchor2lemma.keys(), key=len, reverse=True)
            for i, anchor in enumerate(sorted_anchors):
                # skip anchors already contained in a longer anchor handled before
                if anchor not in ' '.join(sorted_anchors[:i]):
                    lemm_anchor = anchor2lemma[anchor]
                    longest_lemma = get_longest_lemma_from_anchor(lemm_anchor, anchor2lemma.values())
                    synset = lemma2synset[longest_lemma]
                    # escape the anchor so regex metacharacters are matched literally
                    old = r'\b{}\b'.format(re.escape(anchor))
                    new = '{}_{}'.format(longest_lemma, synset)
                    sentence = re.sub(old, new, sentence)
            sentences.append(sentence.lower().split())
        element.clear()
    return sentences


def parse_sew(path):
    sentences = []
    for i, xml in enumerate(iglob(path)):
        # stop after the first 4M sentences
        if len(sentences) > 4_000_000:
            print(i)
            break
        context = iterparse(xml, events=('start', 'end'))
        for idx, (_, element) in enumerate(context):
            if element.tag.lower() == 'wikiarticle':
                # article text
                article_text = process_text(element.xpath('text')[0].text)
                # all annotations
                annotations = element.xpath('annotations/annotation')
                for child in annotations:
                    bn = child.xpath('babelNetID')
                    mention = child.xpath('mention')
                    if not bn or not mention or not mention[0].text:
                        continue
                    bn = bn[0].text
                    if bn in bn2wn:
                        anchor = process_text(mention[0].text)
                        new_anchor = '_'.join(anchor.split())
                        # this replacement technique works about 90% of the time;
                        # escaping keeps regex metacharacters in the mention literal
                        old = r'\b{}\b'.format(re.escape(anchor))
                        new = '{}_{}'.format(new_anchor, bn2wn[bn])
                        article_text = re.sub(old, new, article_text, count=1)
                # randomly pick 20% of the article sentences
                article_sents = article_text.split('\n')
                selected_sents = random.sample(article_sents, int(0.2 * len(article_sents)))
                for s in selected_sents:
                    sentences.append(s.split())
                # only need one article element per file
                element.clear()
                break
    return sentences


def parse_trainomatic(path):
    sentences = []
    for i, xml in enumerate(iglob(path)):
        context = iterparse(xml, events=('start', 'end'))
        for idx, (_, element) in enumerate(context):
            if element.tag.lower() == 'corpus':
                child = element.xpath('lexelt')
                if not child:
                    continue
                lemma = child[0].get('item').split('.')[0]
                instances = child[0].xpath('instance')
                for ins in instances:
                    # use a separate name for the instance context so the
                    # iterparse context above is not shadowed
                    answer, instance_context = ins.xpath('answer'), ins.xpath('context')
                    if answer and instance_context:
                        wn = ins.xpath('answer/@senseId')[0].split(':')[1]
                        pair = '{}_{}'.format(lemma, wn)
                        sentence = tostring(instance_context[0], method='text', encoding=str).lower()
                        sentence = sentence.replace(lemma, pair)
                        sentences.append(sentence.split())
                # only one corpus element per file
                element.clear()
                break
    return sentences
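

# Minimal usage sketch (illustrative only): the paths and glob patterns below
# are hypothetical placeholders, not locations taken from this repository, and
# the bn2wn / process_text / get_longest_lemma_from_anchor helpers must be in
# scope before the parsers are called.
if __name__ == '__main__':
    eurosense_sentences = parse_eurosense('path/to/eurosense.xml')
    sew_sentences = parse_sew('path/to/sew/*/*.xml')
    trainomatic_sentences = parse_trainomatic('path/to/train-o-matic/EN/*.xml')
    print(len(eurosense_sentences), len(sew_sentences), len(trainomatic_sentences))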