-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathextraction.py
325 lines (266 loc) · 11.6 KB
/
extraction.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
# Copyright 2017 Peter de Vocht
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import en_core_web_sm
from collections.abc import Iterable
# Load spaCy's small English pipeline (en_core_web_sm) once, at import time.
# The resulting module-level `nlp` callable is what _get_lemma uses to parse words.
nlp = en_core_web_sm.load()
# Dependency labels (tok.dep_) that mark a token as a grammatical subject.
SUBJECTS = {
    "nsubj",
    "nsubjpass",
    "csubj",
    "csubjpass",
    "agent",
    "expl",
}
# Dependency labels (tok.dep_) that mark a token as a grammatical object.
OBJECTS = {
    "dobj",
    "dative",
    "attr",
    "oprd",
}
# Part-of-speech tags (tok.pos_) at which phrase expansion stops.
BREAKER_POS = {
    "CCONJ",
    "VERB",
}
# Lower-cased word forms that are treated as negation markers.
NEGATIONS = {
    "no",
    "not",
    "n't",
    "never",
    "none",
}
def contains_conj(depSet):
    """Report whether *depSet* contains a coordinating conjunction.

    Args:
        depSet: Container tested with ``in``. The callers in this file pass
            a set holding the lower-cased text of a token's right dependents.

    Returns:
        True if any of "and", "or", "nor", "but", "yet", "so" or "for" is a
        member of *depSet*, otherwise False.
    """
    conjunctions = ("and", "or", "nor", "but", "yet", "so", "for")
    return any(word in depSet for word in conjunctions)
def _get_subs_from_conjunctions(subs):
    """Collect further subjects that are coordinated with the given ones.

    For every token in *subs* the right dependents (``tok.rights``) are
    inspected. When one of them is a coordinating conjunction (as decided by
    ``contains_conj`` on their lower-cased text), those right dependents whose
    ``dep_`` is in ``SUBJECTS`` or whose ``pos_`` is ``"NOUN"`` are gathered,
    and the function recurses on everything gathered so far to follow chains
    such as "A, B and C".

    Args:
        subs: Subject tokens to start from.

    Returns:
        A new list with the additional subject tokens (empty if none).
    """
    found = []
    for subject in subs:
        # .rights is a generator, so materialise it before using it twice
        children = list(subject.rights)
        if not contains_conj({child.lower_ for child in children}):
            continue
        found.extend(
            child for child in children
            if child.dep_ in SUBJECTS or child.pos_ == "NOUN"
        )
        if found:
            found.extend(_get_subs_from_conjunctions(found))
    return found
def _get_objs_from_conjunctions(objs):
    """Collect further objects that are coordinated with the given ones.

    For every token in *objs* the right dependents (``tok.rights``) are
    inspected. When one of them is a coordinating conjunction (as decided by
    ``contains_conj`` on their lower-cased text), those right dependents whose
    ``dep_`` is in ``OBJECTS`` or whose ``pos_`` is ``"NOUN"`` are gathered,
    and the function recurses on everything gathered so far.

    Args:
        objs: Object tokens to start from.

    Returns:
        A new list with the additional object tokens (empty if none).
    """
    found = []
    for target in objs:
        # .rights is a generator, so materialise it before using it twice
        children = list(target.rights)
        if not contains_conj({child.lower_ for child in children}):
            continue
        found.extend(
            child for child in children
            if child.dep_ in OBJECTS or child.pos_ == "NOUN"
        )
        if found:
            found.extend(_get_objs_from_conjunctions(found))
    return found
def _find_subs(tok):
    """Look up the dependency tree for subjects belonging to *tok*.

    Walks from ``tok.head`` upwards until a VERB, a NOUN or the root (a token
    that is its own head) is reached.

    Args:
        tok: Token whose subjects are wanted.

    Returns:
        A ``(subjects, negated)`` tuple. For a NOUN ancestor the result is
        ``([ancestor], _is_negated(tok))``. For a VERB ancestor with matching
        left dependents the result is those dependents plus their conjuncts
        and the negation state of the ancestor; without matches the search
        continues from the ancestor unless it is the root. In all other cases
        ``([], False)`` is returned.
    """
    ancestor = tok.head
    while ancestor.pos_ not in ("VERB", "NOUN") and ancestor.head != ancestor:
        ancestor = ancestor.head

    if ancestor.pos_ == "NOUN":
        return [ancestor], _is_negated(tok)
    if ancestor.pos_ != "VERB":
        return [], False

    # NOTE(review): "SUB" is not one of the labels listed in SUBJECTS; if the
    # parser never emits it, this list is always empty - confirm intent.
    subs = [left for left in ancestor.lefts if left.dep_ == "SUB"]
    if subs:
        negated = _is_negated(ancestor)
        subs.extend(_get_subs_from_conjunctions(subs))
        return subs, negated
    if ancestor.head != ancestor:
        # nothing here and not yet at the root: keep climbing
        return _find_subs(ancestor)
    return [], False
def _is_negated(tok):
    """Report whether *tok* has a negation word among its direct dependents.

    Args:
        tok: Token whose ``lefts`` and ``rights`` are examined.

    Returns:
        True if the lower-cased text of any left or right dependent is in
        ``NEGATIONS``, otherwise False.
    """
    neighbours = [*tok.lefts, *tok.rights]
    return any(child.lower_ in NEGATIONS for child in neighbours)
def _find_svs(tokens):
    """Build (subject, verb) pairs for every VERB token.

    Args:
        tokens: Iterable of parsed tokens.

    Returns:
        A list of ``(subject_text, verb_text)`` tuples built from ``orth_``.
        The verb text is prefixed with "!" when ``_get_all_subs`` reports the
        verb as negated. Verbs without subjects contribute nothing.
    """
    pairs = []
    verbs = [tok for tok in tokens if tok.pos_ == "VERB"]
    for verb in verbs:
        subjects, negated = _get_all_subs(verb)
        verb_text = "!" + verb.orth_ if negated else verb.orth_
        pairs.extend((subject.orth_, verb_text) for subject in subjects)
    return pairs
def _get_objs_from_prepositions(deps, is_pas):
    """Collect objects that hang off prepositions among *deps*.

    Args:
        deps: Tokens to scan (callers pass a verb's right dependents).
        is_pas: Truthy when the sentence is treated as passive. This
            additionally accepts "agent" adpositions and "pobj" dependents.

    Returns:
        A list of the right dependents of every qualifying adposition whose
        ``dep_`` is in ``OBJECTS``, which are the pronoun "me", or (passive
        only) whose ``dep_`` is "pobj".
    """
    def introduces_object(dep):
        if dep.pos_ != "ADP":
            return False
        return dep.dep_ == "prep" or (is_pas and dep.dep_ == "agent")

    def is_object(tok):
        if tok.dep_ in OBJECTS:
            return True
        if tok.pos_ == "PRON" and tok.lower_ == "me":
            return True
        return is_pas and tok.dep_ == 'pobj'

    found = []
    for dep in deps:
        if introduces_object(dep):
            found.extend(tok for tok in dep.rights if is_object(tok))
    return found
def _get_objs_from_attrs(deps, is_pas):
    """Find a verb (and its objects) attached to an attribute noun in *deps*.

    Scans *deps* for NOUN tokens with ``dep_ == "attr"``; for each VERB among
    such a noun's right dependents, gathers that verb's direct objects plus
    its prepositional objects.

    Args:
        deps: Tokens to scan.
        is_pas: Passive flag forwarded to ``_get_objs_from_prepositions``.

    Returns:
        ``(verb, objects)`` for the first verb that has at least one object,
        or ``(None, None)`` when there is none.
    """
    for dep in deps:
        if dep.pos_ != "NOUN" or dep.dep_ != "attr":
            continue
        attached_verbs = [tok for tok in dep.rights if tok.pos_ == "VERB"]
        for verb in attached_verbs:
            children = list(verb.rights)
            objs = [child for child in children if child.dep_ in OBJECTS]
            objs.extend(_get_objs_from_prepositions(children, is_pas))
            if objs:
                return verb, objs
    return None, None
def _get_obj_from_xcomp(deps, is_pas):
    """Find an open-complement verb (xcomp) in *deps* together with its objects.

    Args:
        deps: Tokens to scan (callers pass a verb's right dependents).
        is_pas: Passive flag forwarded to ``_get_objs_from_prepositions``.

    Returns:
        ``(verb, objects)`` for the first VERB token with ``dep_ == "xcomp"``
        that has at least one direct or prepositional object, or
        ``(None, None)`` when there is none.
    """
    for dep in deps:
        if not (dep.pos_ == "VERB" and dep.dep_ == "xcomp"):
            continue
        children = list(dep.rights)
        objs = [child for child in children if child.dep_ in OBJECTS]
        objs.extend(_get_objs_from_prepositions(children, is_pas))
        if objs:
            return dep, objs
    return None, None
def _get_all_subs(v):
    """Return every subject of verb *v* along with the verb's negation state.

    Args:
        v: Verb token.

    Returns:
        A ``(subjects, negated)`` tuple. If *v* has left dependents whose
        ``dep_`` is in ``SUBJECTS`` (determiners excluded), they are returned
        with their conjuncts and ``_is_negated(v)``. Otherwise both values
        come from ``_find_subs(v)``.
    """
    negated = _is_negated(v)
    direct = [
        left for left in v.lefts
        if left.dep_ in SUBJECTS and left.pos_ != "DET"
    ]
    if not direct:
        # no subject next to the verb: search higher up the tree instead
        inherited, negated = _find_subs(v)
        return list(inherited), negated
    direct.extend(_get_subs_from_conjunctions(direct))
    return direct, negated
def _find_verbs(tokens):
    """Pick the verbs to analyse from *tokens*.

    Args:
        tokens: Iterable of parsed tokens.

    Returns:
        The list of non-auxiliary verbs (per ``_is_non_aux_verb``); if that
        list is empty, every token accepted by ``_is_verb`` instead.
    """
    main_verbs = [tok for tok in tokens if _is_non_aux_verb(tok)]
    if main_verbs:
        return main_verbs
    return [tok for tok in tokens if _is_verb(tok)]
# is the token a verb? (excluding auxiliary verbs)
def _is_non_aux_verb(tok):
return tok.pos_ == "VERB" and (tok.dep_ != "aux" and tok.dep_ != "auxpass")
# is the token a verb? (excluding auxiliary verbs)
def _is_verb(tok):
return tok.pos_ == "VERB" or tok.pos_ == "AUX"
def _right_of_verb_is_conj_verb(v):
    """Detect a "VERB CCONJ VERB" pattern (e.g. "he beat and hurt me").

    Args:
        v: Verb token.

    Returns:
        ``(True, other_verb)`` when the first right dependent of *v* is a
        CCONJ and a later right dependent is a non-auxiliary verb (the first
        such verb is returned); otherwise ``(False, v)``.
    """
    # .rights is a generator; a list is needed for len() and slicing
    children = list(v.rights)
    if len(children) < 2 or children[0].pos_ != 'CCONJ':
        return False, v

    partner = next(
        (tok for tok in children[1:] if _is_non_aux_verb(tok)),
        None,
    )
    if partner is None:
        return False, v
    return True, partner
def _get_all_objs(v, is_pas):
    """Gather every object of verb *v* for an active or passive sentence.

    Objects come from three places: right dependents of *v* whose ``dep_`` is
    in ``OBJECTS`` (or "pobj" when passive), objects of prepositions to the
    right of *v*, and objects of an xcomp verb to the right of *v*. Conjoined
    objects of whatever was found are appended at the end.

    Args:
        v: Verb token.
        is_pas: Truthy when the sentence is treated as passive.

    Returns:
        ``(verb, objects)``. ``verb`` is the xcomp verb when one with objects
        was found, otherwise *v* itself. ``objects`` may be empty.
    """
    # .rights is a generator and is needed several times
    children = list(v.rights)

    objs = [
        child for child in children
        if child.dep_ in OBJECTS or (is_pas and child.dep_ == 'pobj')
    ]
    objs.extend(_get_objs_from_prepositions(children, is_pas))

    # The attribute-based lookup (_get_objs_from_attrs) is intentionally not
    # consulted here; only the xcomp route may substitute the verb.
    xcomp_verb, xcomp_objs = _get_obj_from_xcomp(children, is_pas)
    if xcomp_verb is not None and xcomp_objs:
        objs.extend(xcomp_objs)
        v = xcomp_verb

    if objs:
        objs.extend(_get_objs_from_conjunctions(objs))
    return v, objs
# return true if the sentence is passive - at he moment a sentence is assumed passive if it has an auxpass verb
def _is_passive(tokens):
for tok in tokens:
if tok.dep_ == "auxpass":
return True
return False
# resolve a 'that' where/if appropriate
def _get_that_resolution(toks):
for tok in toks:
if 'that' in [t.orth_ for t in tok.lefts]:
return tok.head
return toks
def _get_lemma(word: str):
    """Return the lemma of a single word (a simple lemma-based stemmer).

    Args:
        word: Text to run through the module-level ``nlp`` pipeline.

    Returns:
        The ``lemma_`` of the only token when *word* parses to exactly one
        token; otherwise *word* unchanged.
    """
    parsed = nlp(word)
    return parsed[0].lemma_ if len(parsed) == 1 else word
def printDeps(toks):
    """Print one debugging line per token describing its place in the parse.

    Each line shows, space-separated: the token text, its dependency label,
    its part-of-speech tag, the text of its head, the list of texts of its
    left dependents and the list of texts of its right dependents.

    Args:
        toks: Iterable of parsed tokens.
    """
    for tok in toks:
        fields = [tok.orth_, tok.dep_, tok.pos_, tok.head.orth_]
        fields.append([left.orth_ for left in tok.lefts])
        fields.append([right.orth_ for right in tok.rights])
        print(*fields)
def expand(item, tokens, visited):
    """Expand a subject/object token into the tokens that make up its phrase.

    Args:
        item: Token to expand. If its lower-cased text is "that" it is first
            replaced by the result of ``_get_that_resolution(tokens)``.
        tokens: The parsed sentence; used only for the "that" resolution.
        visited: Set of token indices (``tok.i``) that were already expanded.
            It is updated in place and guards against repeated recursion.

    Returns:
        A list consisting of the left dependents of *item*, *item* itself and
        its right dependents. Each side stops at the first token whose
        ``pos_`` is in ``BREAKER_POS`` and skips negation words. If the first
        right dependent of the last collected part is a DET or NOUN that has
        not been visited yet, its own expansion is appended.
    """
    def collect(children):
        # keep dependents up to the first breaker, dropping negation words
        kept = []
        for child in children:
            if child.pos_ in BREAKER_POS:
                break
            if child.lower_ not in NEGATIONS:
                kept.append(child)
        return kept

    if item.lower_ == 'that':
        item = _get_that_resolution(tokens)

    # item may now be something without lefts/rights, hence the hasattr checks
    parts = []
    if hasattr(item, 'lefts'):
        parts.extend(collect(item.lefts))
    parts.append(item)
    if hasattr(item, 'rights'):
        parts.extend(collect(item.rights))

    tail = parts[-1]
    if hasattr(tail, 'rights'):
        # only the first right dependent of the tail is ever considered
        first_right = next(iter(tail.rights), None)
        if (
            first_right is not None
            and first_right.pos_ in ("DET", "NOUN")
            and first_right.i not in visited
        ):
            visited.add(first_right.i)
            parts.extend(expand(first_right, tokens, visited))
    return parts
def to_str(tokens):
    """Join the text of a sequence of tokens into one space-separated string.

    Args:
        tokens: Iterable of objects exposing ``text``; any non-iterable value
            is accepted and yields an empty string.

    Returns:
        The ``text`` of each element joined by single spaces, or ``''`` when
        *tokens* is not an ``Iterable``.
    """
    if not isinstance(tokens, Iterable):
        return ''
    return ' '.join(item.text for item in tokens)
def findSVOs(tokens):
    """Extract subject-verb-object triples from a parsed sentence.

    Verbs are chosen with ``_find_verbs``; verbs without any subject are
    skipped. Subjects and objects are expanded to phrases with ``expand`` and
    rendered with ``to_str``. A verb is prefixed with "!" when either the verb
    or the object is negated.

    In a passive sentence (see ``_is_passive``) the object phrase comes first,
    the subject phrase last, and the verb is given as its lemma. In an active
    sentence the order is subject, verb, object and the verb is lower-cased
    text. For a "VERB CCONJ VERB" pattern two triples are produced per
    subject/object pair, one for each verb.

    Args:
        tokens: The parsed sentence (iterable of tokens).

    Returns:
        A list of 3-tuples of strings. When a verb without a conjoined verb
        has no objects, a 2-tuple ``(subject, verb)`` is produced for each of
        its subjects instead.
    """
    svos = []
    is_pas = _is_passive(tokens)
    verbs = _find_verbs(tokens)
    visited = set()  # recursion detection, shared by every expand() call

    def phrase(tok):
        return to_str(expand(tok, tokens, visited))

    def triple(sub, verb, obj, negated):
        # passive reverses object / subject and reports the verb's lemma
        if is_pas:
            first, verb_text, last = obj, verb.lemma_, sub
        else:
            first, verb_text, last = sub, verb.lower_, obj
        if negated:
            verb_text = "!" + verb_text
        # expand() mutates `visited`, so `first` must be expanded before `last`
        return phrase(first), verb_text, phrase(last)

    for v in verbs:
        subs, verb_negated = _get_all_subs(v)
        # hopefully there are subs; if not, don't examine this verb any longer
        if not subs:
            continue

        has_conj_verb, conj_verb = _right_of_verb_is_conj_verb(v)
        if has_conj_verb:
            v2, objs = _get_all_objs(conj_verb, is_pas)
            for sub in subs:
                for obj in objs:
                    obj_negated = _is_negated(obj)
                    negated = verb_negated or obj_negated
                    svos.append(triple(sub, v, obj, negated))
                    svos.append(triple(sub, v2, obj, negated))
            continue

        main_verb, objs = _get_all_objs(v, is_pas)
        for sub in subs:
            if not objs:
                # no obj - just return the SV parts
                svos.append((
                    phrase(sub),
                    "!" + main_verb.lower_ if verb_negated else main_verb.lower_,
                ))
                continue
            for obj in objs:
                obj_negated = _is_negated(obj)
                negated = verb_negated or obj_negated
                svos.append(triple(sub, main_verb, obj, negated))
    return svos