-
Notifications
You must be signed in to change notification settings - Fork 6
/
features.py
144 lines (115 loc) · 4.97 KB
/
features.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
__author__ = 'Daan van Stigt'
from tokens import Token, XToken, UToken
# Surface forms given to the placeholder tokens that stand in for positions
# before the first and after the last token of a sentence.
START = '<sos>'
END = '<eos>'

# POS tags carried by those placeholder tokens.
START_POS = 'SOS'
END_POS = 'EOS'

# Placeholder tokens, one start/end pair per token class. All arguments are
# positional; the field layout is whatever XToken / UToken declare in the
# `tokens` module. The two classes differ here only in the eighth argument:
# '_' for XToken, the START / END form for UToken.
START_XTOKEN = XToken(
    -1, START, START, START_POS, START_POS,
    '_', -1, '_', '_', '_')
END_XTOKEN = XToken(
    -1, END, END, END_POS, END_POS,
    '_', -1, '_', '_', '_')

START_UTOKEN = UToken(
    -1, START, START, START_POS, START_POS,
    '_', -1, START, '_', '_')
END_UTOKEN = UToken(
    -1, END, END, END_POS, END_POS,
    '_', -1, END, '_', '_')
def shape(word):
    """Return a coarse orthographic signature of ``word``.

    Inspired by spaCy's `token.shape` attribute. Every character is mapped,
    in this order of precedence, to:

    * ``'X'`` if it is uppercase, ``'x'`` if lowercase, ``'d'`` if a digit;
    * ``'b'`` if it is a round, square or curly bracket;
    * itself if it is one of the listed punctuation or special symbols;
    * ``'c'`` for anything else.

    Args:
        word: The string to summarise.

    Returns:
        A string with one output character per input character.

    >>> shape('U.S.-2020')
    'X.X.-dddd'
    """
    # Punctuation and special symbols that are copied through unchanged.
    # (The original tuple also held an empty string, which can never equal a
    # single character and therefore never matched; it is dropped here.)
    kept_as_is = frozenset((
        ',', '.', ';', ':', '?', '!', "'", '"',
        '-', '/', '@', '#', '$', '%', '&',
    ))
    brackets = frozenset(('(', ')', '[', ']', '{', '}'))

    def classify(char):
        """Map a single character to its shape symbol."""
        if char.isupper():
            return 'X'
        if char.islower():
            return 'x'
        if char.isdigit():
            return 'd'
        if char in brackets:
            return 'b'
        if char in kept_as_is:
            return char
        return 'c'

    # Join once instead of growing a string with repeated `+=`.
    return ''.join(classify(char) for char in word)
def get_features(head, dep, line, add_distance=False, add_surrounding=False, add_inbetween=False):
    """Build the string features describing the candidate arc ``head -> dep``.

    Feature-set loosely following McDonald et al. 2006.

    Args:
        head: Token proposed as the head of the arc.
        dep: Token proposed as the dependent of the arc.
        line: List of the sentence's tokens. Token ids are used directly as
            indices into this list.
        add_distance: If true, the basic features are replaced by copies
            stamped with the signed distance ``dep.id - head.id``; the two
            plain distance/position features are dropped. Features added by
            the other two options are not stamped.
        add_surrounding: If true, append POS features over the immediate
            neighbours of ``head`` and ``dep``.
        add_inbetween: If true, append one POS feature for every token that
            lies strictly between ``head`` and ``dep``.

    Returns:
        A tuple of feature strings.

    Raises:
        AssertionError: If ``line`` is not a list, ``head`` / ``dep`` are not
            ``Token`` instances, or a neighbour index is not an ``int``.
    """
    assert isinstance(line, list)
    assert isinstance(head, Token), f'type {type(head)}'
    assert isinstance(dep, Token), f'type {type(dep)}'

    def get_token(index):
        """Return ``line[index]``, or a boundary placeholder if out of range."""
        assert isinstance(index, int), f'id not and int: {index}'
        # The class of the first token decides which placeholders are used.
        is_utok = isinstance(line[0], UToken)
        if 0 <= index < len(line):
            return line[index]
        if index < 0:
            return START_UTOKEN if is_utok else START_XTOKEN
        return END_UTOKEN if is_utok else END_XTOKEN

    # Immediate neighbours; positions outside the sentence yield placeholders.
    head_min_1 = get_token(head.id - 1)
    head_plus_1 = get_token(head.id + 1)
    dep_min_1 = get_token(dep.id - 1)
    dep_plus_1 = get_token(dep.id + 1)

    # Values shared by several features below.
    distance = dep.id - head.id
    head_shape = shape(head.form)
    dep_shape = shape(dep.form)

    # Basic arc features
    features = (
        # Distance and position bias.
        'distance=%d' % distance,
        'head dep id id=%d %d' % (head.id, dep.id),
        # Unigram features
        'head word=%s' % head.form,
        'head shape=%s' % head_shape,
        'head pos=%s' % head.pos,
        'head word pos=%s %s' % (head.form, head.pos),
        'head shape pos=%s %s' % (head_shape, head.pos),
        'dep word=%s' % dep.form,
        'dep shape=%s' % dep_shape,
        'dep pos=%s' % dep.pos,
        'dep word pos=%s %s' % (dep.form, dep.pos),
        'dep shape pos=%s %s' % (dep_shape, dep.pos),
        # Bigram (arc) features
        'head dep word word=%s %s' % (head.form, dep.form),
        'head dep pos pos=%s %s' % (head.pos, dep.pos),
        'head dep word pos=%s %s' % (head.form, dep.pos),
        'head dep pos word=%s %s' % (head.pos, dep.form),
        # Affix features, first with 2 characters, then with 3. Both lengths
        # share the same template names; these are kept as they were.
        'head dep prefix prefix=%s %s' % (head.form[:2], dep.form[:2]),
        # Fixed: the suffix is the last two characters ([-2:]); the previous
        # [:-2] slice took everything *except* the last two characters.
        'head dep suffix suffix=%s %s' % (head.form[-2:], dep.form[-2:]),
        'head dep prefix suffix=%s %s' % (head.form[:2], dep.form[-2:]),
        'head dep suffix prefix=%s %s' % (head.form[-2:], dep.form[:2]),
        'head dep prefix prefix=%s %s' % (head.form[:3], dep.form[:3]),
        'head dep suffix suffix=%s %s' % (head.form[-3:], dep.form[-3:]),
        'head dep prefix suffix=%s %s' % (head.form[:3], dep.form[-3:]),
        'head dep suffix prefix=%s %s' % (head.form[-3:], dep.form[:3]),
        'head dep shape shape=%s %s' % (head_shape, dep_shape),
    )

    if add_distance:
        # Stamp each of the basic features with the distance. The first two
        # entries already encode distance/position and are skipped.
        # NOTE(review): this replaces the feature tuple, so the unstamped
        # features (including the two distance ones) are no longer returned.
        # Confirm that replacing rather than extending is intended.
        features = tuple(f + ' (%d)' % distance for f in features[2:])

    if add_surrounding:
        # POS context of the tokens directly next to head and dep.
        features += (
            'head dep i i+1/i-1 i=%s %s/%s %s' % (head.pos, head_plus_1.pos, dep_min_1.pos, dep.pos),
            'head dep i-1 i/i-1 i=%s %s/%s %s' % (head_min_1.pos, head.pos, dep_min_1.pos, dep.pos),
            'head dep i i+1/i i+1=%s %s/%s %s' % (head.pos, head_plus_1.pos, dep.pos, dep_plus_1.pos),
            'head dep i-1 i/i i+1=%s %s/%s %s' % (head_min_1.pos, head.pos, dep.pos, dep_plus_1.pos),
        )

    if add_inbetween:
        # Tokens strictly between the two ends of the arc, in sentence order.
        if head.id < dep.id:
            betweens = line[head.id + 1:dep.id]
        else:
            betweens = line[dep.id + 1:head.id]
        features += tuple(
            'head between dep=%s %s %s (%d %d)' % (
                head.pos, between.pos, dep.pos,
                between.id - head.id, dep.id - between.id)
            for between in betweens)

    return features
def get_feature_opts(features):
    """Translate short feature-option names into keyword flags.

    Recognised names are ``'dist'``, ``'surround'`` and ``'between'``; they
    map to ``add_distance``, ``add_surrounding`` and ``add_inbetween``
    respectively. Any other entry is ignored.

    Args:
        features: Iterable of option names.

    Returns:
        A dict with a ``True`` entry for every recognised option present.
    """
    flag_names = (
        ('dist', 'add_distance'),
        ('surround', 'add_surrounding'),
        ('between', 'add_inbetween'),
    )
    # Compare with `==` (rather than a dict lookup) so that entries of any
    # type are accepted and silently skipped when they match nothing.
    return {
        flag: True
        for requested in features
        for short_name, flag in flag_names
        if requested == short_name
    }