-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfeature_engineering.py
238 lines (223 loc) · 8.8 KB
/
feature_engineering.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
"""This file generates features from Abstract Meaning Representations and creates a dataset that can be used for multiclass-classification.
"""
import pandas as pd
from collections import Counter
import features
# ---------------------------------------------------------------------------
# Feature table
#
# Every feature is produced by calling a function from the ``features``
# module with zero or more leading arguments followed by the data frame, and
# storing the result under a column name.  The table below lists the features
# in the exact order in which the columns must appear in the output file.
# ---------------------------------------------------------------------------

# Words passed to features.focus_word; each word is also its column name.
_FOCUS_WORDS = (
    "person", "capable-01", "relation-03", "game", "edge", "phase",
    "something", "process-02", "plant", "possible-01", "lack-01", "water",
    "juice", "sequence-01", "area", "soup", "report-01", "head-01",
    "play-01", "status", "contact-01", "deficiency", "joy", "thing",
    "matter", "certificate", "match-03", "consist-01", "world", "work-01",
    "language", "degree", "state", "place", "way", "interest", "move-01",
    "train", "use-01", "surface", "act-02", "lead-03", "play-11",
)

# (search string, column name) pairs passed to features.part_of_AMR.
# The search string sometimes differs from the column name: a trailing
# space (":part ", ":time ", ":location ", ":manner ") or bracket ("man)")
# presumably stops the string from also matching a longer token such as
# ":part-of" or "manner" -- TODO confirm against features.part_of_AMR.
_AMR_PARTS = (
    (":poss", ":poss"),
    (":part ", ":part"),
    (":part-of", ":part-of"),
    (":time-of", ":time-of"),
    (":time ", ":time"),
    (":source", ":source"),
    (":polarity", ":polarity"),
    (":domain", ":domain"),
    (":topic", ":topic"),
    (":location-of", ":location-of"),
    (":location ", ":location"),
    (":manner ", ":manner"),
    (":manner-of", ":manner-of"),
    (":purpose", ":purpose"),
    ("have-rel-role-91", "have-rel-role-91"),
    ("ordinal-entity", "ordinal-entity"),
    (":accompanier", ":accompanier"),
    (":value", ":value"),
    ("date-entity", "date-entity"),
    ("date-interval", "date-interval"),
    (":consist-of", ":consist-of"),
    (":instrument", ":instrument"),
    ("cause-01", "cause-01"),
    (":medium", ":medium"),
    (":name", ":name"),
    ("include-91", "include-91"),
    ("have-org-role-91", "have-org-role-91"),
    ("type", "type"),
    ("man)", "man"),
    ("team", "team"),
    ("department", "department"),
    ("commission", "commission"),
    ("lead", "lead"),
    ("responsible", "responsible"),
    ("capable", "capable"),
    ("obtain", "obtain"),
    ("create", "create"),
    ("need", "need"),
)

# Role labels passed to features.first_role; each label is also its column
# name.
_FIRST_ROLES = (
    ":ARG0", ":ARG1", ":ARG2", ":ARG3",
    ":ARG0-of", ":ARG1-of", ":ARG2-of",
    ":mod",
)

# Ordered list of (column name, feature function, leading arguments).
# The data frame is always appended as the last positional argument.
_FEATURE_SPECS = (
    [
        ("sense_with_numbers", features.focus_numbers, ()),
        ("and", features.focus_and_or, ("and",)),
        ("or", features.focus_and_or, ("or",)),
        ("no_numbers_and_or", features.focus_entity, ()),
    ]
    + [(word, features.focus_word, (word,)) for word in _FOCUS_WORDS]
    + [
        (column, features.part_of_AMR, (needle,))
        for needle, column in _AMR_PARTS
    ]
    + [(role, features.first_role, (role,)) for role in _FIRST_ROLES]
    + [
        ("nodes_count_11+", features.count_higher, ("/", 10)),
        ("nodes_count_2-", features.count_lower, ("/", 3)),
        ("newlines_count_2-", features.count_lower, ("\n", 3)),
        ("newlines_count_16+", features.count_higher, ("\n", 15)),
        ("relation_in_AMR", features.relation_in_amr, ()),
        ("one_bracket", features.end_brackets, ("equal", 1)),
        ("end_brackets_5+", features.end_brackets, ("greater", 5)),
        ("adj_count_2+", features.POS_occurance, ("ADJ", 1)),
        ("verb_count_2+", features.POS_occurance, ("VERB", 1)),
        ("adverb_count_1+", features.POS_occurance, ("ADV", 0)),
        ("verb_first_POS", features.first_POS, ("VERB",)),
        ("noun_first_POS", features.first_POS, ("NOUN",)),
        ("propn_first_POS", features.first_POS, ("PROPN",)),
    ]
)

# ---------------------------------------------------------------------------
# Pipeline
# ---------------------------------------------------------------------------

# Load the tab-separated input and keep only the two columns used below.
df = pd.read_csv('dataset_with_amrs.txt', sep="\t", header=0)
df = df[["AMR", "Relation"]]

# Deletes all rows with LEX compounds.  The explicit copy makes the filtered
# frame independent of the frame it was selected from, so the column
# assignments below never target a view of another object.
df = df.loc[df["Relation"] != "LEX"].copy()

# Columns are added to ``df`` one at a time and in table order on purpose:
# each feature function receives the frame including every column added so
# far, exactly as in the original hand-written sequence.
for column_name, feature_func, leading_args in _FEATURE_SPECS:
    df[column_name] = feature_func(*leading_args, df)

# The raw AMR text is only needed to derive the features; the exported
# dataset keeps "Relation" followed by the feature columns.
df = df.drop('AMR', axis=1)
df.to_csv('amr_features_compound_relation.csv', sep=';', index=False)