detection_sw.py
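"""
Train a classical perturbation detector on pre-extracted ReAct features.

The script loads the train/val/test_release instance joblibs of an experiment
directory, optionally re-encodes the `tp_bert` feature with a separately
trained BERT detector, and fits an LR / LightGBM / random forest / decision
tree classifier (optionally grid-search tuned) through
ReactClassicalModelTrainer, writing results under
<experiment_dir>/<MODEL>/<feature_setting>/.
"""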
import os
import resource
import shutil
import sys
from collections import Counter

import joblib
import numpy as np
from lightgbm import LGBMClassifier
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split as train_test_split_skl
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils import shuffle as sk_shuffle

from bertdetector_repr import load_bert_detector, re_encode_with_bertclassifier
from make_offical_dataset_splits import create_ideal_train_test_split
from reactdetect.aggregation.feature_embedders import ReactSampleWiseFeatureEmbedder
from reactdetect.aggregation.label_embedders import ReactLabelEmbedder
from reactdetect.allennlp_wrappers import ReactClassicalModelTrainer, ReactDataLoader
from reactdetect.feature_extraction.feature_extractor import FeatureExtractor
from reactdetect.models.base_models import ReactClassicalModel
from reactdetect.utils.device_casting import move_to_cpu
from reactdetect.utils.feature_dim_names import get_feature_dim_names
from reactdetect.utils.file_io import grab_joblibs, load_json, mkdir_if_dne, path_splitall, rm_rf_dir, vim_write_zz
from reactdetect.utils.magic_vars import SUPPORTED_TARGET_MODELS, SUPPORTED_TARGET_MODEL_DATASETS


def clear_dir(in_dir):
    """
    Clear contents of directory.
    """
    if not os.path.exists(in_dir):
        return -1
    # remove contents of the directory
    for fn in os.listdir(in_dir):
        fp = os.path.join(in_dir, fn)
        # directory
        if os.path.isdir(fp):
            shutil.rmtree(fp)
        # file
        else:
            os.remove(fp)
    return 0


class StubEmbedderSamplewise(ReactSampleWiseFeatureEmbedder):
    """Pass-through embedder: features are already extracted, only their names are exposed."""

    def __init__(self, feature_names, verbose=False):
        super().__init__(use_cache=False, cuda_device=None, verbose=False)
        self.feature_names = feature_names

    def embed_texts(self, X):
        """Do nothing; X already holds the pre-computed feature vectors."""
        return X

    def get_feature_names(self):
        return self.feature_names


def get_feature_names_by_setting(feature_setting):
    if feature_setting == 'bert':
        feature_names = ['tp_bert']
    elif feature_setting == 'bert+tp':
        feature_names = sorted([fcn.__name__ for fcn in FeatureExtractor(add_tags=['tp']).extractors])
    elif feature_setting == 'bert+tp+lm':
        feature_names = sorted([fcn.__name__ for fcn in FeatureExtractor(add_tags=['lm', 'tp']).extractors])
    elif feature_setting == 'bert+tp+lm+tm' or feature_setting == 'all':
        feature_names = sorted([fcn.__name__ for fcn in FeatureExtractor(add_tags=['lm', 'tp', 'tm']).extractors])
    else:
        print('invalid feature setting! must be one of bert / bert+tp / bert+tp+lm / bert+tp+lm+tm / all (same as bert+tp+lm+tm)')
        raise NameError
    # print('you will be using the following features\n', feature_names, '\n')
    return feature_names


def get_feature_dim_names_by_feature_names(feature_names):
    out = []
    for _fname in feature_names:
        out += get_feature_dim_names(_fname)
    return out


def joblib_to_x_and_y(joblib_dir, feature_names, mask_to_binary, args):
    print('--- loading from', joblib_dir, '| mask to binary?', mask_to_binary)
    valid_features = feature_names
    instance_dict = joblib.load(joblib_dir)
    if args.detection_bert_root is not None:
        bert_detector = load_bert_detector(args.detection_bert_root)
        print('loaded bert detector')
        instance_dict = re_encode_with_bertclassifier(instance_dict, bert_detector)
        print('done re-encoding instance_dict')
        del bert_detector
    print('num of instances in the file:', len(instance_dict))
    out_x = []
    out_y = []
    for k in instance_dict:
        instance = instance_dict[k]
        feats = []
        for f in valid_features:
            feat = move_to_cpu(instance['deliverable'][f])
            feats.append(feat)
        feats = np.concatenate([i[0] for i in feats])
        out_x.append(feats)
        if mask_to_binary:
            if k[0] == 'clean':
                out_y.append('clean')
            else:
                out_y.append('perturbed')
        else:  # keep the original (multi-class) label
            out_y.append(k[0])
        unused_feats = [f for f in instance['deliverable'].keys() if f not in valid_features]
        for uf in unused_feats:
            del instance['deliverable'][uf]  # save memory
    del instance_dict
    print('loaded, labels:', Counter(out_y))
    assert len(out_x) == len(out_y)
    return out_x, out_y


def run_experiment(experiment_root_dir, feature_setting, setting, model, test_mode, args):
    if setting == 'clean_vs_all':
        print('--- your labels will be masked to binary')
        MASK_TO_BIN = True
    else:
        MASK_TO_BIN = False
    all_jblbs = grab_joblibs(experiment_root_dir)
    names_of_feats_to_use = get_feature_names_by_setting(feature_setting)
    print('--- you will use these features')
    print(names_of_feats_to_use)
    if model in ('lr', 'LR'):
        output_dir = os.path.join(experiment_root_dir, 'LR', feature_setting)
    elif model in ('lgb', 'LGB'):
        output_dir = os.path.join(experiment_root_dir, 'LGB', feature_setting)
    elif model in ('rf', 'RF'):
        output_dir = os.path.join(experiment_root_dir, 'RF', feature_setting)
    elif model in ('dt', 'DT'):
        output_dir = os.path.join(experiment_root_dir, 'DT', feature_setting)
    else:
        print('model must be lr/lgb/rf/dt!')
        raise TypeError
    if args.detection_bert_root is not None:
        output_dir += '_reencode_with'
        output_dir = os.path.join(
            output_dir,
            '@'.join(path_splitall(args.detection_bert_root)).strip('.').strip()
        )
    print('--- output to', output_dir)
    sample_wise_embedder = StubEmbedderSamplewise(
        feature_names=get_feature_dim_names_by_feature_names(names_of_feats_to_use)
    )
    train_dat_dir = os.path.join(experiment_root_dir, 'train.joblib')
    val_dat_dir = os.path.join(experiment_root_dir, 'val.joblib')
    test_dat_dir = os.path.join(experiment_root_dir, 'test_release.joblib')
    whole_dat_dir = os.path.join(experiment_root_dir, 'data.joblib')
    if train_dat_dir in all_jblbs and val_dat_dir in all_jblbs and test_dat_dir in all_jblbs:
        print('--- looks like you have a pre-split train/val/test(release) split, possibly for ablation analysis')
        x_train, y_train = joblib_to_x_and_y(train_dat_dir, names_of_feats_to_use, MASK_TO_BIN, args)
        x_val, y_val = joblib_to_x_and_y(val_dat_dir, names_of_feats_to_use, MASK_TO_BIN, args)
        x_test, y_test = joblib_to_x_and_y(test_dat_dir, names_of_feats_to_use, MASK_TO_BIN, args)
        training_main(
            model_type=model,
            samplewise_embedder=sample_wise_embedder,
            save_dir=output_dir,
            pre_splitted=True,
            x_train=x_train, x_val=x_val, x_test=x_test,
            y_train=y_train, y_val=y_val, y_test=y_test,
            test_mode=test_mode, args=args
        )
    else:
        print('no train.joblib, val.joblib and test_release.joblib found; did you distribute and make the experiments correctly?')
        print('tried to search for the following:')
        print(train_dat_dir)
        print(val_dat_dir)
        print(test_dat_dir)
        print(whole_dat_dir)


def training_main(model_type, samplewise_embedder, save_dir, pre_splitted: bool,
                  X=None, y=None, x_train=None, x_val=None, x_test=None,
                  y_train=None, y_val=None, y_test=None, test_mode=False, args=None):
    print('--- training main starting')
    import time
    start_time = time.time()
    print('skip if done?', args.skip_if_done)
    if os.path.exists(os.path.join(save_dir, 'model.joblib')) and args.skip_if_done == 'yes':
        print('looks like you already have a model in there, skipping')
        exit(0)
    if pre_splitted is True:
        for _ in [x_train, x_val, x_test, y_train, y_val, y_test]:
            if _ is None:
                print('you have to pass in all of x_train, y_train, x_val, y_val, x_test, y_test when pre_splitted=True')
                raise TypeError
        assert set(y_train) == set(y_val), 'inconsistent label classes in y_train and y_val'
        _ = preprocessing.LabelEncoder().fit(y_train)
        mapping = dict(zip(_.classes_, _.transform(_.classes_)))
        label_embedder = ReactLabelEmbedder(mapping, _.classes_)
    else:
        print('please split the experiment before calling training_main')
        exit(0)
    print(f'\nno. train: {len(y_train):,}')
    print(f'no. val: {len(y_val):,}')
    print(f'no. test: {len(y_test):,}')
    print(f'no. features: {len(x_train[0]):,}')
    # shuffle the data
    x_train, y_train = sk_shuffle(x_train, y_train)
    x_val, y_val = sk_shuffle(x_val, y_val)
    # not shuffling test data
    if args.train_frac < 1.0:
        assert args.train_frac > 0.0
        n_train = int(len(x_train) * args.train_frac)
        x_train = x_train[:n_train]
        y_train = y_train[:n_train]
    print(x_train[0])
    print('\nyour label mapping is')
    print(label_embedder.get_label_mapping())
    print('\nyour inverse label mapping is')
    print(label_embedder.get_inverse_label_mapping())
    train_loader = ReactDataLoader(x_train, y_train)
    val_loader = ReactDataLoader(x_val, y_val)
    test_loader = ReactDataLoader(x_test, y_test)
    print('--- before oversample')
    stats_before_oversample = 'train: ' + str(train_loader) + '\n' + 'val: ' + str(val_loader) + '\n' + 'test: ' + str(test_loader)
    print(stats_before_oversample)
    print('not oversampling test_loader')
    if args.oversample_train == 'yes':
        train_loader.oversample()
        val_loader.oversample()
        # not oversampling the test loader
        print('--- after oversample')
        stats_after_oversample = 'train: ' + str(train_loader) + '\n' + 'val: ' + str(val_loader)
        print(stats_after_oversample)
        print("--- oversampling done, took %s seconds ---" % (time.time() - start_time))
        start_time = time.time()
        vim_write_zz(
            os.path.join(save_dir, 'oversample_statistics.txt'),
            string='pre-oversample\n' + stats_before_oversample + '\npost-oversample\n' + stats_after_oversample
        )
        class_weight = None
    else:
        val_loader.oversample()
        class_weight = 'balanced'
    mkdir_if_dne(save_dir)
    clear_dir(save_dir)
    import multiprocessing
    ncpu = multiprocessing.cpu_count()
    print('btw, you have', ncpu, 'cpus available')
    ss = StandardScaler()
    if model_type in ('lr', 'LR'):
        clf_temp = LogisticRegression(solver=args.solver, penalty=args.penalty, random_state=0,
                                      n_jobs=args.model_n_jobs, class_weight=class_weight)
        param_grid = {'logisticregression__C': [1e-1, 1e0]}
    elif model_type in ('lgb', 'LGB'):
        clf_temp = LGBMClassifier(n_estimators=args.n_estimators, max_depth=args.max_depth,
                                  num_leaves=args.num_leaves, random_state=0, n_jobs=args.model_n_jobs,
                                  class_weight=class_weight)
        param_grid = {'lgbmclassifier__n_estimators': [50, 100],
                      'lgbmclassifier__max_depth': [3, 5],
                      'lgbmclassifier__num_leaves': [2, 15],
                      'lgbmclassifier__boosting_type': ['gbdt']}
    elif model_type in ('rf', 'RF'):
        clf_temp = RandomForestClassifier(n_estimators=args.n_estimators, max_depth=args.max_depth,
                                          random_state=0, n_jobs=args.model_n_jobs, class_weight=class_weight)
        param_grid = {'randomforestclassifier__n_estimators': [50, 100],
                      'randomforestclassifier__max_depth': [3, 5],
                      'randomforestclassifier__min_samples_leaf': [2, 4]}
    elif model_type in ('dt', 'DT'):
        clf_temp = DecisionTreeClassifier(random_state=0, class_weight=class_weight)
        param_grid = {'decisiontreeclassifier__max_depth': [3, 5, None],
                      'decisiontreeclassifier__min_samples_leaf': [1, 2, 4, 10]}
    pipeline = make_pipeline(ss, clf_temp)
    if args.tune == 'yes':
        clf = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=3, verbose=3, n_jobs=args.cv_n_jobs)
    else:
        clf = clf_temp if args.model == 'lgb' else pipeline
    if test_mode:
        clf = clf_temp
    model = ReactClassicalModel(
        classifier=clf,
        sample_wise_embedder=samplewise_embedder,
        label_embedder=label_embedder
    )
    trainer = ReactClassicalModelTrainer(
        model=model,
        serialization_dir=save_dir,
        data_loader=train_loader,
        validation_data_loader=val_loader,
        test_data_loader=test_loader
    )
    print("Starting training")
    trainer.train()
    print("--- %s seconds ---" % (time.time() - start_time))
    print("Finished training")
    max_rss = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
    print(f'max_rss: {max_rss:,}b')


if __name__ == '__main__':
    import argparse

    # required command-line arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('--experiment_dir', type=str, help='dir for that experiment')
    parser.add_argument('--feature_setting', type=str, default='all', help='set of features to use: bert / bert+tp / bert+tp+lm / bert+tp+lm+tm / all.')
    parser.add_argument('--model', type=str, default='lgb', help='specify your model: lr, lgb, rf or dt.')
    parser.add_argument('--skip_if_done', type=str, default='yes', help='skip if an experiment has already been run.')
    parser.add_argument('--test', type=str, default='no', help='if yes, only do a quick test run.')
    parser.add_argument('--model_n_jobs', type=int, default=1, help='no. jobs to run in parallel for the model.')
    parser.add_argument('--cv_n_jobs', type=int, default=1, help='no. jobs to run in parallel for gridsearch.')
    parser.add_argument('--solver', type=str, default='lbfgs', help='LR solver.')
    parser.add_argument('--penalty', type=str, default='l2', help='LR penalty: l1 or l2.')
    parser.add_argument('--train_frac', type=float, default=1.0, help='fraction of train data to train with.')
    parser.add_argument('--oversample_train', type=str, default='no', help='yes for oversampling, no otherwise.')
    parser.add_argument('--n_estimators', type=int, default=100, help='no. boosting rounds for lgb.')
    parser.add_argument('--max_depth', type=int, default=5, help='max. depth for each tree.')
    parser.add_argument('--num_leaves', type=int, default=32, help='no. leaves per tree.')
    parser.add_argument('--tune', type=str, default='yes', help='if yes, then tune, otherwise no.')
    parser.add_argument('--detection_bert_root', type=str, default=None, help='if set, use the bert model in that root to re-encode tp_bert.')
    args = parser.parse_args()
    print('starting', args.experiment_dir)
    if args.model is None:
        print('--- no model arch given, defaulting to lr')
        args.model = 'lr'  # defaulting to logistic regression
    out_dir = args.experiment_dir
    print('--- searching exp data in', out_dir)
    assert os.path.exists(os.path.join(out_dir, 'setting.json')), 'cannot find setting.json in ' + out_dir + ', did you make the experiment?'
    assert os.path.exists(os.path.join(out_dir, 'train.csv')), 'cannot find train.csv in ' + out_dir + ', did you make the experiment?'
    assert os.path.exists(os.path.join(out_dir, 'val.csv')), 'cannot find val.csv in ' + out_dir + ', did you make the experiment?'
    assert os.path.exists(os.path.join(out_dir, 'test_release.csv')), 'cannot find test_release.csv in ' + out_dir + ', did you make the experiment?'
    exp_args = load_json(os.path.join(out_dir, 'setting.json'))
    exp_args['skip_if_done'] = args.skip_if_done
    exp_args['test'] = args.test
    exp_args['model'] = args.model
    exp_args['feature_setting'] = args.feature_setting
    exp_args['model_n_jobs'] = args.model_n_jobs
    exp_args['cv_n_jobs'] = args.cv_n_jobs
    exp_args['solver'] = args.solver
    exp_args['penalty'] = args.penalty
    exp_args['train_frac'] = args.train_frac
    exp_args['oversample_train'] = args.oversample_train
    exp_args['n_estimators'] = args.n_estimators
    exp_args['max_depth'] = args.max_depth
    exp_args['num_leaves'] = args.num_leaves
    exp_args['tune'] = args.tune
    exp_args['detection_bert_root'] = args.detection_bert_root
    exp_args = argparse.Namespace(**exp_args)

    TEST_MODE = False
    if args.test == 'yes':
        TEST_MODE = True

    run_experiment(
        experiment_root_dir=out_dir,
        feature_setting=args.feature_setting,
        setting=exp_args.experiment_setting,
        model=args.model,
        test_mode=TEST_MODE,
        args=exp_args
    )
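
# Example invocation (a minimal sketch, not an official recipe; the experiment
# directory below is hypothetical and assumes the experiment has already been
# created and split, i.e. setting.json, the train/val/test_release csvs and the
# corresponding *.joblib files exist in it):
#
#   python detection_sw.py \
#       --experiment_dir ./experiments/example_experiment \
#       --feature_setting all \
#       --model lgb \
#       --oversample_train no \
#       --tune yes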