# xgb_helper.py
import gc
import pickle

import numpy as np
import xgboost as xgb
from hyperopt import hp, tpe, fmin, space_eval, Trials
from hyperopt.pyll.base import scope
from sklearn.multioutput import MultiOutputClassifier
from sklearn.utils import class_weight
from skmultilearn.model_selection.iterative_stratification import iterative_train_test_split
from tensorflow.keras.losses import BinaryCrossentropy

from data_set_params import DataSetParams


def tune_xgb(params, feat_train, labels, trials_filename):
    """
    Tunes the XGBoost hyperparameters with hyperopt.

    Parameters
    ------------
    params : DataSetParams
        Parameters of the model.
    feat_train : ndarray
        Array containing the spectrogram features for each training window of the audio file.
    labels : ndarray
        Class labels in one-hot encoding for each training window of the audio files.
    trials_filename : str
        Name of the file where the previous hyperopt iterations are saved.
    """
print("\n tune xgboost")
space_xgb = { 'eta': hp.choice('eta', np.logspace(-4, -0.522879, num=20)),
'min_child_weight': hp.choice('min_child_weight', [1,2,3]),
'max_depth': scope.int(hp.quniform('max_depth', 3, 15, 1)),
'n_estimators': hp.choice('n_estimators', [100, 500, 1000, 2000]),
'gamma_xgb': hp.choice('gamma_xgb', [0, 0.0001, 0.005, 0.001, 0.005, 0.01]),
'subsample': hp.choice('subsample', [0.7, 0.8, 0.9, 1]),
'scale_pos_weight': hp.choice('scale_pos_weight', [0, 0.25, 0.5, 1, 1.5]),
'objective': 'binary:logistic',
'eval_metric': 'mlogloss',
'model': "xgboost",
'feat_train': feat_train,
'labels': labels
}
    # load the saved trials
    try:
        with open(trials_filename + ".hyperopt", "rb") as f:
            trials = pickle.load(f)
        max_trials = len(trials.trials) + 1
    # otherwise create a new Trials object
    except (FileNotFoundError, EOFError, pickle.UnpicklingError):
        max_trials = 1
        trials = Trials()
    # optimise the objective function with the defined set of XGBoost parameters
    best_space_indices = fmin(obj_func_xgb, space_xgb, trials=trials, algo=tpe.suggest, max_evals=max_trials)
    best_space = space_eval(space_xgb, best_space_indices)
    # drop the data arrays so only the hyperparameters remain
    best_space = {k: best_space[k] for k in best_space.keys() - {'feat_train', 'labels'}}
    with open(trials_filename + ".hyperopt", "wb") as f:
        pickle.dump(trials, f)
    params.eta = best_space['eta']
    params.min_child_weight = best_space['min_child_weight']
    params.max_depth = best_space['max_depth']
    params.n_estimators = best_space['n_estimators']
    params.gamma_xgb = best_space['gamma_xgb']
    params.subsample = best_space['subsample']
    params.scale_pos_weight = best_space['scale_pos_weight']
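

# Usage sketch (illustrative): assuming `feat_train` is an
# (n_windows, n_features) array, `labels` is an (n_windows, n_classes)
# multi-hot array, and `ds_params = DataSetParams()`, a single incremental
# tuning step could look like
#
#     tune_xgb(ds_params, feat_train, labels, trials_filename="xgb_trials")
#
# Each call runs one additional hyperopt evaluation, appends it to the pickled
# "xgb_trials.hyperopt" file, and copies the best configuration found so far
# into the attributes of `ds_params`.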


def obj_func_xgb(args):
    """
    Fits an XGBoost model with the given parameters and returns its validation loss.

    Parameters
    -----------
    args : dict
        Dictionary of all the parameters needed to fit an XGBoost model.

    Returns
    --------
    loss : float
        Weighted binary cross-entropy of the fitted XGBoost on the validation split.
    """
    # split the dataset into a training and a validation set
    train_feat, train_labels, val_feat, val_labels = iterative_train_test_split(args['feat_train'], args['labels'], 0.1)
    # fit XGBoost and compute the validation loss
    params_xgb = DataSetParams()
    params_xgb.eta = args['eta']
    params_xgb.min_child_weight = args['min_child_weight']
    params_xgb.max_depth = args['max_depth']
    params_xgb.n_estimators = args['n_estimators']
    params_xgb.gamma_xgb = args['gamma_xgb']
    params_xgb.subsample = args['subsample']
    params_xgb.scale_pos_weight = args['scale_pos_weight']
    print_params_xgb(params_xgb)
    # one binary XGBoost per class, trained on GPU
    xgb_clf = xgb.XGBClassifier(eta=params_xgb.eta, min_child_weight=params_xgb.min_child_weight, max_depth=params_xgb.max_depth,
                                n_estimators=params_xgb.n_estimators, gamma=params_xgb.gamma_xgb, subsample=params_xgb.subsample,
                                scale_pos_weight=params_xgb.scale_pos_weight, objective="binary:logistic",
                                tree_method='gpu_hist', predictor='gpu_predictor')
    clf = MultiOutputClassifier(xgb_clf)
    sample_w = class_weight.compute_sample_weight('balanced', train_labels)
    clf.fit(train_feat, train_labels, sample_weight=sample_w)
    # predict_proba returns one (n_samples, 2) array per class; keep the
    # positive-class column and stack to (n_samples, n_classes)
    y_pred_val = clf.predict_proba(val_feat)
    y_pred_val = np.array(y_pred_val)[:, :, 1].T
    sample_w = class_weight.compute_sample_weight('balanced', val_labels)
    bce = BinaryCrossentropy()
    loss = bce(val_labels, y_pred_val, sample_weight=sample_w).numpy()
    # free the memory held by the fitted boosters
    for clf_estimator in clf.estimators_:
        clf_estimator._Booster.__del__()
    gc.collect()
    return loss
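

# Debugging sketch (illustrative): obj_func_xgb can also be evaluated for a
# single hand-picked configuration, e.g.
#
#     loss = obj_func_xgb({'eta': 0.05, 'min_child_weight': 1, 'max_depth': 6,
#                          'n_estimators': 100, 'gamma_xgb': 0, 'subsample': 0.8,
#                          'scale_pos_weight': 1, 'feat_train': feat_train,
#                          'labels': labels})
#
# where `feat_train` and `labels` are the same hypothetical arrays passed to
# tune_xgb above; the values shown are placeholders, not recommendations.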


def print_params_xgb(params):
    """
    Prints the parameters of XGBoost.

    Parameters
    ------------
    params : DataSetParams
        Parameters of the model.
    """
    dic = {}
    dic["eta"] = params.eta
    dic["min_child_weight"] = params.min_child_weight
    dic["max_depth"] = params.max_depth
    dic["n_estimators"] = params.n_estimators
    dic["gamma_xgb"] = params.gamma_xgb
    dic["subsample"] = params.subsample
    dic["scale_pos_weight"] = params.scale_pos_weight
    print("params used for XGBoost =", dic)