# params.yaml
# Input data folders and file names.
# NOTE(review): nesting was reconstructed from the key names after the
# original indentation was lost — confirm against the code that reads it.
data_source:
  data_folders: data

  training_data_folder:
    folder: training
    train: train.csv
    test: test.csv

  prepared:
    folder: prepared
    clean_train: clean_train.csv
# Per-encoder feature-engineering settings, one mapping per encoder.
# NOTE(review): nesting was reconstructed from the key names after the
# original indentation was lost — confirm against the code that reads it.
# Key spellings such as `catagory` and `fequency_encode` are kept exactly,
# because they are lookup keys rather than free text.
featurize:
  binary_encoder:
    columns:
      - RESOURCE
      - ROLE_ROLLUP_1
      - ROLE_ROLLUP_2
      - ROLE_DEPTNAME
      - ROLE_FAMILY_DESC
      - ROLE_FAMILY
      - ROLE_CODE

  combine_feat:
    targetcol: ACTION
    ignore_columns:  # ['ROLE_TITLE']
      - ROLE_TITLE
      - MGR_ID
      - RESOURCE

  resource_catagory_encode:
    column_to_consider:
      - ROLE_DEPTNAME_ROLE_FAMILY
    concat_result_to_input: true

  ktarget_enc:
    n_fold: 15
    random_seed: 2023
    targetcol: ACTION
    # Explicit empty list (a bare `columns:` would load as null instead).
    columns: []
    concat_result_to_input: true

  random_catagory_encode:
    random_seed: 2023
    targetcol: ACTION
    random_cnt: 5
    columns:
      - RESOURCE
      - ROLE_ROLLUP_1
      - ROLE_ROLLUP_2
      - ROLE_DEPTNAME
      - ROLE_FAMILY_DESC
      - ROLE_FAMILY
      - ROLE_CODE

  fequency_encode:
    min_group_size: 2
    n_fold: 5
    random_seed: 2023
    targetcol: ACTION
    log_transform: true
    concat_result_to_input: true
    columns:
      - ROLE_DEPTNAME
      - ROLE_ROLLUP_1_ROLE_DEPTNAME
      - ROLE_ROLLUP_2_ROLE_DEPTNAME
      - ROLE_ROLLUP_2_ROLE_CODE
      - ROLE_DEPTNAME_ROLE_FAMILY_DESC
      - ROLE_DEPTNAME_ROLE_FAMILY
      - ROLE_DEPTNAME_ROLE_CODE

  tfidf:
    random_seed: 2023
    pair_columns:
      - RESOURCE
    permute_columns:
      - ROLE_ROLLUP_1
      - ROLE_ROLLUP_2
      - ROLE_FAMILY
      - ROLE_CODE
      - ROLE_ROLLUP_2_ROLE_CODE
    targetcol: ACTION
    combine_columns_required: false
    concat_result_to_input: true
    # dim_reduction: 1
    # var_explained: 0.2
    # Integer per column name; the commented-out entries are disabled.
    dim_reduction:
      ROLE_ROLLUP_1: 27
      ROLE_ROLLUP_2: 21
      # ROLE_DEPTNAME: 129
      # ROLE_FAMILY_DESC: 328
      ROLE_FAMILY: 15
      ROLE_CODE: 60
      # ROLE_ROLLUP_1_ROLE_DEPTNAME: 160
      # ROLE_ROLLUP_2_ROLE_DEPTNAME: 154
      ROLE_ROLLUP_2_ROLE_CODE: 90
      # ROLE_DEPTNAME_ROLE_FAMILY_DESC: 317
      # ROLE_DEPTNAME_ROLE_FAMILY: 130
      # ROLE_DEPTNAME_ROLE_CODE: 186
    # columns: [ 'ROLE_DEPTNAME', 'ROLE_FAMILY_DESC', 'ROLE_CODE',
    #   'ROLE_ROLLUP_2', 'ROLE_ROLLUP_1', 'ROLE_FAMILY', 'RESOURCE',
    #   'ROLE_ROLLUP_1_ROLE_ROLLUP_2','ROLE_FAMILY_ROLE_CODE',
    #   'ROLE_ROLLUP_1_ROLE_FAMILY', 'ROLE_ROLLUP_2_ROLE_FAMILY',
    #   'ROLE_ROLLUP_1_ROLE_DEPTNAME','ROLE_ROLLUP_2_ROLE_DEPTNAME',
    #   'ROLE_ROLLUP_1_ROLE_CODE', 'ROLE_DEPTNAME_ROLE_FAMILY',
    #   'ROLE_ROLLUP_2_ROLE_CODE']
    output:
      # Single quotes keep the backslash literal; the value is unchanged.
      folder: 'feature\tfidf'
      filename: tfidf.parquet

  count_vector:
    random_seed: 2023
    targetcol: ACTION
    combine_columns_required: false
    concat_result_to_input: true
    columns:
      - RESOURCE
      - ROLE_ROLLUP_1
      - ROLE_ROLLUP_2
      - ROLE_DEPTNAME
      - ROLE_FAMILY_DESC
      - ROLE_FAMILY
      - ROLE_CODE
    permute_columns:
      - RESOURCE_ROLE_ROLLUP_1
      - ROLE_ROLLUP_1_ROLE_ROLLUP_2
      - ROLE_ROLLUP_1_ROLE_FAMILY_DESC
      - ROLE_ROLLUP_1_ROLE_FAMILY
      - ROLE_ROLLUP_1_ROLE_CODE
      - ROLE_FAMILY_DESC_ROLE_FAMILY
      - ROLE_FAMILY_ROLE_ROLLUP_2
      - ROLE_ROLLUP_2_ROLE_FAMILY_DESC
      - ROLE_FAMILY_DESC_ROLE_CODE
      - ROLE_ROLLUP_1_RESOURCE
      - ROLE_ROLLUP_2_ROLE_ROLLUP_1
      - ROLE_FAMILY_DESC_ROLE_ROLLUP_1
      - ROLE_FAMILY_ROLE_ROLLUP_1
      - ROLE_CODE_ROLE_ROLLUP_1
      - ROLE_FAMILY_ROLE_FAMILY_DESC
      - ROLE_ROLLUP_2_ROLE_FAMILY
      - ROLE_FAMILY_DESC_ROLE_ROLLUP_2
      - ROLE_CODE_ROLE_FAMILY_DESC
      - ROLE_ROLLUP_1_ROLE_DEPTNAME
      - ROLE_DEPTNAME_ROLE_ROLLUP_1
    # One integer per entry of permute_columns above, in the same order.
    dim_reduction:
      RESOURCE_ROLE_ROLLUP_1: 1
      ROLE_ROLLUP_1_ROLE_ROLLUP_2: 1
      ROLE_ROLLUP_1_ROLE_FAMILY_DESC: 1
      ROLE_ROLLUP_1_ROLE_FAMILY: 1
      ROLE_ROLLUP_1_ROLE_CODE: 1
      ROLE_FAMILY_DESC_ROLE_FAMILY: 1
      ROLE_FAMILY_ROLE_ROLLUP_2: 3
      ROLE_ROLLUP_2_ROLE_FAMILY_DESC: 4
      ROLE_FAMILY_DESC_ROLE_CODE: 5
      ROLE_ROLLUP_1_RESOURCE: 1
      ROLE_ROLLUP_2_ROLE_ROLLUP_1: 1
      ROLE_FAMILY_DESC_ROLE_ROLLUP_1: 1
      ROLE_FAMILY_ROLE_ROLLUP_1: 1
      ROLE_CODE_ROLE_ROLLUP_1: 1
      ROLE_FAMILY_ROLE_FAMILY_DESC: 1
      ROLE_ROLLUP_2_ROLE_FAMILY: 3
      ROLE_FAMILY_DESC_ROLE_ROLLUP_2: 4
      ROLE_CODE_ROLE_FAMILY_DESC: 5
      ROLE_ROLLUP_1_ROLE_DEPTNAME: 5
      ROLE_DEPTNAME_ROLE_ROLLUP_1: 7
    # dim_reduction: 1
    # var_explained: 0.9
    output:
      # Single quotes keep the backslash literal; the value is unchanged.
      folder: 'feature\cntvector'
      filename: count_vectorizer.parquet
# Train/test split settings.
# NOTE(review): indentation was lost in this copy; `train_data` is assumed
# to belong under this key rather than at the top level — confirm.
train_test_split:
  test_size: 0.1
  random_seed: 7899
  cv: 3
  # Single quotes keep the backslash literal; the value is unchanged.
  train_data: 'feature\train_data.parquet'
# Model selection and per-model settings.
#
# `model_type` names one of the sibling mappings below. Each model mapping
# carries:
#   pipeline_type - boolean switch per feature-encoding step
#   hyper_params  - estimator settings
#   trained_model / eval_metrics / eval_plots - output paths (not present
#     for gbdt_embedding)
#
# NOTE(review): nesting was reconstructed from the key names after the
# original indentation was lost — confirm against the code that reads it.
# Paths are single-quoted so the backslashes stay literal; their values
# are unchanged.
model:
  model_type: logistic_reg

  logistic_reg:
    pipeline_type:
      KFoldTE: false
      frequency_encoding: false
      KFold_frequency_encoding: false
      tfidf_vectorizer_encoding: false
      count_vectorizer_encoding: false
      random_catagory_encode: false
      resource_catagory_encode: false
      binary_encode: true
    hyper_params:
      random_state: 2023
      max_iter: 370
      penalty: l2
      solver: liblinear
      fit_intercept: true
      C: 0.01
      class_weight: balanced
      # NOTE(review): if these are passed to scikit-learn's
      # LogisticRegression, n_jobs is ignored by the liblinear solver —
      # confirm whether it is still wanted.
      n_jobs: -1
    trained_model: 'model\logistic_reg\model.pkl'
    eval_metrics: 'model\logistic_reg\metrics'
    eval_plots: 'model\logistic_reg\plots'

  decision_tree:
    pipeline_type:
      KFoldTE: false
      frequency_encoding: false
      KFold_frequency_encoding: false
      tfidf_vectorizer_encoding: false
      count_vectorizer_encoding: true
      random_catagory_encode: false
      resource_catagory_encode: true
      binary_encode: false
    hyper_params:
      random_state: 1907
      max_depth: 21
      splitter: best
      # min_samples_leaf: 0.001
      # max_features: 0.3  # 0.3
      ccp_alpha: 2.4e-05
      # min_samples_split: 0.01
      # class_weight: balanced
    trained_model: 'model\decision_tree\model.pkl'
    eval_metrics: 'model\decision_tree\metrics'
    eval_plots: 'model\decision_tree\plots'

  bagging_decision_tree:
    pipeline_type:
      KFoldTE: false
      frequency_encoding: true
      KFold_frequency_encoding: false
      tfidf_vectorizer_encoding: false
      count_vectorizer_encoding: false
      random_catagory_encode: false
      resource_catagory_encode: true
      binary_encode: false
    hyper_params:
      base_estimator:
        random_state: 1907
        max_depth: 21
        splitter: best
        ccp_alpha: 2.4e-05
      bagging:
        random_seed: 1907
        n_estimators: 19
        # max_samples: 0.8200000000000001
        # max_features: 0.6000000000000001
    trained_model: 'model\bagging_decision_tree\model.pkl'
    eval_metrics: 'model\bagging_decision_tree\metrics'
    eval_plots: 'model\bagging_decision_tree\plots'

  extra_decision_tree:
    pipeline_type:
      KFoldTE: true
      frequency_encoding: false
      KFold_frequency_encoding: false
      tfidf_vectorizer_encoding: true
      count_vectorizer_encoding: false
      random_catagory_encode: false
      resource_catagory_encode: false
      binary_encode: false
    hyper_params:
      random_state: 56
      n_estimators: 55
      max_depth: 4
      bootstrap: true
      max_samples: 0.75  # Max sample to consider for bootstrap
      max_features: 0.7
      # NOTE(review): if this goes to a scikit-learn tree ensemble, a float
      # is read as a fraction of the samples; 0.5 allows at most one split
      # per tree — confirm this is intended.
      min_samples_leaf: 0.5
      class_weight: balanced  # balanced
    trained_model: 'model\extra_decision_tree\model.pkl'
    eval_metrics: 'model\extra_decision_tree\metrics'
    eval_plots: 'model\extra_decision_tree\plots'

  random_forest:
    pipeline_type:
      KFoldTE: false
      frequency_encoding: true
      KFold_frequency_encoding: false
      tfidf_vectorizer_encoding: true
      count_vectorizer_encoding: false
      random_catagory_encode: false
      resource_catagory_encode: false
      binary_encode: false
    hyper_params:
      random_state: 42
      n_estimators: 220
      max_depth: 5
      bootstrap: true
      max_samples: 0.5  # Max sample to consider for bootstrap
      max_features: 0.5
      min_samples_leaf: 0.005
      class_weight: balanced  # balanced
    trained_model: 'model\random_forest\model.pkl'
    eval_metrics: 'model\random_forest\metrics'
    eval_plots: 'model\random_forest\plots'

  xgboost:
    pipeline_type:
      KFoldTE: false
      frequency_encoding: true
      KFold_frequency_encoding: false
      tfidf_vectorizer_encoding: false
      count_vectorizer_encoding: true
      random_catagory_encode: false
      resource_catagory_encode: true
      binary_encode: false
    hyper_params:
      n_estimators: 300
      reg_lambda: 1.1800000000000002
      max_depth: 4
      learning_rate: 0.07
      random_state: 2045
      colsample_bytree: 0.685
    trained_model: 'model\xgboost\model.pkl'
    eval_metrics: 'model\xgboost\metrics'
    eval_plots: 'model\xgboost\plots'

  gbdt_embedding:
    pipeline_type:
      KFoldTE: false
      frequency_encoding: true
      KFold_frequency_encoding: false
      tfidf_vectorizer_encoding: false
      count_vectorizer_encoding: false
      random_catagory_encode: false
      resource_catagory_encode: true
      binary_encode: false
    hyper_params:
      rf_estimator:
        random_state: 1907
        max_depth: 21
        ccp_alpha: 2.4e-05
        n_estimators: 19
        bootstrap: true
        # YAML null. The bare word `None` is not a YAML null: it loads as
        # the string "None", which an estimator expecting None/int/float
        # would reject.
        max_features: null
        max_samples: 0.99
      gbdt_embed_estimator:
        feature_col:
          - RESOURCE
          - ROLE_ROLLUP_1
          - ROLE_ROLLUP_2
          - ROLE_DEPTNAME
          - ROLE_FAMILY_DESC
          - ROLE_FAMILY
          - ROLE_CODE
        n_estimators: 19
        max_depth: 2
        random_state: 10
# Final trained-model settings: artifact paths, estimator parameters and
# the feature-pipeline switches.
# NOTE(review): the indentation of this section was lost in this copy and
# the nesting cannot be recovered from the text alone. `params` and
# `pipeline_type` may be children of `trained_model` or its siblings, and
# `test_size` / `random_seed` may sit under `params` or under `bagging`.
# Restore the indentation from the original file before loading; as
# written, every key below parses at the top level.
trained_model:
model_path: model\model.pkl
feature_eng: model\feature_engg_pipeline.json
# The base_estimator and bagging values below repeat the
# bagging_decision_tree hyper_params earlier in this file.
params:
base_estimator:
random_state: 1907
max_depth: 21
splitter: best
ccp_alpha: 2.4e-05
bagging:
n_estimators: 19
test_size: 0.01
random_seed: 1907
# Same switches and values as bagging_decision_tree's pipeline_type, plus
# combine_feature.
pipeline_type:
combine_feature: true
KFoldTE: false
frequency_encoding: true
KFold_frequency_encoding: false
tfidf_vectorizer_encoding: false
count_vectorizer_encoding: false
random_catagory_encode: false
resource_catagory_encode: true
binary_encode: false