-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpipe.py
88 lines (73 loc) · 2.24 KB
/
pipe.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import (
GridSearchCV,
StratifiedKFold
)
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from typing import Literal
from params import PARAM_GRID
def get_pipeline(model_name: Literal["LR", "XGB", "RF", "MLP"], continuous_cols: list, inner_folds: int, n_jobs: int, rnd_state: int) -> tuple[ColumnTransformer, GridSearchCV]:
    """Build the preprocessing transformer and the grid search for a model.

    The two objects are returned separately and unfitted; they are not
    wrapped together in a ``sklearn.pipeline.Pipeline`` here.

    Parameters
    ----------
    model_name : {"LR", "XGB", "MLP"}
        Which classifier to tune: logistic regression, XGBoost, or a
        multi-layer perceptron. ``"RF"`` appears in the type annotation
        but has no branch in this function, so it raises ``ValueError``.
    continuous_cols : list
        Columns handed to ``StandardScaler``. All remaining columns are
        passed through unchanged.
    inner_folds : int
        Number of splits of the shuffled ``StratifiedKFold`` used as the
        cross-validation scheme of the grid search.
    n_jobs : int
        Forwarded to ``GridSearchCV`` and to the ``LR`` / ``XGB``
        estimators (the ``MLP`` estimator does not receive it).
    rnd_state : int
        Seed used for the cross-validation splitter and the classifier.

    Returns
    -------
    transformer : ColumnTransformer
        Scales ``continuous_cols``, passes the other columns through, and
        is configured to output pandas objects without prefixed feature
        names.
    grid_search : GridSearchCV
        Grid search over ``PARAM_GRID[model_name]`` scored with
        ``"roc_auc"``, keeping the training scores.

    Raises
    ------
    ValueError
        If ``model_name`` is not one of ``"LR"``, ``"XGB"`` or ``"MLP"``.
    """

    def _make_classifier():
        # One early return per supported model; anything else is rejected.
        if model_name == "LR":
            return LogisticRegression(
                solver="saga",
                max_iter=5000,
                random_state=rnd_state,
                class_weight="balanced",
                n_jobs=n_jobs,
            )
        if model_name == "XGB":
            return XGBClassifier(
                importance_type="gain",
                random_state=rnd_state,
                enable_categorical=True,
                n_jobs=n_jobs,
            )
        if model_name == "MLP":
            return MLPClassifier(
                solver="adam",
                learning_rate="adaptive",
                learning_rate_init=0.001,
                max_iter=5000,
                shuffle=True,
                random_state=rnd_state,
            )
        raise ValueError(
            f"Possible models are 'LR', 'XGB', and 'MLP'. {model_name} is passed, instead. "
        )

    # Seeded, shuffled stratified folds for the hyper-parameter search.
    splitter = StratifiedKFold(
        n_splits=inner_folds,
        shuffle=True,
        random_state=rnd_state,
    )

    # Only the continuous columns are standardised; the rest pass through.
    scaling_step = ("scaler", StandardScaler(), continuous_cols)
    column_transformer = ColumnTransformer(
        transformers=[scaling_step],
        remainder="passthrough",
        verbose_feature_names_out=False,
    )
    transformer = column_transformer.set_output(transform="pandas")

    estimator = _make_classifier()

    # NOTE(review): n_jobs is applied both to the estimator (LR / XGB) and to
    # the grid search itself -- confirm this nesting is intended.
    grid_search = GridSearchCV(
        estimator=estimator,
        param_grid=PARAM_GRID[model_name],
        cv=splitter,
        scoring="roc_auc",
        n_jobs=n_jobs,
        return_train_score=True,
    )
    return transformer, grid_search