# demo4.py
import time
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from lightautoml.automl.base import AutoML
from lightautoml.ml_algo.boost_lgbm import BoostLGBM
from lightautoml.pipelines.features.lgb_pipeline import LGBSimpleFeatures
from lightautoml.pipelines.ml.base import MLPipeline
from lightautoml.reader.base import PandasToPandasReader
from lightautoml.tasks import Task
# Seed NumPy's global random generator so repeated runs behave the same.
np.random.seed(42)

# ---- Load the raw table ----
print("Load data...")
data = pd.read_csv("./data/sampled_app_train.csv")
print("Data loaded")

# ---- Derive the demo features ----
print("Features modification from user side...")
base_date = np.datetime64("2018-01-01")
day_delta = np.dtype("timedelta64[D]")

# DAYS_BIRTH is interpreted as a day offset from the base date and stored as a string.
birth_offset = data["DAYS_BIRTH"].astype(day_delta)
data["BIRTH_DATE"] = (base_date + birth_offset).astype(str)

# DAYS_EMPLOYED gets the same treatment, but offsets are capped at 0 first,
# so the resulting date never falls after the base date.
employed_offset = data["DAYS_EMPLOYED"].clip(upper=0).astype(day_delta)
data["EMP_DATE"] = (base_date + employed_offset).astype(str)

# One column holding a single constant value and one holding only NaN.
data["constant"] = 1
data["allnan"] = np.nan

# The raw day-offset columns are replaced by the date strings built above.
data = data.drop(columns=["DAYS_BIRTH", "DAYS_EMPLOYED"])
print("Features modification finished")

# ---- Hold out 2000 rows for testing, stratified by TARGET ----
print("Split data...")
train_data, test_data = (
    part.reset_index(drop=True)
    for part in train_test_split(data, test_size=2000, stratify=data["TARGET"], random_state=13)
)
print(f"Data splitted. Parts sizes: train_data = {train_data.shape}, test_data = {test_data.shape}")
# Each entry pairs the keyword arguments passed to Task with the name of the
# column used as the target for that run.
task_configs = [
    ({"name": "binary"}, "TARGET"),
    ({"name": "binary", "metric": roc_auc_score}, "TARGET"),
    ({"name": "reg", "loss": "mse", "metric": "r2"}, "AMT_CREDIT"),
    ({"name": "reg", "loss": "rmsle", "metric": "rmsle"}, "AMT_CREDIT"),
    (
        {
            "name": "reg",
            "loss": "quantile",
            "loss_params": {"q": 0.9},
            "metric": "quantile",
            "metric_params": {"q": 0.9},
        },
        "AMT_CREDIT",
    ),
]

# Build a fresh single-level AutoML pipeline for every configuration, fit it
# on the train part, predict on the test part, and report both scores.
for task_params, target in task_configs:
    print("Create task..")
    task = Task(**task_params)
    print("Task created")

    print("Create reader...")
    reader = PandasToPandasReader(task, cv=5, random_state=1)
    print("Reader created")

    # First-level pipeline: feature pipeline plus a single LightGBM model.
    print("Start creation pipeline_1...")
    pipe = LGBSimpleFeatures()

    print("\t ParamsTuner2 and Model2...")
    lgbm_params = {
        "learning_rate": 0.025,
        "num_leaves": 64,
        "seed": 2,
        "num_threads": 5,
    }
    model2 = BoostLGBM(default_params=lgbm_params)
    print("\t Tuner2 and model2 created")

    print("\t Pipeline1...")
    # No feature selection is applied before or after the feature pipeline.
    pipeline_lvl1 = MLPipeline(
        [model2],
        pre_selection=None,
        features_pipeline=pipe,
        post_selection=None,
    )
    print("Pipeline1 created")

    print("Create AutoML pipeline...")
    levels = [[pipeline_lvl1]]
    automl = AutoML(reader, levels, skip_conn=False)
    print("AutoML pipeline created...")

    print("Start AutoML pipeline fit_predict...")
    start_time = time.time()
    oof_pred = automl.fit_predict(train_data, roles={"target": target})
    elapsed = time.time() - start_time
    print(f"AutoML pipeline fitted and predicted. Time = {elapsed:.3f} sec")

    test_pred = automl.predict(test_data)
    print(f"Prediction for test data:\n{test_pred}\nShape = {test_pred.shape}")

    # Score the first prediction column against the true target values.
    print("Check scores...")
    oof_score = task.metric_func(train_data[target].values, oof_pred.data[:, 0])
    print(f"OOF score: {oof_score}")
    test_score = task.metric_func(test_data[target].values, test_pred.data[:, 0])
    print(f"TEST score: {test_score}")