-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcreate_model_DRF_grid.py
93 lines (79 loc) · 3.12 KB
/
create_model_DRF_grid.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
import datetime
import sys
from h2o.grid.grid_search import H2OGridSearch
from h2o.estimators import H2ORandomForestEstimator
from helpers.functions import best_model_results
import matplotlib.pyplot as plt
import datetime
import time
import pickle
import pandas as pd
import data_parser
import h2o
# --- Data preparation ---------------------------------------------------
# Load the pickled dataset, keep only the parser's `selected_cols_v2`
# columns, then pass the result through `preprocess_dropna` and
# `rename_columns_df`. Nesting the calls avoids keeping intermediate frames
# alive under extra names.
DataParser = data_parser.DataParser()
df = DataParser.rename_columns_df(
    DataParser.preprocess_dropna(
        DataParser.load_complete_data_from_pickle()[DataParser.selected_cols_v2]
    )
)

# --- Run configuration --------------------------------------------------
opt_save = True
seeds = [42]  # the training loop below runs once per seed
r2s = []  # filled with one r2 value per seed

# --- H2O cluster and frame ----------------------------------------------
h2o.init(nthreads=-1, min_mem_size_GB=10)
h2o_data = DataParser.rename_columns_h2o(
    h2o.H2OFrame(
        df,
        destination_frame="CatNum",
        column_types=DataParser.col_dtypes_renamed,
    )
)
# Response and predictor columns do not depend on the seed, so resolve them
# once. `remove` raises ValueError if the target column is not in the frame.
y = DataParser.target
X = list(h2o_data.columns)
X.remove(y)

for seed in seeds:
    # Seeded train/test split using the ratios configured on the parser.
    train, test = h2o_data.split_frame(ratios=DataParser.ratios, seed=seed)
    train.frame_id = "Train"
    test.frame_id = "Test"

    # Every hyper-parameter currently holds a single value, so the grid
    # contains one combination per seed; add values to a list to widen it.
    grid_params = {
        'ntrees': [120],
        'max_depth': [20],  # author's tuning note: tried 5, 10, 20, 40, 50; "Best:30"
        'min_rows': [5],  # author's tuning note: tried 5, 10, 20, 40, 50; "Best:10"
        'seed': [seed],
        'sample_rate': [1],  # author's note: important (0.99 also tried)
        'col_sample_rate_per_tree': [1],  # author's note: important
        'stopping_rounds': [5],
    }

    # No validation frame is passed to train(); the estimator is configured
    # with 5-fold cross-validation instead.
    drf_grid = H2OGridSearch(
        model=H2ORandomForestEstimator(keep_cross_validation_predictions=True, nfolds=5),
        hyper_params=grid_params,
    )
    print("Training")
    drf_grid.train(x=X, y=y, training_frame=train)

    print("Importance results")
    # Ascending sort puts the model with the lowest mean residual deviance first.
    grid_sorted = drf_grid.get_grid(sort_by='mean_residual_deviance', decreasing=False)

    print("Getting best model")
    best_model = h2o.get_model(grid_sorted[0].model_id)
    # NOTE(review): this flag is set after training; presumably it does not
    # affect the already-fitted model -- confirm whether it is still needed.
    best_model.keep_cross_validation_predictions = True
    r2, mae, mrd = best_model_results(best_model, test, train)

    # NOTE(review): nothing in this script draws a figure, so this saves the
    # current matplotlib figure -- presumably one produced inside
    # `best_model_results`; confirm. The target directory must already exist.
    plt.savefig(f'images/results/train_plot/DRF_{seed}', bbox_inches='tight')

    # Echo the parameters the best model actually used, then the full set.
    for param_name in ('max_depth', 'min_rows', 'sample_rate',
                       'col_sample_rate_per_tree', 'ntrees'):
        print(best_model.actual_params[param_name])
    print(best_model.actual_params)

    # The file name encodes the timestamp, seed and the three returned metrics.
    now = datetime.datetime.now().strftime("%y%m%d%H%M")
    h2o.save_model(best_model, path="temp/best_DRF_model", filename=f"DRF_{now}_{seed}_{r2}_{mae}_{mrd}", force=True)

    # Record this seed's score while the loop is still running so that every
    # seed contributes to the final summary.
    r2s.append(float(r2))

# Shut the cluster down only after all seeds are done: h2o.init() is called
# once before the loop, so shutting down inside it would break the next seed.
h2o.cluster().shutdown()
time.sleep(10)  # give the cluster time to terminate
# Put the collected r2 values into a one-column frame, then report the
# column mean followed by the individual values.
df_r2 = pd.DataFrame(r2s)
mean_r2 = df_r2.mean()
print('Mean all r2s', mean_r2)
print(df_r2)