-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcreate_model_DLE.py
93 lines (81 loc) · 3.56 KB
/
create_model_DLE.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
from h2o.grid.grid_search import H2OGridSearch
from h2o.estimators import H2ODeepLearningEstimator
from helpers.functions import best_model_results
import matplotlib.pyplot as plt
import datetime
import time
import pandas as pd
import data_parser
import h2o
# --- Data preparation -------------------------------------------------------
# Load the pickled dataset through the project's DataParser helper, keep only
# the `selected_cols_v2` column subset, then apply the parser's NaN handling
# and column renaming (see data_parser for what those two steps do exactly).
DataParser = data_parser.DataParser()
df = DataParser.load_complete_data_from_pickle()
# Alternative (smaller) column subset, currently disabled:
# df = df[DataParser.selected_cols_reduced]
df = df[DataParser.selected_cols_v2]
df = DataParser.preprocess_dropna(df)
df = DataParser.rename_columns_df(df)
# NOTE(review): `opt_save` is not read anywhere in the visible script.
opt_save = True
# One full train/evaluate cycle is run per seed in this list.
# Seed values noted by the author: 6, 18, 25, 32, 42
seeds = [42] # 6, 18, 25, 32, 42
# Collects the `r2` value obtained for each seed.
r2s = []
# Reference timestamp for the "Elapsed ... minutes" reports printed later.
start = time.time()
# Start (or connect to) the H2O cluster. Per the H2O docs, nthreads=-1 means
# "use all available cores" and min_mem_size is the minimum JVM heap size.
h2o.init(nthreads=-1, min_mem_size="8g")
# Upload the prepared frame to H2O under the key "CatNum", forcing the column
# types given by the parser. The train/test split happens later, per seed.
h2o_data = h2o.H2OFrame(df, destination_frame="CatNum", column_types=DataParser.col_dtypes_renamed)
# NOTE(review): `cv` is not read anywhere in the visible script.
cv = True
# Run one complete grid search + evaluation cycle for every configured seed.
for seed in seeds:
    # Split the uploaded frame into a train and a test part; the seed keeps
    # the split repeatable. No separate validation frame is used.
    train, test = h2o_data.split_frame(ratios=DataParser.ratios, seed=seed)
    train.frame_id = "Train"
    test.frame_id = "Test"

    # Predictors are all columns of the frame except the target column.
    predictors = h2o_data.columns
    predictors.remove(DataParser.target)
    response = DataParser.target

    # Hyper-parameter space handed to the grid search. Every entry currently
    # holds a single value, so the grid contains exactly one combination.
    # Alternatives noted by the author:
    #   hidden:     [50, 25, 10], [200, 100, 50], [100, 50, 25, 10],
    #               [100, 50, 25, 10, 5], [2000, 70, 50, 25, 10]
    #               (best so far: 100/50/25)
    #   activation: 'TanhWithDropout', 'RectifierWithDropout'
    #   loss:       'Quadratic', 'Huber'
    #   stopping_rounds: 20
    #   also considered: tweedie_power, score_interval, l1, l2,
    #                    input_dropout_ratio, variable_importances
    hyper_space = {
        'hidden': [[100, 50, 25]],
        'epochs': [1000],
        'activation': ['Rectifier'],
        'rho': [0.99],
        'loss': ['Absolute'],
        'reproducible': [False],
        'seed': [seed],
        'stopping_rounds': [7],
    }

    # Base estimator: standardized inputs, 5-fold cross-validation, and the
    # cross-validation predictions are kept on the model.
    base_estimator = H2ODeepLearningEstimator(
        standardize=True,
        seed=seed,
        keep_cross_validation_predictions=True,
        nfolds=5,
    )
    dl_grid = H2OGridSearch(model=base_estimator, hyper_params=hyper_space)

    print("Training")
    dl_grid.train(x=predictors, y=response, training_frame=train)

    print("Importance results")
    # Ascending sort by mean residual deviance: index 0 is the lowest value.
    ranked_grid = dl_grid.get_grid(
        sort_by="mean_residual_deviance",
        decreasing=False,
    )

    print("Getting best model")
    best_model = h2o.get_model(ranked_grid[0].model_id)

    # Project helper that evaluates the model on the test and train frames.
    r2, mae, mrd = best_model_results(best_model, test, train)

    # Plot the model's training history and store the figure per seed.
    best_model.plot()
    plt.savefig(f'images/results/train_plot/DLE_{seed}', bbox_inches='tight')

    # Persist the model; the file name carries timestamp, seed and metrics.
    now = datetime.datetime.now().strftime("%y%m%d%H%M")
    h2o.save_model(
        best_model,
        path="temp/best_DLE_model",
        filename=f"DLE_{now}_{seed}_{r2}_{mae}_{mrd}",
        force=True,
    )

    elapsed_minutes = (time.time() - start)/60
    print("Elapsed {:.04f} minutes".format(elapsed_minutes))

    # Echo the parameters the selected model actually ended up with.
    reported_params = (
        ("hidden", 'hidden'),
        ("epochs:", 'epochs'),
        ("dropout:", 'input_dropout_ratio'),
        ("l1:", 'l1'),
        ("l2:", 'l2'),
    )
    for label, param_name in reported_params:
        print(label, best_model.actual_params[param_name])

    r2s.append(float(r2))
# Total wall-clock time since `start`, reported once after all seeds ran.
total_minutes = (time.time() - start)/60
print("Elapsed {:.04f} minutes".format(total_minutes))

# Put the per-seed r2 values into a DataFrame and print their mean, followed
# by the individual values.
df_r2 = pd.DataFrame(r2s)
r2_mean = df_r2.mean()
print('Mean all r2s', r2_mean)
print(df_r2)