-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrun_test_model.py
150 lines (123 loc) · 6.22 KB
/
run_test_model.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
# run prophet in test mode
import numpy as np
import pandas as pd
from prophet import Prophet
from prophet.diagnostics import cross_validation, performance_metrics
import itertools
import warnings
import logging
from datetime import date
from datetime import timedelta
import fetch_data
# Suppress every Python warning from this point on (process-wide filter).
warnings.filterwarnings('ignore')

# Mute the 'cmdstanpy' logger: only CRITICAL records pass its level check,
# nothing is propagated to ancestor loggers, and a NullHandler is attached
# so the logger has a no-op handler of its own.
logger = logging.getLogger('cmdstanpy')
logger.setLevel(logging.CRITICAL)
logger.propagate = False
logger.addHandler(logging.NullHandler())
def run_model(hospital: str, horizon_value_arg: int, initial_argument_value_arg: int, cap_type: str) -> tuple:
    """Tune, fit and back-test a logistic-growth Prophet model for one hospital.

    The dataset returned by ``fetch_data.pull_dataset(hospital)`` is sorted by
    ``ds`` and split at ``today - (horizon_value_arg + 1)`` days: rows up to and
    including that date form the train/validation set, later rows form the
    test set. Every combination in the parameter grid is scored with Prophet's
    rolling cross-validation on the train/validation set, the combination with
    the lowest MAPE is refitted, and its forecast is compared with the test
    rows.

    NOTE(review): the dataset is assumed to hold a datetime column ``ds`` and
    a numeric target ``y`` (the code uses ``df['ds'].dt`` and ``df['y']``) --
    confirm against ``fetch_data.pull_dataset``.

    Args:
        hospital: Identifier passed to ``fetch_data.pull_dataset``; also
            written to the ``hospital`` column of the returned training frame.
        horizon_value_arg: Forecast horizon in days. Also used as the
            cross-validation period and to derive the split date.
        initial_argument_value_arg: Size in days of the initial training
            window used by cross-validation.
        cap_type: ``'hard'`` caps the logistic growth at the observed maximum
            of ``y``; ``'soft'`` adds 8 % (rounded) on top of that maximum.

    Returns:
        A tuple ``(df_test, mape, horizon_argument, initial_argument,
        period_argument, best_params, cap, df_train_val)`` where ``df_test``
        is the test set enriched with ``y_hat``, ``APE [%]``, ``yhat_lower``
        and ``yhat_upper`` (``cap``/``floor`` removed, ``ds`` formatted as
        ``YYYY-MM-DD``), ``mape`` is the mean of ``APE [%]`` rounded to one
        decimal, the three ``*_argument`` values are the ``'<n> days'``
        strings given to cross-validation, ``best_params`` is the winning grid
        entry, ``cap`` is the logistic cap, and ``df_train_val`` is the
        training frame with an added ``hospital`` column.

    Raises:
        ValueError: If ``cap_type`` is neither ``'hard'`` nor ``'soft'``.
    """
    # Validate the cheap argument first so a typo fails before the data fetch
    # and the (slow) grid search. ValueError is still an Exception, so callers
    # catching the previous bare Exception keep working.
    if cap_type == 'hard':
        cap_factor = 0
    elif cap_type == 'soft':
        cap_factor = 0.08
    else:
        raise ValueError("wrong 'cap_type' value")

    # fetch the final dataset
    df = fetch_data.pull_dataset(hospital)
    print('df.shape :\n', df.shape)
    print("\ndf data types with ds as datetime :\n", df.dtypes)
    print("\ndf head with ds as datetime :\n", df.head())

    # sort chronologically
    df = df.sort_values(by='ds')
    print("\n\033[92mhead of sorted data by datetime ds :\n", df.head(), "\033[00m")
    print("\n\033[91mtail of sorted data by datetime ds :\n", df.tail(), "\033[00m")

    # replace the shuffled index with a fresh 0..n-1 range
    df = df.reset_index(drop=True)
    print("\n\033[94mhead of reindexed data :\n", df.head(), "\033[00m")
    print("\n\033[94mtail of reindexed data :\n", df.tail(), "\033[00m")

    df['weekday'] = df['ds'].dt.dayofweek  # additional regressor 'weekday'

    # logistic growth needs explicit saturation bounds on the target
    y_max = max(df['y'])
    cap = y_max + round(cap_factor * y_max)
    df['cap'] = cap
    df['floor'] = 0

    # parameter grid for hyperparameter tuning
    param_grid = {
        'growth': ['logistic'],
        'seasonality_mode': ['multiplicative'],
        'holidays_mode': ['multiplicative'],
        'changepoint_prior_scale': [0.01, 0.025, 0.04, 0.05, 0.1, 0.5],
        'seasonality_prior_scale': [1.5, 2.2, 4.0, 6.0, 7.0, 8.0, 9.0, 10.0],
        'daily_seasonality': [True],
    }
    # every combination of the grid values, one dict per combination
    all_params = [dict(zip(param_grid.keys(), v)) for v in itertools.product(*param_grid.values())]

    # cross-validation arguments (the period equals the horizon)
    horizon_argument = str(horizon_value_arg) + ' days'
    initial_argument = str(initial_argument_value_arg) + ' days'
    period_argument = horizon_argument

    # Split into train/validation and test subsets. Explicit copies are taken
    # because both subsets get new columns later on; mutating a slice of `df`
    # is chained assignment.
    split_date = str(date.today() - timedelta(days=horizon_value_arg + 1))
    df_train_val = df.loc[df['ds'] <= split_date].copy()
    df_test = df.loc[df['ds'] > split_date].copy()
    print("\n\033[92mtail of df_train_val :\n", df_train_val.tail(), "\033[00m")
    pd.set_option('display.max_rows', None)
    print("\n\033[91mfull test subset df_test :\n", df_test, "\033[00m")

    # Score every parameter combination with rolling cross-validation.
    mapes = []      # MAPE [%] per combination, same order as all_params
    cv_frames = []  # cross-validation output per combination, same order
    for params in all_params:
        m = Prophet(**params)
        m.add_country_holidays(country_name='GR')  # Greek holidays
        m.add_regressor('weekday')
        m.fit(df_train_val)
        df_cv = cross_validation(m, initial=initial_argument, period=period_argument,
                                 horizon=horizon_argument, parallel="threads")
        # rolling_window=1 aggregates the metrics over the whole horizon,
        # giving a single row per combination
        df_p = performance_metrics(df_cv, rolling_window=1)
        mapes.append(round(100 * df_p['mape'].values[0], 2))
        cv_frames.append(df_cv)

    # report the tuning results and pick the combination with the lowest MAPE
    pd.set_option('display.max_columns', None)
    tuning_results = pd.DataFrame(all_params)
    tuning_results['mape'] = mapes
    print('tuning results :\n', tuning_results)
    best_idx = int(np.argmin(mapes))
    best_params = all_params[best_idx]
    print('best params :', best_params)
    print('best params dtype:', type(best_params))
    # show the cross-validation frame that belongs to the winning combination
    # (not the one left over from the last loop iteration)
    print('df_cv for best params :\n', cv_frames[best_idx].head())

    # refit on the train/validation set with the best parameter set
    m_best = Prophet(**best_params)
    m_best.add_regressor('weekday')
    m_best.add_country_holidays(country_name='GR')
    m_best.fit(df_train_val)

    # future frame: training dates plus `horizon_value_arg` further days
    future = m_best.make_future_dataframe(periods=horizon_value_arg, freq='D')
    future['cap'] = cap
    future['floor'] = 0
    print('\nfuture :\n', future.tail(horizon_value_arg))
    future['weekday'] = future['ds'].dt.dayofweek  # regressor must exist in future too

    # make the forecast
    forecast = m_best.predict(future)
    print('\nforecast :\n', forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']].tail(horizon_value_arg))

    # Attach the forecast to the test rows BY DATE. Matching by row label or
    # by "last len(df_test) rows" breaks when the training data has duplicate
    # or missing dates, and `iloc[-0:]` selects the whole forecast when the
    # test set is empty. The integer casts happen before the lookup, so test
    # dates without a forecast become NaN instead of raising.
    forecast_by_date = forecast.set_index('ds')
    test_dates = df_test['ds']
    df_test['y_hat'] = (
        forecast_by_date['yhat'].round().astype('int64').reindex(test_dates).to_numpy()
    )
    # `weekday` is already present: it was added to `df` before the split.
    df_test['ds'] = test_dates.dt.strftime('%Y-%m-%d')  # yyyy-mm-dd strings
    df_test['APE [%]'] = round(100 * abs(df_test['y_hat'] - df_test['y']) / df_test['y'], 1)
    df_test['yhat_lower'] = (
        np.floor(forecast_by_date['yhat_lower']).astype('int64').reindex(test_dates).to_numpy()
    )
    df_test['yhat_upper'] = (
        np.ceil(forecast_by_date['yhat_upper']).astype('int64').reindex(test_dates).to_numpy()
    )
    df_test.drop(['cap', 'floor'], axis=1, inplace=True)
    print('\ndf_test for horizon =', horizon_argument, ' :\n', df_test)

    mape = round(df_test['APE [%]'].mean(), 1)
    print('\ndf_test : MAPE [%] :', mape)

    df_train_val['hospital'] = hospital  # tag the train-validation set with its hospital
    print('holidays :', m_best.train_holiday_names)
    return df_test, mape, horizon_argument, initial_argument, period_argument, best_params, cap, df_train_val