-
Notifications
You must be signed in to change notification settings - Fork 4
/
traditional_methods.py
155 lines (129 loc) · 6.32 KB
/
traditional_methods.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# use all cores
#import os
#os.system("taskset -p 0xff %d" % os.getpid())
pd.options.mode.chained_assignment = None # silence pandas' chained-assignment (SettingWithCopy) warning for the whole process
def load_seattle_speed_matrix(path='./data/Seattle_Loop_Dataset/speed_matrix_2015'):
    """ Loads the whole Seattle `speed_matrix_2015` into memory.

    Caution: the default file is about 200 MB and is read in one go.
    The index is parsed as timestamps with the format `%Y-%m-%d %H:%M`.

    :param path (str): location of the pickled speed-matrix DataFrame.
        Defaults to the location used by the original experiments.
    :return df (pandas.DataFrame): speed matrix as DataFrame. Columns are sensors, rows are timestamps
    """
    print('Loading data...')
    # NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
    df = pd.read_pickle(path)
    df.index = pd.to_datetime(df.index, format='%Y-%m-%d %H:%M')
    print('Load completed.')
    return df
def best_moving_average(df, col, average_window_in_hours=27, from_date=None, to_date=None, plot=False):
    """ Scores a one-step-ahead moving-average baseline for a single column.

    The prediction for each row is the mean of the `average_window_in_hours * 12`
    actual observations immediately before it (rolling mean shifted by one row).
    Every prediction is therefore fed with true history rather than with earlier
    predictions, which makes this an optimistic ("best case") moving-average baseline.
    The factor 12 assumes 12 samples per hour (5-minute data).

    Rows without a full window of history have no prediction and are ignored
    when computing the metrics.

    :param df (pandas.DataFrame): dataset being used; it is not modified
    :param col (str): column for which the moving average will be applied
    :param average_window_in_hours (int): the window (in hours) used to generate predictions
    :param from_date (str): initial date to be shown in the plot, format: "YYYY-MM-DD"
    :param to_date (str): end date to be shown in the plot
    :param plot (bool): plot moving average and original df (hourly means)
    :return MAE, RMSE (tuple): Both metrics are calculated for the column `col`
    """
    window_size = average_window_in_hours * 12

    # Work on an explicit copy so the caller's frame is never touched and
    # no chained-assignment warning is involved.
    ndf = df[[col]].copy()
    ndf['preds'] = ndf[col].rolling(window=window_size).mean().shift(1)

    # Vectorized error; `mean()` skips the NaN rows of the warm-up period,
    # which matches dropping them before averaging.
    error = ndf[col] - ndf['preds']
    MAE = error.abs().mean()
    RMSE = np.sqrt((error ** 2).mean())

    if plot:
        hourly = ndf.resample('1h').mean()
        if from_date is not None and to_date is not None:
            hourly.loc[from_date:to_date].plot(figsize=(12, 7))
        else:
            # Without an explicit range, show the first 500 hourly points.
            hourly[:500].plot(figsize=(12, 7))
        plt.show()

    return (MAE, RMSE)
def calculate_metrics(df, average_window_in_hours, verbose=5, save=True,
                      filepath='./experiment_results/seattle_best_moving_average_mae_rmse.csv'):
    """ Calculates MAE and RMSE for all columns of `df`, taking a sliding window of `average_window_in_hours` hours.

    :param df (pandas.DataFrame): dataset being used
    :param average_window_in_hours (int): the window (in hours) used to generate predictions
    :param verbose (int): option to display the calculations on-the-fly.
        Values are going to be displayed after `verbose` iterations.
        A falsy value (0 or None) disables the progress output.
    :param save (bool): when True, write the metrics as CSV to `filepath`
        (one column per column of `df`, rows `MAE` and `RMSE`)
    :param filepath (str): destination of the CSV file; its directory is created if missing
    :return mae_and_rmse (dict): dictionary containing (MAE, RMSE) for each column of `df`
    """
    import os

    mae_and_rmse = {}
    for it, col in enumerate(df.columns):
        MAE, RMSE = best_moving_average(df, col, average_window_in_hours)
        mae_and_rmse[col] = (MAE, RMSE)
        # Guard against verbose=0, which would otherwise divide by zero.
        if verbose and it % verbose == 0:
            print('Column: {}, MAE: {}, RMSE: {}'.format(col, MAE, RMSE))

    if save:
        out_dir = os.path.dirname(filepath)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        pd.DataFrame(mae_and_rmse, index=['MAE', 'RMSE']).to_csv(filepath)

    return mae_and_rmse
def real_moving_average(df, col, sliding_window_in_hours, forecast_window_in_minutes):
    """ Recursive moving-average forecast starting from the head of `df`.

    The first `12 * sliding_window_in_hours` values of `col` seed the history.
    Then `(forecast_window_in_minutes + 5) // 5` values are forecast one at a time:
    each forecast is the mean of the most recent history window and is appended
    to the history, so later forecasts are built on earlier ones.

    Returns a single-column dataframe (column label `0`) holding the forecasts,
    indexed by the rows of `df` that directly follow the seed window.
    """
    n_history = 12 * sliding_window_in_hours
    n_steps = (forecast_window_in_minutes + 5) // 5

    # History buffer: seeded with observed values, extended with forecasts.
    history = df[col].values[:n_history]
    for step in range(n_steps):
        next_value = np.mean(history[step:step + n_history])
        history = np.append(history, next_value)

    forecasts = history[len(history) - n_steps:]
    forecast_index = df.index[n_history:n_history + n_steps]
    return pd.DataFrame(data=forecasts, index=forecast_index)
# still need to compute MAE and RMSE for all data
def moving_average_forecast(df, col, sliding_window_in_hours, forecast_window_in_minutes, stride_in_minutes=60):
    """ Applies moving average forecast across all the dataset.

    A window of `12 * sliding_window_in_hours` history rows followed by
    `(forecast_window_in_minutes + 5) // 5` forecast rows is slid over `df`.
    For every position where the complete window fits, `real_moving_average`
    produces the forecasts, which are paired with the actual values.

    Stride can be applied to make forecasting faster, ie, stride makes the
    sliding window jump `stride_in_minutes` between consecutive positions.

    :param df (pandas.DataFrame): dataset being used
    :param col (str): column to forecast
    :param sliding_window_in_hours (int): length of the history window, in hours
    :param forecast_window_in_minutes (int): forecast horizon, in minutes
    :param stride_in_minutes (int): jump between consecutive windows, in minutes.
        Converted to rows with `// 5`; anything below 5 minutes means one row.
    :return (pandas.DataFrame): side-by-side comparison of the real values (column `col`)
        and their predictions (column `col + '_pred'`) for all predicted rows.
        Empty, with those two columns, when `df` is too short for a single window.
    """
    sliding_window = 12 * sliding_window_in_hours
    forecast_window = (forecast_window_in_minutes + 5) // 5
    window_length = sliding_window + forecast_window
    # A stride of zero rows would never advance, so move at least one row.
    stride = max(stride_in_minutes // 5, 1)

    all_predictions = []
    # Only visit start positions where a complete window fits. This replaces
    # a catch-all `except` that skipped truncated tail windows but also hid
    # genuine errors such as an unknown column name.
    for start in range(0, len(df) - window_length + 1, stride):
        smaller_df = df.iloc[start:start + window_length]
        preds = real_moving_average(smaller_df, col, sliding_window_in_hours, forecast_window_in_minutes)
        actual = smaller_df[[col]].loc[preds.index[0]:preds.index[-1]]
        fdf = pd.concat([actual, preds], axis=1).rename(columns={0: col + '_pred'})
        all_predictions.append(fdf)

    if not all_predictions:
        # pd.concat raises on an empty list; give callers a well-formed empty frame.
        return pd.DataFrame(columns=[col, col + '_pred'])
    return pd.concat(all_predictions, axis=0)
def metrics(preds_df):
    """ Computes the error metrics of a two-column predictions dataframe.

    The first column of `preds_df` holds the real values and the second one
    the predictions. Returns the tuple (MAE, RMSE).
    """
    actual_col, predicted_col = preds_df.columns[0], preds_df.columns[1]
    residual = preds_df[actual_col] - preds_df[predicted_col]

    mean_abs_error = np.mean(np.abs(residual))
    root_mean_sq_error = np.sqrt(np.mean(np.power(residual, 2)))
    return (mean_abs_error, root_mean_sq_error)
def main():
    """ Runs the moving-average forecast for every sensor column and saves MAE/RMSE as CSV.

    The output file lives in `./experiment_results/` and its name is derived from
    the sliding-window and forecast-window settings below.
    """
    import os

    # these options should go into an argument parser
    SLIDING_WINDOW_IN_HOURS = 4
    FORECAST_WINDOW_IN_MINUTES = 15
    # The window stride is not set from here: moving_average_forecast moves
    # its window by 60 minutes.

    # Build the output path from the settings so the file name cannot drift
    # away from the values actually used.
    out_dir = './experiment_results'
    out_name = 'training_window_{}_hour_forecast_window_{}_min_mae_rmse_seattle.csv'.format(
        SLIDING_WINDOW_IN_HOURS, FORECAST_WINDOW_IN_MINUTES)
    out_path = os.path.join(out_dir, out_name)
    # Create the directory up front so a missing folder fails (or is fixed)
    # before the long computation rather than after it.
    os.makedirs(out_dir, exist_ok=True)

    df = load_seattle_speed_matrix()
    metrics_dict = {}
    for col in df.columns:
        print(col)
        preds = moving_average_forecast(df, col, SLIDING_WINDOW_IN_HOURS, FORECAST_WINDOW_IN_MINUTES)
        metrics_dict[col] = metrics(preds)

    pd.DataFrame(metrics_dict, index=['MAE', 'RMSE']).to_csv(out_path)
# Run the experiment only when this file is executed as a script, not when it is imported.
if __name__ == '__main__':
    main()