Commit b61eeea

model evaluation

1 parent 91e0b84
2 files changed: +107 additions, -9 deletions

src/components/data_transformation.py

Lines changed: 33 additions & 9 deletions
@@ -21,7 +21,8 @@ class DataTransformationConfig:
     holidays:str = os.path.join("artifacts", "holidays.csv")
     processed_data:str = os.path.join("artifacts", "processed_data.csv")
     train_data:str = os.path.join("artifacts", "train_data.csv")
-    test_data:str = os.path.join("artifacts", "test_data.csv")
+    test_data:str = os.path.join("artifacts", "test_data.joblib")
+    test_data_covariates:str = os.path.join("artifacts", "test_data_covariates.joblib")
     timeseries_data:str = os.path.join("artifacts", "timeseries_data.joblib")
     covariates:str = os.path.join("artifacts", "covariates.joblib")

@@ -85,7 +86,7 @@ def integrate_data(self):
                 indices = processed_data[((processed_data["date"] == date) & (processed_data["city"] == city))].index
                 processed_data["is_holiday"][indices] = 1

-            processed_data.to_csv(self.datatransformationconfig.processed_data)
+            processed_data.to_csv(self.datatransformationconfig.processed_data, index = False)

             logging.info("data integration complete")

@@ -112,8 +113,8 @@ def split_data(self, number_of_test_days = 15):
             train_data = processed_data.iloc[:split_index + 1, :]
             test_data = processed_data.iloc[split_index + 1:, :]

-            train_data.to_csv(self.datatransformationconfig.train_data)
-            test_data.to_csv(self.datatransformationconfig.test_data)
+            train_data.to_csv(self.datatransformationconfig.train_data, index = False)
+            joblib.dump(test_data, self.datatransformationconfig.test_data)

             logging.info("data split complete")

@@ -131,7 +132,7 @@ def transform_data(self):
         logging.info("executing transform_data function")
         try:
             train_data = pd.read_csv(self.datatransformationconfig.train_data)
-            test_data = pd.read_csv(self.datatransformationconfig.test_data)
+            test_data = joblib.load(self.datatransformationconfig.test_data)

             train_data.drop(["id", "city", "store_type", "state", "cluster"], axis = 1, inplace = True)
             test_data.drop(["id", "city", "store_type", "state", "cluster"], axis = 1, inplace = True)
@@ -160,6 +161,19 @@ def transform_data(self):
                 covariates[cov].loc[date, :] = [np.NaN] * covariates[cov].shape[1]
                 covariates[cov] = covariates[cov].ffill()

+            logging.info("reformatting test_data")
+
+            test_sales = {}
+            test_covariates = {}
+            for group, data_slice in test_data.groupby(by = ["store_nbr", "family"]):
+                data_slice.set_index("date", drop = True, inplace = True)
+                test_covariate = data_slice[["onpromotion", "dcoilwtico", "is_holiday"]]
+                test_sales_series = data_slice["sales"]
+                test_sales[group] = test_sales_series
+                test_covariates[str(group)] = test_covariate
+
+            test_data = pd.DataFrame(data = test_sales)
+
             logging.info("detecting and removing outliers from different series")

             temp = series_dataset.apply(lambda x : hampel(x, window_size = 7, n_sigma = 3.0).filtered_data)
@@ -178,24 +192,34 @@
                     constant_features.append(feature)
             features_to_keep = set(series_dataset.columns).difference(set(constant_features))
             series_dataset = series_dataset[features_to_keep]
-            for constant_feature in constant_features:
-                test_data[~((test_data["store_nbr"] == constant_feature[0]) & (test_data["family"] == constant_feature[1]))]
+
             series_dataset = series_dataset[sorted(series_dataset.columns)]
+            test_data = test_data[series_dataset.columns]

-            logging.info("converting sales series and covariates into Darta TimeSeries")
+            logging.info("converting sales series and covariates into Darts TimeSeries")

             series_dataset.set_index(pd.to_datetime(series_dataset.index), inplace = True)
+            test_data.set_index(pd.to_datetime(test_data.index), inplace = True)
+
             timeseries_data = TimeSeries.from_dataframe(series_dataset)
+            test_data = TimeSeries.from_dataframe(test_data)

             for cov_key in covariates:
                 temp_cov = covariates[cov_key]
                 temp_cov.set_index(pd.to_datetime(temp_cov.index), inplace = True)
                 covariates[cov_key] = TimeSeries.from_dataframe(temp_cov)

+            for cov_key in test_covariates:
+                temp_cov = test_covariates[cov_key]
+                temp_cov.set_index(pd.to_datetime(temp_cov.index), inplace = True)
+                test_covariates[cov_key] = TimeSeries.from_dataframe(temp_cov)
+
             joblib.dump(timeseries_data, self.datatransformationconfig.timeseries_data)
             joblib.dump(covariates, self.datatransformationconfig.covariates)
+            joblib.dump(test_data, self.datatransformationconfig.test_data)
+            joblib.dump(test_covariates, self.datatransformationconfig.test_data_covariates)

-            logging.info("saved timeseries_data and covariates to artifacts")
+            logging.info("saved timeseries_data, test_data and covariates to artifacts")
             logging.info(">>> DATA TRANSFORMATION COMPLETE <<<")

         except Exception as e:
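
Note: the reformatting block added above pivots the long-format test split into one sales series per (store_nbr, family) pair, plus a matching covariate frame per pair. A minimal, self-contained sketch of that idea, using toy made-up data rather than the project's artifacts (column names follow the diff; values are hypothetical):

import pandas as pd

# Toy long-format test frame with the columns the diff relies on.
test_data = pd.DataFrame({
    "date": ["2017-08-01", "2017-08-02", "2017-08-01", "2017-08-02"],
    "store_nbr": [1, 1, 2, 2],
    "family": ["GROCERY", "GROCERY", "GROCERY", "GROCERY"],
    "sales": [10.0, 12.0, 5.0, 7.0],
    "onpromotion": [0, 1, 0, 0],
    "dcoilwtico": [47.6, 47.6, 47.6, 47.6],
    "is_holiday": [0, 0, 0, 0],
})

test_sales = {}
test_covariates = {}
for group, data_slice in test_data.groupby(by = ["store_nbr", "family"]):
    data_slice = data_slice.set_index("date")
    test_sales[group] = data_slice["sales"]  # one sales series per (store_nbr, family)
    test_covariates[str(group)] = data_slice[["onpromotion", "dcoilwtico", "is_holiday"]]

# Wide frame: one column per (store_nbr, family) pair, indexed by date,
# ready to be converted into a multivariate Darts TimeSeries.
test_data_wide = pd.DataFrame(data = test_sales)
print(test_data_wide)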

src/components/model_evaluation.py

Lines changed: 74 additions & 0 deletions
@@ -0,0 +1,74 @@
+from src.utils.exception import CustomException
+from src.utils.logger import logging
+from src.utils import generate_covariates
+from dataclasses import dataclass
+from sklearn.metrics import mean_squared_error
+from sklearn.preprocessing import MinMaxScaler
+import os
+import numpy as np
+import pandas as pd
+import joblib
+
+@dataclass
+class ModelEvaluationConfig:
+    trained_model_path:str = os.path.join("artifacts", "trained_model.joblib")
+    oil_model_path:str = os.path.join("artifacts", "oil_model.joblib")
+    covariates:str = os.path.join("artifacts", "covariates.joblib")
+    test_data_path:str = os.path.join("artifacts", "test_data.joblib")
+    timeseries_data_path:str = os.path.join("artifacts", "timeseries_data.joblib")
+    test_covariates_path:str = os.path.join("artifacts", "test_data_covariates.joblib")
+
+class ModelEvaluation:
+    def __init__(self):
+        self.modelevaluationconfig = ModelEvaluationConfig()
+
+    def generate_predictions(self):
+        try:
+            trained_model = joblib.load(self.modelevaluationconfig.trained_model_path)
+            oil_model = joblib.load(self.modelevaluationconfig.oil_model_path)
+            covariates = joblib.load(self.modelevaluationconfig.covariates)
+            test_data = joblib.load(self.modelevaluationconfig.test_data_path)
+            test_data_covariates = joblib.load(self.modelevaluationconfig.test_covariates_path)
+            timeseries_data = joblib.load(self.modelevaluationconfig.timeseries_data_path)
+
+            oil_forecasts = oil_model.predict(n = len(test_data)).pd_series().to_list()
+            new_covariates = [
+                covariates[cov].append(generate_covariates(
+                    horizon = len(test_data),
+                    onpromotion = test_data_covariates[cov].pd_dataframe()["onpromotion"],
+                    oil_forecasts = oil_forecasts,
+                    is_holiday = test_data_covariates[cov].pd_dataframe()["is_holiday"],
+                    trained_last_date = oil_model.training_series.end_time()
+                )) for cov in test_data.components
+            ]
+
+            predictions = trained_model.predict(
+                n = len(test_data),
+                series = [timeseries_data[series] for series in timeseries_data.components],
+                past_covariates = new_covariates
+            )
+
+            predictions_df = pd.DataFrame()
+            for prediction in predictions:
+                predictions_df[prediction.components[0]] = list(prediction.pd_series())
+
+            return timeseries_data, test_data, predictions_df
+        except Exception as e:
+            print(CustomException(e))
+
+    def evaluate_predictions(self, train_data, targets, predictions):
+        try:
+            scaler = MinMaxScaler()
+            scaler.fit(np.array(train_data))
+            real_values = scaler.transform(np.array(targets))
+            predicted_values = scaler.transform(np.array(predictions))
+
+            real = []
+            pred = []
+            for col in range(real_values.shape[1]):
+                real += real_values[:, col]
+                pred += predicted_values[:, col]
+
+            logging.info(mean_squared_error(real, pred))
+        except Exception as e:
+            print(CustomException(e))
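
Note: a hypothetical driver for the new class, assuming the artifacts named in ModelEvaluationConfig already exist and that the saved series are Darts TimeSeries objects (so pd_dataframe() is available). evaluate_predictions writes the mean squared error to the log rather than returning it; this is only a usage sketch, not part of the commit:

from src.components.model_evaluation import ModelEvaluation

evaluator = ModelEvaluation()

# Loads the trained models and artifacts, extends the covariates over the
# test horizon, and forecasts len(test_data) steps for every series.
train_series, test_series, predictions_df = evaluator.generate_predictions()

# Scales train/targets/predictions with a MinMaxScaler fitted on the training
# data, flattens the columns, and logs the MSE between targets and predictions.
evaluator.evaluate_predictions(
    train_data = train_series.pd_dataframe(),
    targets = test_series.pd_dataframe(),
    predictions = predictions_df,
)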
