forked from h1st-ai/h1st
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathforecast.py
127 lines (106 loc) · 5.03 KB
/
forecast.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
import os
import subprocess
import zipfile

import h1st as h1
import pandas as pd
import sklearn
import sklearn.metrics
from sklearn.compose import make_column_transformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.exceptions import NotFittedError
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

from Forecasting import config
class ForecastModel(h1.Model):
    """Random-forest sales forecaster trained on the Rossmann store-sales data.

    The workflow is ``load_data`` -> ``prep_data`` -> ``train`` -> ``evaluate``;
    ``predict`` accepts raw rows and repeats the feature preparation itself.
    """

    # Name of the Kaggle competition and of the archive its download produces.
    _COMPETITION = "rossmann-store-sales"
    # First date (inclusive) assigned to the validation split.
    _VALIDATION_START = "2015-06-01"
    # Feature columns that get one-hot encoded; the rest are passed through.
    _CATEGORICAL_COLS = ['StateHoliday', "StoreType", "Assortment"]

    def __init__(self):
        super().__init__()
        self.model = None
        self.feature_cols = ['Open', 'Promo', 'StateHoliday', 'SchoolHoliday',
                             'DayOfWeek', 'DayOfMonth', 'Month',
                             'StoreType', 'Assortment', 'CompetitionDistance',
                             'CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear',
                             'Promo2', 'Promo2SinceWeek', 'Promo2SinceYear']
        self.data_dir = config.FORECAST_DATA_PATH

    def load_data(self):
        """Return the training rows joined with the per-store information.

        Downloads and extracts the competition archive into ``self.data_dir``
        when ``train.csv`` is not there yet. Downloading needs the ``kaggle``
        command-line tool, user credentials, and acceptance of the
        competition rules.

        :returns: ``train.csv`` merged with ``store.csv`` on the ``Store`` column
        :raises FileNotFoundError: if the data is missing and cannot be fetched
        """
        # does it "fetch" the data or also perform join etc.?
        # makedirs tolerates an existing directory and creates missing parents.
        os.makedirs(self.data_dir, exist_ok=True)
        train_path = os.path.join(self.data_dir, "train.csv")
        if not os.path.isfile(train_path):
            self._download_dataset()
        df = pd.read_csv(train_path, low_memory=False)
        store_info = pd.read_csv(os.path.join(self.data_dir, "store.csv"))
        return df.merge(store_info, on="Store")

    def _download_dataset(self):
        """Fetch the competition archive with the kaggle CLI and extract it."""
        archive = os.path.join(self.data_dir, self._COMPETITION + ".zip")
        if not os.path.isfile(archive):
            # Argument list instead of a shell string: paths containing spaces
            # or shell metacharacters are passed through untouched.
            subprocess.run(
                ["kaggle", "competitions", "download",
                 "-c", self._COMPETITION, "-p", self.data_dir],
                check=False,
            )
        if not os.path.isfile(archive):
            raise FileNotFoundError(
                "Could not download {archive}; check that the kaggle CLI is "
                "installed, credentials are configured and the competition "
                "rules are accepted.".format(archive=archive)
            )
        # zipfile never prompts before overwriting, unlike the `unzip` command.
        with zipfile.ZipFile(archive) as zf:
            zf.extractall(self.data_dir)

    def explore(self):
        """Print per-column non-null counts and plot the Sales distribution."""
        df = self.load_data()
        # Imported lazily so seaborn is only needed for exploration.
        import seaborn
        print(df.count())  # non-NA count per column (gaps reveal missing values)
        # NOTE: distplot is deprecated in recent seaborn releases; kept so the
        # plot stays the same.
        seaborn.distplot(df.Sales)  # Sales distribution

    @staticmethod
    def _add_date_features(df):
        """Parse ``Date`` and derive the calendar feature columns, in place.

        Shared by ``prep_data`` and ``predict`` so that training and inference
        always build the features the same way.
        """
        df["Date"] = pd.to_datetime(df["Date"])
        df["DayOfWeek"] = df["Date"].dt.dayofweek
        df["DayOfMonth"] = df["Date"].dt.day
        df["Month"] = df["Date"].dt.month
        return df

    def _require_trained(self):
        """Raise a clear error when no model has been trained or loaded yet."""
        if self.model is None:
            # NotFittedError subclasses AttributeError, which is what calling
            # `.predict` on None raised before, so existing handlers still match.
            raise NotFittedError(
                "ForecastModel has no trained model; call train() first."
            )

    def prep_data(self, loaded_data):
        """
        Prepare data for modelling

        The input frame is left unmodified; all changes are made on a copy.

        :param loaded_data: data return from load_data method
        :returns: dictionary contains train data and validation data
        """
        # fillna returns a new frame, so the caller's data is not mutated.
        df = loaded_data.fillna(0)  # safe to fill, see countNA table below:
        # Store                        1017209
        # DayOfWeek                    1017209
        # Date                         1017209
        # Sales                        1017209
        # Customers                    1017209
        # Open                         1017209
        # Promo                        1017209
        # StateHoliday                 1017209
        # SchoolHoliday                1017209
        # StoreType                    1017209
        # Assortment                   1017209
        # CompetitionDistance          1014567
        # CompetitionOpenSinceMonth     693861
        # CompetitionOpenSinceYear      693861
        # Promo2                       1017209
        # Promo2SinceWeek               509178
        # Promo2SinceYear               509178
        # PromoInterval                 509178
        # dtype: int64
        df = self._add_date_features(df)

        # Time-based split: everything before the cut-off date is training data.
        train_df = df[df["Date"] < self._VALIDATION_START]
        val_df = df[df["Date"] >= self._VALIDATION_START]
        print(len(train_df), len(val_df))
        # sales only should get 949194 68015
        # after dropNA on storeinfo: 302061 22265
        return {
            'train_df': train_df,
            'val_df': val_df,
            'len_train_val': (len(train_df), len(val_df))
        }

    def train(self, prepared_data):
        """Fit the encoding + random-forest pipeline and store it in ``self.model``.

        :param prepared_data: dictionary returned by prep_data
        """
        features = prepared_data['train_df'][self.feature_cols]
        sales = prepared_data['train_df']["Sales"]

        transformer = make_column_transformer(
            (OneHotEncoder(handle_unknown="ignore"), self._CATEGORICAL_COLS),
            remainder="passthrough")
        # No separate transformer.fit(): Pipeline.fit fits every step itself.
        model = Pipeline([('transform', transformer),
                          ('model', RandomForestRegressor(max_depth=10, n_estimators=200))])
        model.fit(features, sales)
        self.model = model

    def evaluate(self, prepared_data):
        """Compute the mean absolute error on the validation split.

        The result is stored in ``self.metrics`` under the key ``'mae'``.

        :param prepared_data: dictionary returned by prep_data
        :raises NotFittedError: if called before the model has been trained
        """
        self._require_trained()
        val_df = prepared_data['val_df']
        y_pred = self.model.predict(val_df[self.feature_cols])
        y_true = val_df['Sales']
        self.metrics = {'mae': sklearn.metrics.mean_absolute_error(y_true, y_pred),
                        }

    def predict(self, input_data):
        """Predict sales for raw (unprepared) rows.

        Rows whose ``Store`` does not appear in ``store.csv`` are dropped by
        the inner merge, so the result can be shorter than the input.

        :param input_data: DataFrame with at least ``Store`` and ``Date`` columns
        :returns: predictions produced by the trained pipeline
        :raises NotFittedError: if called before the model has been trained
        """
        self._require_trained()
        # repeat this because input_data might not be "prepared" e.g. come from another test file
        store_info = pd.read_csv(os.path.join(self.data_dir, "store.csv"))
        # merge + fillna both return new frames; the caller's data is untouched.
        merged = input_data.merge(store_info, on="Store").fillna(0)
        merged = self._add_date_features(merged)
        return self.model.predict(merged[self.feature_cols])