-
Notifications
You must be signed in to change notification settings - Fork 0
/
WeatherVARAnalysis.py
204 lines (155 loc) · 8.68 KB
/
WeatherVARAnalysis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from sklearn import preprocessing
import statsmodels.api as sm
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import TimeSeriesSplit
import warnings
# Silence every warning for the rest of the run: the "ignore" action is
# installed with no category or module restriction.
warnings.filterwarnings("ignore")
# Widen pandas' console rendering so the DataFrames printed below are not
# truncated: show up to 30 columns on lines up to 2000 characters wide.
pd.set_option('display.max_columns', 30)
pd.set_option('display.width', 2000)
class ts:
    """Weather time-series pipeline: load, clean, scale and fit a VAR model.

    The methods are meant to be called in this order, because each one reads
    attributes stored on the instance by the previous one:

    1. ``data_preprocessing``  -> ``df``, ``data``, ``events``,
       ``unique_events``, ``single_events``, ``precipitation``
    2. ``Feature_Extraction``  -> ``data_prepared``, ``events_prepared``, ...
    3. ``model_fitting``       -> ``train``, ``valid``, ``prediction``,
       ``pred``, ``yhat``

    Helper methods are invoked through ``self`` so the class does not depend
    on a module-level name being rebound to an instance.
    """

    def data_preprocessing(self, csv_path='./austin_weather.csv'):
        """Load the weather CSV and derive the event/precipitation summaries.

        Args:
            csv_path: Location of the CSV file. It must contain a ``Date``
                column, an ``Events`` column and the columns listed in
                ``columns_of_interest``.

        Side effects:
            Shows two bar charts (raw event combinations, single events) and
            prints the intermediate results.
        """
        self.df = pd.read_csv(csv_path)
        # 'Date' deliberately stays a regular column: Feature_Extraction reads
        # self.df["Date"] to build the datetime index of the prepared data.

        # use average data only
        self.columns_of_interest = [
            'TempAvgF',
            'DewPointAvgF',
            'HumidityAvgPercent',
            'SeaLevelPressureAvgInches',
            'VisibilityAvgMiles',
            'WindAvgMPH',
            'PrecipitationSumInches',
        ]
        # Explicit copy: columns of self.data are reassigned later and must
        # not be a view onto self.df.
        self.data = self.df[self.columns_of_interest].copy()

        # A single blank marks "no event" in the Events column.
        self.events = self.df[['Events']].replace(' ', 'None')
        self.events.Events.value_counts().plot(kind='bar', figsize=(10, 5))
        plt.show()

        # Get unique event categories: each cell may hold a comma-separated
        # combination of events.
        self.unique_events = set()
        for combined in self.events.Events.value_counts().index:
            self.unique_events.update(part.strip() for part in combined.split(','))
        print("unique_events:\n", self.unique_events)

        # One boolean column per event type. Built in one pass, in sorted
        # order so the column layout is the same on every run, and with
        # literal (non-regex) substring matching.
        events_column = self.events.Events
        self.single_events = pd.DataFrame({
            event_type: events_column.str.contains(event_type, regex=False).values
            for event_type in sorted(self.unique_events)
        })

        ax = self.single_events.sum().sort_values(ascending=False).plot.bar(figsize=(10, 5))
        ax.set_title("Weather events in dataset", fontsize=18)
        ax.set_ylabel("Number of occurrences", fontsize=14)
        for bar in ax.patches:
            # Write the count just above each bar.
            ax.text(bar.get_x() + .18, bar.get_height() + 5, bar.get_height(), fontsize=12)
        plt.show()
        print("\nsingle_events:\n", self.single_events.head())

        # Count the distinct non-numeric markers in PrecipitationSumInches.
        non_numeric = pd.to_numeric(self.data.PrecipitationSumInches, errors='coerce').isnull()
        self.precipitation = self.data[non_numeric].PrecipitationSumInches.value_counts()
        print("\nprecipitation:\n", self.precipitation)

    def isColumnNotNumeric(self, columns_of_interest, data):
        """Flag the rows of ``data`` holding non-numeric values.

        Args:
            columns_of_interest: Names of the columns to inspect.
            data: DataFrame containing those columns.

        Returns:
            Boolean Series indexed like ``data``; an entry is True when at
            least one inspected column of that row cannot be parsed as a
            number (missing values count as non-numeric).
        """
        mask = pd.Series(False, index=data.index, dtype=bool)
        for column_name in columns_of_interest:
            mask = mask | pd.to_numeric(data[column_name], errors='coerce').isnull()
        return mask

    def getDataFrameWithNonNumericRows(self, dataFrame):
        """Return the rows that have a non-numeric value in a column of interest.

        Args:
            dataFrame: Frame to filter. ``None`` falls back to ``self.data``.

        Returns:
            The subset of rows flagged by ``isColumnNotNumeric`` for
            ``self.columns_of_interest``.
        """
        frame = self.data if dataFrame is None else dataFrame
        return frame[self.isColumnNotNumeric(self.columns_of_interest, frame)]

    def numberOrZero(self, value):
        """Parse ``value`` as a float, returning 0 when it cannot be parsed."""
        try:
            return float(value)
        except (TypeError, ValueError):
            # Only parse failures are mapped to 0; anything else propagates.
            return 0

    def Feature_Extraction(self):
        """Clean, convert and min-max scale the columns of interest.

        Requires ``data_preprocessing`` to have run. Stores, among others,
        ``data_prepared`` (scaled values indexed by ``Date``) and
        ``events_prepared`` (event indicators for the kept rows).
        """
        non_numeric_rows_count = self.getDataFrameWithNonNumericRows(self.data).shape[0]
        print("\nNon numeric rows: {0}\n".format(non_numeric_rows_count))

        # Restore the raw precipitation column so this method can be re-run
        # on its own while debugging.
        self.data = self.data.assign(PrecipitationSumInches=self.df['PrecipitationSumInches'])

        # Remember which rows held a non-numeric precipitation marker before
        # those markers are replaced by 0.
        self.has_precipitation_trace_series = self.isColumnNotNumeric(
            ['PrecipitationSumInches'], self.data).astype(int)
        self.data = self.data.assign(
            PrecipitationTrace=self.has_precipitation_trace_series.values,
            PrecipitationSumInches=self.data['PrecipitationSumInches'].apply(self.numberOrZero),
        )
        print("\n", self.data.iloc[0:10, :])

        # Rows that still contain non-numeric values are dropped everywhere
        # so that df, data and events stay row-aligned.
        remaining_non_numeric = self.getDataFrameWithNonNumericRows(self.data)
        print("\nCheck how many non numeric rows we still have:\n", remaining_non_numeric)
        self.row_indices_for_missing_values = remaining_non_numeric.index.values
        print("\nrow indices for missing values:\n", self.row_indices_for_missing_values)
        self.df = self.df.drop(self.row_indices_for_missing_values)
        self.data_prepared = self.data.drop(self.row_indices_for_missing_values)
        self.events_prepared = self.single_events.drop(self.row_indices_for_missing_values)
        print("Data rows: {0}, Events rows: {1}".format(
            self.data_prepared.shape[0], self.events_prepared.shape[0]))

        # Convert dataframe columns to be treated as numbers
        print("\ndata types:\n", self.data_prepared.dtypes)
        self.data_prepared = self.data_prepared.apply(pd.to_numeric)
        print("\nafter converting data types\n", self.data_prepared.dtypes)

        # Normalize input data
        self.data_values = self.data_prepared.values  # unscaled numpy array
        min_max_scaler = preprocessing.MinMaxScaler()
        self.data_prepared = pd.DataFrame(
            min_max_scaler.fit_transform(self.data_prepared),
            columns=self.data_prepared.columns,
            index=self.data_prepared.index,
        )

        # Attach the dates (aligned on the row labels) and use them as index.
        self.data_prepared["Date"] = pd.to_datetime(self.df["Date"])
        self.data_prepared = self.data_prepared.set_index('Date')
        print(self.data_prepared.head())
        print("\n\nafter inserting the date:\n", self.data_prepared.dtypes)

        # Final look at the prepared data
        print("\nfinal look at prepared data:\n", self.data_prepared.head())
        print("\nfinal look at event prepared\n", self.events_prepared.head())

    def model_fitting(self):
        """Fit VAR models and forecast.

        The first 80% of ``data_prepared`` is used for training and the rest
        for validation; per-column RMSE values are printed and each column is
        plotted against its forecast. A second model fitted on all rows
        produces a one-step forecast.

        Returns:
            The one-step forecast array, also stored as ``self.yhat``.
        """
        from statsmodels.tsa.vector_ar.var_model import VAR

        # creating the train and validation set
        split_at = int(0.8 * len(self.data_prepared))
        self.train = self.data_prepared[:split_at]
        self.valid = self.data_prepared[split_at:]

        # fit the model
        model_fit = VAR(endog=self.train).fit()

        # make prediction on validation; `endog` is the fitted data, the
        # non-deprecated spelling of the former `y` attribute.
        self.prediction = model_fit.forecast(model_fit.endog, steps=len(self.valid))
        print('prediction:\n', self.prediction)
        print("\n\n predection 1 chunk:\n", self.prediction[0])

        # Wrap the forecast array directly: float dtype, the column names of
        # the prepared data, and the validation dates as index.
        self.pred = pd.DataFrame(
            self.prediction,
            index=self.valid.index,
            columns=self.data_prepared.columns,
        )
        print("\npred:\n", self.pred)
        print("tarin shape:", self.train.shape)
        print("valid shape:", self.valid.shape)
        print("pred shape:", self.pred.shape)

        for column in self.data_prepared.columns:
            rmse = np.sqrt(mean_squared_error(self.valid[column], self.pred[column]))
            print('rmse value for', column, 'is : ', rmse)

        # make final predictions
        final_fit = VAR(endog=self.data_prepared).fit()
        self.yhat = final_fit.forecast(final_fit.endog, steps=1)
        print("\nyaht:\n", self.yhat)
        print(self.pred)

        for col in self.pred.columns:
            plt.figure(figsize=(15, 8))
            plt.plot(self.valid[col])
            plt.plot(self.pred[col])
            plt.xlabel("Date")
            plt.ylabel(col)
            plt.title("Date and " + col + " relationship")
            plt.legend(['valid', 'prediction'], loc='upper left')
            plt.show()
        return self.yhat
# Run the whole pipeline when the file is executed or imported.
# NOTE: this rebinds the module-level name `ts` from the class to an instance
# of it; from here on `ts.<method>` resolves on that instance and the class
# itself is no longer reachable through the name `ts`.
ts=ts()
# Load the CSV, plot the event counts and build the per-event indicator table.
ts.data_preprocessing()
# Clean the numeric columns, scale them and index the result by date.
ts.Feature_Extraction()
# Fit the VAR models; `temp` receives whatever model_fitting() returns.
temp=ts.model_fitting()