비즈니스 애널리틱스 2021-2 프로젝트
- e비즈니스학과 201823871 박지헌
import numpy as np
import pandas as pd
import datetime
import seaborn as sns
import matplotlib.pyplot as plt
plt.rcParams['axes.unicode_minus'] = False
from matplotlib import font_manager, rc
font_name = font_manager.FontProperties(fname="c:/Windows/Fonts/malgun.ttf").get_name()
rc('font', family=font_name)
%matplotlib inline
from fbprophet import Prophet
from statsmodels.stats.outliers_influence import variance_inflation_factor
from matplotlib import pyplot
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import QuantileTransformer
from sklearn import metrics # 나머지는 sklearn에서 활용
from sklearn.metrics import r2_score, mean_squared_error,mean_absolute_error,mean_squared_log_error
import eli5
from eli5.sklearn import PermutationImportance
# pd.set_option('mode.chained_assignment', None)
from sklearn import tree
from sklearn.ensemble import RandomForestRegressor
df = pd.read_csv('data/rice_weather3.csv')
df['date']=pd.to_datetime(df['date'])
# pd.set_option("max_rows", None)
fig = plt.figure(figsize=(10, 4))
df.isnull().sum().plot(kind='bar')
<AxesSubplot:>
df['WTI'].interpolate(method='linear',limit_direction='forward',inplace=True)
df.dropna(inplace=True)
# pd.set_option("max_rows", None)
fig = plt.figure(figsize=(10, 4))
df.isnull().sum().plot(kind='bar')
<AxesSubplot:>
df.plot(kind='scatter',x='date', y='WTI')
<AxesSubplot:xlabel='date', ylabel='WTI'>
cond = (df["WTI"] < 0)
df[cond]
df.drop(6392,inplace=True)
df.plot(kind='scatter',x='date', y='WTI')
<AxesSubplot:xlabel='date', ylabel='WTI'>
# df.to_csv("data/rice_df.csv",index=False)
df=pd.read_csv("data/rice_df2.csv")
df['date']=pd.to_datetime(df['date'])
df
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
</style>
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
date | avg_price | consumption | CPI | consumer_price_growth_rate | food_non_alcoholic_beverages | Liquor_tobacco | clothes_shoes | housing_water_electricity_fuel | household_goods_housekeeping_services | ... | export | import | price_Thailand | price_US | temperature | precipitation | wind_speed | relative_humidity | total_insolation | maximum_snow_depth | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2000-01-03 | 41900 | 93.600 | 66.6 | 2.3 | 0.8 | 0.8 | 1.5 | 3.7 | -2.5 | ... | 0 | 43034642 | 247.500000 | 0.499000 | 0.692857 | 0.000000 | 2.100000 | 56.707143 | 4.188571 | 0.000000 |
1 | 2000-01-04 | 41900 | 93.600 | 66.6 | 2.3 | 0.8 | 0.8 | 1.5 | 3.7 | -2.5 | ... | 0 | 43034642 | 247.500000 | 0.499000 | 1.428571 | 0.000000 | 1.271429 | 56.035714 | 3.552143 | 0.000000 |
2 | 2000-01-05 | 41800 | 93.600 | 66.6 | 2.3 | 0.8 | 0.8 | 1.5 | 3.7 | -2.5 | ... | 0 | 43034642 | 247.500000 | 0.499000 | 5.642857 | 10.550000 | 2.450000 | 75.957143 | 1.521429 | 1.185714 |
3 | 2000-01-06 | 41800 | 93.600 | 66.6 | 2.3 | 0.8 | 0.8 | 1.5 | 3.7 | -2.5 | ... | 0 | 43034642 | 247.500000 | 0.499000 | 5.171429 | 5.421429 | 3.078571 | 75.964286 | 1.362857 | 0.428571 |
4 | 2000-01-07 | 41900 | 93.600 | 66.6 | 2.3 | 0.8 | 0.8 | 1.5 | 3.7 | -2.5 | ... | 0 | 43034642 | 247.500000 | 0.499000 | -3.750000 | 0.442857 | 2.935714 | 56.807143 | 3.915000 | 1.028571 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
5636 | 2021-11-05 | 54020 | 52.609 | 105.4 | 0.5 | 4.4 | 0.3 | 0.7 | 0.5 | 0.0 | ... | 196359 | 36667741 | 496.120606 | 0.745517 | 12.220000 | 4.393333 | 1.293333 | 72.720000 | 6.414667 | 0.000000 |
5637 | 2021-11-08 | 53880 | 52.609 | 105.4 | 0.5 | 4.4 | 0.3 | 0.7 | 0.5 | 0.0 | ... | 196359 | 36667741 | 496.120606 | 0.745517 | 9.786667 | 19.986667 | 2.686667 | 78.246667 | 2.232667 | 0.000000 |
5638 | 2021-11-09 | 53880 | 52.609 | 105.4 | 0.5 | 4.4 | 0.3 | 0.7 | 0.5 | 0.0 | ... | 196359 | 36667741 | 496.120606 | 0.745517 | 7.020000 | 1.226667 | 2.260000 | 72.180000 | 4.563333 | 0.000000 |
5639 | 2021-11-10 | 53880 | 52.609 | 105.4 | 0.5 | 4.4 | 0.3 | 0.7 | 0.5 | 0.0 | ... | 196359 | 36667741 | 496.120606 | 0.745517 | 6.453333 | 3.106667 | 2.253333 | 74.633333 | 4.077333 | 0.000000 |
5640 | 2021-11-11 | 53880 | 52.609 | 105.4 | 0.5 | 4.4 | 0.3 | 0.7 | 0.5 | 0.0 | ... | 196359 | 36667741 | 496.120606 | 0.745517 | 6.480000 | 1.766667 | 2.213333 | 66.660000 | 5.422000 | 0.000000 |
5641 rows × 31 columns
# 품목별 소비자 물가지수
consumer_vars = df[['date','avg_price', 'consumption', 'CPI', 'consumer_price_growth_rate',
'food_non_alcoholic_beverages', 'Liquor_tobacco', 'clothes_shoes',
'housing_water_electricity_fuel',
'household_goods_housekeeping_services', 'sanitation', 'traffic',
'communication', 'entertainment_culture', 'education',
'food_accommodation', 'other_goods_services']]
consumer_vars
plt.rcParams['figure.figsize']=[12,7] #크기조정 [가로,높이]인치
plt.title("쌀 가격 - 소비자 물가지수 상관계수", fontsize=15)
sns.heatmap(consumer_vars.corr(),annot=True)
C:\Users\82104\anaconda3\lib\site-packages\matplotlib\backends\backend_agg.py:238: RuntimeWarning: Glyph 8722 missing from current font.
font.set_text(s, 0.0, flags=flags)
C:\Users\82104\anaconda3\lib\site-packages\matplotlib\backends\backend_agg.py:201: RuntimeWarning: Glyph 8722 missing from current font.
font.set_text(s, 0, flags=flags)
<AxesSubplot:title={'center':'쌀 가격 - 소비자 물가지수 상관계수'}>
consumer_vars.hist(bins=50, figsize=(20, 15))
array([[<AxesSubplot:title={'center':'date'}>,
<AxesSubplot:title={'center':'avg_price'}>,
<AxesSubplot:title={'center':'consumption'}>,
<AxesSubplot:title={'center':'CPI'}>],
[<AxesSubplot:title={'center':'consumer_price_growth_rate'}>,
<AxesSubplot:title={'center':'food_non_alcoholic_beverages'}>,
<AxesSubplot:title={'center':'Liquor_tobacco'}>,
<AxesSubplot:title={'center':'clothes_shoes'}>],
[<AxesSubplot:title={'center':'housing_water_electricity_fuel'}>,
<AxesSubplot:title={'center':'household_goods_housekeeping_services'}>,
<AxesSubplot:title={'center':'sanitation'}>,
<AxesSubplot:title={'center':'traffic'}>],
[<AxesSubplot:title={'center':'communication'}>,
<AxesSubplot:title={'center':'entertainment_culture'}>,
<AxesSubplot:title={'center':'education'}>,
<AxesSubplot:title={'center':'food_accommodation'}>],
[<AxesSubplot:title={'center':'other_goods_services'}>,
<AxesSubplot:title={'center':'Year'}>, <AxesSubplot:>,
<AxesSubplot:>]], dtype=object)
consumer_vars['Year'] = df['date'].dt.year;consumer_vars
plt.figure(figsize=(12,8))
sns.pairplot(consumer_vars,
x_vars=['consumption', 'CPI','consumer_price_growth_rate',
'food_non_alcoholic_beverages', 'Liquor_tobacco', 'clothes_shoes',
'housing_water_electricity_fuel',
'household_goods_housekeeping_services', 'sanitation', 'traffic',
'communication', 'entertainment_culture', 'education',
'food_accommodation', 'other_goods_services'],
y_vars=['avg_price'],
kind='scatter',
hue='Year',
palette='husl')
plt.show()
<Figure size 864x576 with 0 Axes>
# 재배면적 당 수확량
local_vars = df[['avg_price','total_Production_per_cultivation_area',
'world_Production_per_cultivation_area']]
plt.rcParams['figure.figsize']=[12,7] #크기조정 [가로,높이]인치
plt.title("쌀 가격 - 지역별 재배면적 당 수확량 상관계수", fontsize=15)
sns.heatmap(local_vars.corr(),annot=True)
<AxesSubplot:title={'center':'쌀 가격 - 지역별 재배면적 당 수확량 상관계수'}>
local_vars.hist(bins=50, figsize=(20, 15))
array([[<AxesSubplot:title={'center':'avg_price'}>,
<AxesSubplot:title={'center':'total_Production_per_cultivation_area'}>],
[<AxesSubplot:title={'center':'world_Production_per_cultivation_area'}>,
<AxesSubplot:>]], dtype=object)
local_vars['Year'] = df['date'].dt.year;local_vars
plt.figure(figsize=(12,8))
sns.pairplot(local_vars,
x_vars=['total_Production_per_cultivation_area',
'world_Production_per_cultivation_area'],
y_vars=['avg_price'],
kind='scatter',
hue='Year',
palette='husl')
plt.show()
<ipython-input-135-a478685de506>:1: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
local_vars['Year'] = df['date'].dt.year;local_vars
<Figure size 864x576 with 0 Axes>
# 국내 & 국제
international_vars = df[['avg_price','WTI', 'domestic', 'export',
'import', 'price_Thailand', 'price_US']]
plt.rcParams['figure.figsize']=[12,7] #크기조정 [가로,높이]인치
plt.title("쌀 가격 - 국내 & 국제 상관계수", fontsize=15)
sns.heatmap(international_vars.corr(),annot=True)
<AxesSubplot:title={'center':'쌀 가격 - 국내 & 국제 상관계수'}>
international_vars.hist(bins=50, figsize=(20, 15))
array([[<AxesSubplot:title={'center':'avg_price'}>,
<AxesSubplot:title={'center':'WTI'}>,
<AxesSubplot:title={'center':'domestic'}>],
[<AxesSubplot:title={'center':'export'}>,
<AxesSubplot:title={'center':'import'}>,
<AxesSubplot:title={'center':'price_Thailand'}>],
[<AxesSubplot:title={'center':'price_US'}>, <AxesSubplot:>,
<AxesSubplot:>]], dtype=object)
international_vars['Year'] = df['date'].dt.year
plt.figure(figsize=(12,8))
sns.pairplot(international_vars,
x_vars=['WTI', 'domestic', 'export',
'import', 'price_Thailand', 'price_US'],
y_vars=['avg_price'],
kind='scatter',
hue='Year',
palette='husl')
plt.show()
<ipython-input-138-271b388c4644>:1: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
international_vars['Year'] = df['date'].dt.year
<Figure size 864x576 with 0 Axes>
weather_vars = df[['avg_price','temperature',
'precipitation', 'wind_speed', 'relative_humidity',
'total_insolation', 'maximum_snow_depth']]
plt.rcParams['figure.figsize']=[15,15] #크기조정 [가로,높이]인치
plt.title("쌀 가격 - 지역별 기상 상관계수", fontsize=15)
sns.heatmap(weather_vars.corr(),annot=True)
<AxesSubplot:title={'center':'쌀 가격 - 지역별 기상 상관계수'}>
weather_vars['Year'] = df['date'].dt.year
plt.figure(figsize=(12,8))
sns.pairplot(weather_vars,
x_vars=['temperature',
'precipitation', 'wind_speed', 'relative_humidity',
'maximum_snow_depth','total_insolation'],
y_vars=['avg_price'],
kind='scatter',
hue='Year',
palette='husl')
plt.show()
C:\Users\82104\anaconda3\lib\site-packages\matplotlib\backends\backend_agg.py:238: RuntimeWarning: Glyph 8722 missing from current font.
font.set_text(s, 0.0, flags=flags)
C:\Users\82104\anaconda3\lib\site-packages\matplotlib\backends\backend_agg.py:201: RuntimeWarning: Glyph 8722 missing from current font.
font.set_text(s, 0, flags=flags)
<Figure size 864x576 with 0 Axes>
- Facebook Prophet은 페이스북이 개발한 일변량(하나의 변수) 시계열 예측을 위한 오픈소스 라이브러리이다.
- additive time series forecasting model을 참고해서 구현되었다.
- 경향성(Trends), 계절성(seasonality), 휴일(holidays) 을 지원하는 모델이다.
pro_df = df[['date', 'avg_price']]
plt.figure(figsize=(12,7))
plt.title("쌀 가격 추이", fontsize=15)
pro_df['avg_price'].plot()
pyplot.show()
# 컬럼명 변경
pro_df.columns = ['ds', 'y']
# 데이터 타입 변경
pro_df['ds']= pd.to_datetime(pro_df['ds'])
# 모델 생성
model = Prophet(yearly_seasonality=True, daily_seasonality=True)
# 모델 학습
model.fit(pro_df)
<ipython-input-47-aaaae1762298>:5: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
pro_df['ds']= pd.to_datetime(pro_df['ds'])
<fbprophet.forecaster.Prophet at 0x1663ac28af0>
pro_df['ds']= pd.to_datetime(pro_df['ds'])
pro_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6779 entries, 0 to 6778
Data columns (total 2 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 ds 6779 non-null datetime64[ns]
1 y 6779 non-null int64
dtypes: datetime64[ns](1), int64(1)
memory usage: 106.0 KB
<ipython-input-12-687ef13da825>:1: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
pro_df['ds']= pd.to_datetime(pro_df['ds'])
# train set 2016-01 ~ 2021-10
last_year = list()
for i in range(1, 13):
last_year.append(['2016-%02d' % i])
for i in range(1, 13):
last_year.append(['2017-%02d' % i])
for i in range(1, 13):
last_year.append(['2018-%02d' % i])
for i in range(1, 13):
last_year.append(['2019-%02d' % i])
for i in range(1, 13):
last_year.append(['2020-%02d' % i])
for i in range(1, 11):
last_year.append(['2021-%02d' % i])
last_year = pd.DataFrame(last_year, columns = ['ds'])
last_year['ds']= pd.to_datetime(last_year['ds'])
# 에측
forecast = model.predict(last_year)
# 예측값, 하한선, 상한선
print(forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']].head())
ds yhat yhat_lower yhat_upper
0 2016-01-01 35171.509857 33093.252195 37215.323197
1 2016-02-01 34608.385330 32664.519028 36555.377296
2 2016-03-01 33956.563327 31895.450321 36189.745461
3 2016-04-01 33240.021981 31225.663302 35526.980510
4 2016-05-01 32674.528164 30593.537637 34722.859733
model.plot(forecast)
pyplot.show()
- 검은색 점들이 학습으로 사용된 데이터, 파란 색선이 예측한 값
- 하늘색 범위는 상한선과 하한선
- 실제 데이터와 큰 차이가 없는 걸 보아 잘 학습된 것을 알 수 있다.
- 학습 데이터 내부에 있었던 기간 외에 미래를 예측
# train set 이후 3년 날짜 생성 2021-11 ~ 2024-12
last_3year = list()
for i in range(11, 13):
last_3year.append(['2021-%02d' % i])
for i in range(1, 13):
last_3year.append(['2022-%02d' % i])
for i in range(1, 13):
last_3year.append(['2023-%02d' % i])
for i in range(1, 13):
last_3year.append(['2024-%02d' % i])
last_3year = pd.DataFrame(last_3year, columns = ['ds'])
last_3year['ds']= pd.to_datetime(last_3year['ds'])
forecast = model.predict(last_3year)
model.plot(forecast)
pyplot.show()
plt.savefig("data/output/pro_outsample.png", format = "png", dpi = 300)
<Figure size 864x504 with 0 Axes>
- 학습 데이터 내부에 있었던 기간 외에 미래를 예측
# 마지막 20% 제외시키기
train = pro_df.drop(pro_df.index[-1355:])
y_true = pro_df['y'][-1355:].values
# 모델 생성 후 학습
model = Prophet(yearly_seasonality=True, daily_seasonality=True)
model.fit(train)
test = pro_df.drop(pro_df.index[:-1355])
last_year = list()
last_year = test['ds'].astype(str).values.tolist()
last_year = pd.DataFrame(last_year, columns = ['ds'])
last_year['ds']= pd.to_datetime(last_year['ds'])
# 예측하고 비교하기
forecast = model.predict(last_year)
y_pred = forecast['yhat'].values
from sklearn.metrics import mean_absolute_error
mae = mean_absolute_error(y_true, y_pred)
print('MAE: %.3f' % mae)
pyplot.plot(y_true, label='Actual')
pyplot.plot(y_pred, label='Predicted')
pyplot.legend()
pyplot.show()
plt.savefig("data/output/pro_평가.png", format = "png", dpi = 300)
print('MAE: %.3f' % mae)
MAE: 15126.220
MAE: 15126.220
<Figure size 864x504 with 0 Axes>
MAE: 11816.284
- mae가 작을수록 좋은 모델이다.
- 유의미한 예측 실패
- 경향성, 계절성, 휴일 등의 규칙적인 주기성으로 농산물 가격이 예측되지않아, 좀 더 다양한 요인이 필요해 보인다.
$$ y = a + b1x1 + b2x2 + b3x3 ... bnxn + e $$
- 현재 데이터의 열 83개
- 변수를 제거해야하는지 확인하는 작업이 사전에 필요하다.
- 그리고 다중공선성/과적합 등 문제가 발생하면 변수를 가공
# 날짜 제거
df2 = df.iloc[:,1:]
# 스케일링 전
def get_model1(seed):
df_train, df_test = train_test_split(df2, test_size=0.5, random_state=seed)
model = sm.OLS(df_train.avg_price,df_train.iloc[:,1:])
return df_train, df_test, model.fit()
df_train, df_test, result1 = get_model1(3)
#print(result1.summary())
print(result1.rsquared_adj)
test1 = []
for i in range(10):
df_train, df_test, result = get_model1(i)
y_predict = result1.predict(df_test.iloc[:,1:])
test1.append(r2_score(df_test['avg_price'],y_predict))
test1
0.9976024446327592
[0.861474789945303,
0.8655275605047131,
0.8657811107074469,
0.860655376334104,
0.8616771658349666,
0.8620948216271576,
0.8646700556289367,
0.8673016307493058,
0.8530760532860246,
0.858965681812265]
# 스케일링 후
x = df2.iloc[:,1:]
X = sm.add_constant(x)
y = df2.avg_price
def get_model1(seed):
train_x, test_x, train_y, test_y = train_test_split(X,y, train_size = 0.7, test_size = 0.3, random_state=seed)
#학습
scaler = QuantileTransformer()
scaler.fit(train_x)
train_X_scaled = pd.DataFrame(scaler.transform(train_x),columns=train_x.columns,index=train_x.index)
test_X_scaled = pd.DataFrame(scaler.transform(test_x),columns=test_x.columns,index=test_x.index)
model = sm.OLS(train_y,train_X_scaled)
return train_X_scaled, train_y, test_X_scaled, test_y, model.fit()
train_X_scaled, train_y, test_X_scaled, test_y, result1 = get_model1(3)
#print(result1.summary())
print(result1.rsquared_adj)
test1 = []
for i in range(10):
train_X_scaled, train_y, test_X_scaled, test_y, result1 = get_model1(i)
y_predict = result1.predict(test_X_scaled)
test1.append(r2_score(test_y,y_predict))
test1
0.9978385163377207
[0.8784226730474811,
0.8764266190047516,
0.8778216083353005,
0.8715207644357739,
0.8736299000329093,
0.8735042775939541,
0.8841989791997369,
0.8873244856050717,
0.8670479800551076,
0.8728588216571687]
# 전체 데이터 사용모델
x = df2.iloc[:,1:]
X = sm.add_constant(x)
y = df2.avg_price
train_x, test_x, train_y, test_y = train_test_split(X,y, train_size = 0.7, test_size = 0.3)
#학습
scaler = QuantileTransformer()
scaler.fit(train_x)
train_X_scaled = pd.DataFrame(scaler.transform(train_x),columns=train_x.columns,index=train_x.index)
test_X_scaled = pd.DataFrame(scaler.transform(test_x),columns=test_x.columns,index=test_x.index)
# 회귀모델 적합
full_model = sm.OLS(train_y,train_X_scaled)
fitted_full_model = full_model.fit()
print(fitted_full_model.summary())
OLS Regression Results
=======================================================================================
Dep. Variable: avg_price R-squared (uncentered): 0.998
Model: OLS Adj. R-squared (uncentered): 0.998
Method: Least Squares F-statistic: 6.547e+04
Date: Sat, 18 Dec 2021 Prob (F-statistic): 0.00
Time: 21:55:47 Log-Likelihood: -35407.
No. Observations: 3948 AIC: 7.087e+04
Df Residuals: 3919 BIC: 7.105e+04
Df Model: 29
Covariance Type: nonrobust
=========================================================================================================
coef std err t P>|t| [0.025 0.975]
---------------------------------------------------------------------------------------------------------
const -1.483e-11 5.35e-13 -27.737 0.000 -1.59e-11 -1.38e-11
consumption 2.208e+04 711.767 31.025 0.000 2.07e+04 2.35e+04
CPI 3.96e+04 735.254 53.854 0.000 3.82e+04 4.1e+04
consumer_price_growth_rate 3.462e+04 1445.420 23.948 0.000 3.18e+04 3.74e+04
food_non_alcoholic_beverages -1.129e+04 673.755 -16.756 0.000 -1.26e+04 -9968.228
Liquor_tobacco -3063.3902 176.660 -17.341 0.000 -3409.745 -2717.035
clothes_shoes 7471.8682 311.873 23.958 0.000 6860.419 8083.318
housing_water_electricity_fuel 5577.4519 321.735 17.336 0.000 4946.668 6208.236
household_goods_housekeeping_services -1.664e+04 398.447 -41.758 0.000 -1.74e+04 -1.59e+04
sanitation 3914.7213 231.108 16.939 0.000 3461.618 4367.825
traffic -2.028e+04 536.474 -37.806 0.000 -2.13e+04 -1.92e+04
communication -1.761e+04 288.407 -61.051 0.000 -1.82e+04 -1.7e+04
entertainment_culture -7149.0163 338.008 -21.150 0.000 -7811.704 -6486.329
education 1.428e+04 527.519 27.073 0.000 1.32e+04 1.53e+04
food_accommodation 8025.8560 366.947 21.872 0.000 7306.432 8745.280
other_goods_services -1.202e+04 245.106 -49.060 0.000 -1.25e+04 -1.15e+04
total_Production_per_cultivation_area 9631.1792 288.623 33.369 0.000 9065.313 1.02e+04
world_Production_per_cultivation_area 2.17e+04 758.490 28.613 0.000 2.02e+04 2.32e+04
WTI 5596.6162 419.427 13.343 0.000 4774.301 6418.931
domestic -1598.4632 469.810 -3.402 0.001 -2519.559 -677.368
export -3811.2542 189.282 -20.135 0.000 -4182.355 -3440.153
import -260.3490 136.398 -1.909 0.056 -527.767 7.069
price_Thailand -1281.1071 347.199 -3.690 0.000 -1961.815 -600.400
price_US 5771.5101 272.958 21.144 0.000 5236.358 6306.662
temperature 409.5936 238.135 1.720 0.086 -57.286 876.474
precipitation 199.8339 127.634 1.566 0.118 -50.401 450.069
wind_speed 142.4403 123.049 1.158 0.247 -98.806 383.686
relative_humidity -717.8333 217.326 -3.303 0.001 -1143.917 -291.750
total_insolation 45.0479 193.523 0.233 0.816 -334.368 424.464
maximum_snow_depth -135.9929 117.816 -1.154 0.248 -366.979 94.993
==============================================================================
Omnibus: 3.636 Durbin-Watson: 2.007
Prob(Omnibus): 0.162 Jarque-Bera (JB): 3.682
Skew: 0.070 Prob(JB): 0.159
Kurtosis: 2.945 Cond. No. 1.30e+16
==============================================================================
Notes:
[1] R² is computed without centering (uncentered) since the model does not contain a constant.
[2] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[3] The smallest eigenvalue is 1.65e-28. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.
# VIF를 통한 다중공선성 확인
vif = pd.DataFrame()
vif["VIF Factor"] = [variance_inflation_factor(X.values, i) for i in range(x.shape[1])]
vif["features"] = x.columns
vif
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
</style>
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
VIF Factor | features | |
---|---|---|
0 | 71192.149514 | consumption |
1 | 194.196480 | CPI |
2 | 536.519590 | consumer_price_growth_rate |
3 | 1524.927314 | food_non_alcoholic_beverages |
4 | 126.915435 | Liquor_tobacco |
5 | 22.171916 | clothes_shoes |
6 | 6.908269 | housing_water_electricity_fuel |
7 | 130.565433 | household_goods_housekeeping_services |
8 | 37.807617 | sanitation |
9 | 15.929721 | traffic |
10 | 211.791133 | communication |
11 | 28.051193 | entertainment_culture |
12 | 28.370781 | education |
13 | 57.536508 | food_accommodation |
14 | 24.854851 | other_goods_services |
15 | 23.450257 | total_Production_per_cultivation_area |
16 | 25.279687 | world_Production_per_cultivation_area |
17 | 148.022943 | WTI |
18 | 17.782151 | domestic |
19 | 29.928931 | export |
20 | 1.147255 | import |
21 | 1.414651 | price_Thailand |
22 | 6.798047 | price_US |
23 | 11.566978 | temperature |
24 | 4.795313 | precipitation |
25 | 1.704532 | wind_speed |
26 | 1.270737 | relative_humidity |
27 | 4.150746 | total_insolation |
28 | 3.597733 | maximum_snow_depth |
import time
def processSubset(X,y,feature_set):
model = sm.OLS(y,X[list(feature_set)]) # Modeling
regr = model.fit() # model fitting
AIC = regr.aic # model's AIC
return {"model" : regr, "AIC" : AIC}
def forward(X,y,predictors):
# predictor - 현재 선택되어있는 변수
# 데이터 변수들이 미리정의된 predictors에 있는지 없는지 확인 및 분류
remaining_predictors = [p for p in X.columns.difference(['const']) if p not in predictors]
tic = time.time()
results = []
for p in remaining_predictors :
results.append(processSubset(X=X,y=y,feature_set=predictors+[p]+['const']))
# 데이터프레임으로 변환
models = pd.DataFrame(results)
# AIC가 가장 낮은 것을 선택
best_model = models.loc[models['AIC'].argmin()]
toc = time.time()
print("Processed ",models.shape[0], "models on", len(predictors)+1, "predictors in", (toc-tic))
print("Selected predictors:",best_model["model"].model.exog_names,"AIC: ",best_model[1])
return best_model
### 전진선택법 모델
def forward_model(X,y):
X['const'] = np.ones([len(train_x.iloc[:,0]), 1])
Fmodels = pd.DataFrame(columns=["AIC","model"])
tic = time.time()
# 미리 정의된 데이터 변수
predictors = []
# 변수 1~10개 : 0-9 -> 1-10
for i in range(1,len(X.columns.difference(['const']))+1):
Forward_result = forward(X=X,y=y,predictors=predictors)
if i > 1 :
if Forward_result["AIC"] > Fmodel_before:
break
Fmodels.loc[i] = Forward_result
predictors = Fmodels.loc[i]["model"].model.exog_names
Fmodel_before = Fmodels.loc[i]["AIC"]
predictors = [k for k in predictors if k != 'const']
toc = time.time()
print("Total elapsed time:",(toc-tic), "seconds.")
return (Fmodels['model'][len(Fmodels['model'])])
Forward_best_model = forward_model(X=train_X_scaled, y=train_y.astype(float))
print(Forward_best_model.summary())
Processed 29 models on 1 predictors in 0.07300806045532227
Selected predictors: ['other_goods_services', 'const'] AIC: 78105.37952774977
Processed 28 models on 2 predictors in 0.03800606727600098
Selected predictors: ['other_goods_services', 'communication', 'const'] AIC: 77576.98363868354
Processed 27 models on 3 predictors in 0.038033246994018555
Selected predictors: ['other_goods_services', 'communication', 'world_Production_per_cultivation_area', 'const'] AIC: 75864.84417359797
Processed 26 models on 4 predictors in 0.034215450286865234
Selected predictors: ['other_goods_services', 'communication', 'world_Production_per_cultivation_area', 'total_Production_per_cultivation_area', 'const'] AIC: 75387.42122793631
Processed 25 models on 5 predictors in 0.03680706024169922
Selected predictors: ['other_goods_services', 'communication', 'world_Production_per_cultivation_area', 'total_Production_per_cultivation_area', 'Liquor_tobacco', 'const'] AIC: 74817.73112899458
Processed 24 models on 6 predictors in 0.056111812591552734
Selected predictors: ['other_goods_services', 'communication', 'world_Production_per_cultivation_area', 'total_Production_per_cultivation_area', 'Liquor_tobacco', 'export', 'const'] AIC: 74514.38628771537
Processed 23 models on 7 predictors in 0.05683708190917969
Selected predictors: ['other_goods_services', 'communication', 'world_Production_per_cultivation_area', 'total_Production_per_cultivation_area', 'Liquor_tobacco', 'export', 'food_non_alcoholic_beverages', 'const'] AIC: 74119.18186452621
Processed 22 models on 8 predictors in 0.050232648849487305
Selected predictors: ['other_goods_services', 'communication', 'world_Production_per_cultivation_area', 'total_Production_per_cultivation_area', 'Liquor_tobacco', 'export', 'food_non_alcoholic_beverages', 'traffic', 'const'] AIC: 73582.80712896092
Processed 21 models on 9 predictors in 0.04754281044006348
Selected predictors: ['other_goods_services', 'communication', 'world_Production_per_cultivation_area', 'total_Production_per_cultivation_area', 'Liquor_tobacco', 'export', 'food_non_alcoholic_beverages', 'traffic', 'price_US', 'const'] AIC: 73300.80603019353
Processed 20 models on 10 predictors in 0.05469703674316406
Selected predictors: ['other_goods_services', 'communication', 'world_Production_per_cultivation_area', 'total_Production_per_cultivation_area', 'Liquor_tobacco', 'export', 'food_non_alcoholic_beverages', 'traffic', 'price_US', 'housing_water_electricity_fuel', 'const'] AIC: 73172.45689465525
Processed 19 models on 11 predictors in 0.06541705131530762
Selected predictors: ['other_goods_services', 'communication', 'world_Production_per_cultivation_area', 'total_Production_per_cultivation_area', 'Liquor_tobacco', 'export', 'food_non_alcoholic_beverages', 'traffic', 'price_US', 'housing_water_electricity_fuel', 'household_goods_housekeeping_services', 'const'] AIC: 73007.98393391632
Processed 18 models on 12 predictors in 0.05470633506774902
Selected predictors: ['other_goods_services', 'communication', 'world_Production_per_cultivation_area', 'total_Production_per_cultivation_area', 'Liquor_tobacco', 'export', 'food_non_alcoholic_beverages', 'traffic', 'price_US', 'housing_water_electricity_fuel', 'household_goods_housekeeping_services', 'education', 'const'] AIC: 72600.22749551709
Processed 17 models on 13 predictors in 0.06354212760925293
Selected predictors: ['other_goods_services', 'communication', 'world_Production_per_cultivation_area', 'total_Production_per_cultivation_area', 'Liquor_tobacco', 'export', 'food_non_alcoholic_beverages', 'traffic', 'price_US', 'housing_water_electricity_fuel', 'household_goods_housekeeping_services', 'education', 'sanitation', 'const'] AIC: 72213.4779500062
Processed 16 models on 14 predictors in 0.07707476615905762
Selected predictors: ['other_goods_services', 'communication', 'world_Production_per_cultivation_area', 'total_Production_per_cultivation_area', 'Liquor_tobacco', 'export', 'food_non_alcoholic_beverages', 'traffic', 'price_US', 'housing_water_electricity_fuel', 'household_goods_housekeeping_services', 'education', 'sanitation', 'WTI', 'const'] AIC: 72016.66207822859
Processed 15 models on 15 predictors in 0.052005767822265625
Selected predictors: ['other_goods_services', 'communication', 'world_Production_per_cultivation_area', 'total_Production_per_cultivation_area', 'Liquor_tobacco', 'export', 'food_non_alcoholic_beverages', 'traffic', 'price_US', 'housing_water_electricity_fuel', 'household_goods_housekeeping_services', 'education', 'sanitation', 'WTI', 'consumption', 'const'] AIC: 71794.57289203294
Processed 14 models on 16 predictors in 0.057944536209106445
Selected predictors: ['other_goods_services', 'communication', 'world_Production_per_cultivation_area', 'total_Production_per_cultivation_area', 'Liquor_tobacco', 'export', 'food_non_alcoholic_beverages', 'traffic', 'price_US', 'housing_water_electricity_fuel', 'household_goods_housekeeping_services', 'education', 'sanitation', 'WTI', 'consumption', 'consumer_price_growth_rate', 'const'] AIC: 71545.01099081835
Processed 13 models on 17 predictors in 0.06377935409545898
Selected predictors: ['other_goods_services', 'communication', 'world_Production_per_cultivation_area', 'total_Production_per_cultivation_area', 'Liquor_tobacco', 'export', 'food_non_alcoholic_beverages', 'traffic', 'price_US', 'housing_water_electricity_fuel', 'household_goods_housekeeping_services', 'education', 'sanitation', 'WTI', 'consumption', 'consumer_price_growth_rate', 'food_accommodation', 'const'] AIC: 71421.46799093019
Processed 12 models on 18 predictors in 0.0765688419342041
Selected predictors: ['other_goods_services', 'communication', 'world_Production_per_cultivation_area', 'total_Production_per_cultivation_area', 'Liquor_tobacco', 'export', 'food_non_alcoholic_beverages', 'traffic', 'price_US', 'housing_water_electricity_fuel', 'household_goods_housekeeping_services', 'education', 'sanitation', 'WTI', 'consumption', 'consumer_price_growth_rate', 'food_accommodation', 'entertainment_culture', 'const'] AIC: 71227.49684651385
Processed 11 models on 19 predictors in 0.06574892997741699
Selected predictors: ['other_goods_services', 'communication', 'world_Production_per_cultivation_area', 'total_Production_per_cultivation_area', 'Liquor_tobacco', 'export', 'food_non_alcoholic_beverages', 'traffic', 'price_US', 'housing_water_electricity_fuel', 'household_goods_housekeeping_services', 'education', 'sanitation', 'WTI', 'consumption', 'consumer_price_growth_rate', 'food_accommodation', 'entertainment_culture', 'clothes_shoes', 'const'] AIC: 70710.53444193077
Processed 10 models on 20 predictors in 0.05604743957519531
Selected predictors: ['other_goods_services', 'communication', 'world_Production_per_cultivation_area', 'total_Production_per_cultivation_area', 'Liquor_tobacco', 'export', 'food_non_alcoholic_beverages', 'traffic', 'price_US', 'housing_water_electricity_fuel', 'household_goods_housekeeping_services', 'education', 'sanitation', 'WTI', 'consumption', 'consumer_price_growth_rate', 'food_accommodation', 'entertainment_culture', 'clothes_shoes', 'total_insolation', 'const'] AIC: 70698.9108083537
Processed 9 models on 21 predictors in 0.056960344314575195
Selected predictors: ['other_goods_services', 'communication', 'world_Production_per_cultivation_area', 'total_Production_per_cultivation_area', 'Liquor_tobacco', 'export', 'food_non_alcoholic_beverages', 'traffic', 'price_US', 'housing_water_electricity_fuel', 'household_goods_housekeeping_services', 'education', 'sanitation', 'WTI', 'consumption', 'consumer_price_growth_rate', 'food_accommodation', 'entertainment_culture', 'clothes_shoes', 'total_insolation', 'domestic', 'const'] AIC: 70690.41174407185
Processed 8 models on 22 predictors in 0.06496286392211914
Selected predictors: ['other_goods_services', 'communication', 'world_Production_per_cultivation_area', 'total_Production_per_cultivation_area', 'Liquor_tobacco', 'export', 'food_non_alcoholic_beverages', 'traffic', 'price_US', 'housing_water_electricity_fuel', 'household_goods_housekeeping_services', 'education', 'sanitation', 'WTI', 'consumption', 'consumer_price_growth_rate', 'food_accommodation', 'entertainment_culture', 'clothes_shoes', 'total_insolation', 'domestic', 'CPI', 'const'] AIC: 70685.29270576051
Processed 7 models on 23 predictors in 0.05409598350524902
Selected predictors: ['other_goods_services', 'communication', 'world_Production_per_cultivation_area', 'total_Production_per_cultivation_area', 'Liquor_tobacco', 'export', 'food_non_alcoholic_beverages', 'traffic', 'price_US', 'housing_water_electricity_fuel', 'household_goods_housekeeping_services', 'education', 'sanitation', 'WTI', 'consumption', 'consumer_price_growth_rate', 'food_accommodation', 'entertainment_culture', 'clothes_shoes', 'total_insolation', 'domestic', 'CPI', 'relative_humidity', 'const'] AIC: 70684.7916953338
Processed 6 models on 24 predictors in 0.049225807189941406
Selected predictors: ['other_goods_services', 'communication', 'world_Production_per_cultivation_area', 'total_Production_per_cultivation_area', 'Liquor_tobacco', 'export', 'food_non_alcoholic_beverages', 'traffic', 'price_US', 'housing_water_electricity_fuel', 'household_goods_housekeeping_services', 'education', 'sanitation', 'WTI', 'consumption', 'consumer_price_growth_rate', 'food_accommodation', 'entertainment_culture', 'clothes_shoes', 'total_insolation', 'domestic', 'CPI', 'relative_humidity', 'temperature', 'const'] AIC: 70675.7123405445
Processed 5 models on 25 predictors in 0.042273759841918945
Selected predictors: ['other_goods_services', 'communication', 'world_Production_per_cultivation_area', 'total_Production_per_cultivation_area', 'Liquor_tobacco', 'export', 'food_non_alcoholic_beverages', 'traffic', 'price_US', 'housing_water_electricity_fuel', 'household_goods_housekeeping_services', 'education', 'sanitation', 'WTI', 'consumption', 'consumer_price_growth_rate', 'food_accommodation', 'entertainment_culture', 'clothes_shoes', 'total_insolation', 'domestic', 'CPI', 'relative_humidity', 'temperature', 'import', 'const'] AIC: 70675.55369410984
Processed 4 models on 26 predictors in 0.03394961357116699
Selected predictors: ['other_goods_services', 'communication', 'world_Production_per_cultivation_area', 'total_Production_per_cultivation_area', 'Liquor_tobacco', 'export', 'food_non_alcoholic_beverages', 'traffic', 'price_US', 'housing_water_electricity_fuel', 'household_goods_housekeeping_services', 'education', 'sanitation', 'WTI', 'consumption', 'consumer_price_growth_rate', 'food_accommodation', 'entertainment_culture', 'clothes_shoes', 'total_insolation', 'domestic', 'CPI', 'relative_humidity', 'temperature', 'import', 'price_Thailand', 'const'] AIC: 70675.82760001883
Total elapsed time: 1.5350260734558105 seconds.
OLS Regression Results
==============================================================================
Dep. Variable: avg_price R-squared: 0.883
Model: OLS Adj. R-squared: 0.882
Method: Least Squares F-statistic: 1183.
Date: Fri, 17 Dec 2021 Prob (F-statistic): 0.00
Time: 18:29:21 Log-Likelihood: -35312.
No. Observations: 3948 AIC: 7.068e+04
Df Residuals: 3922 BIC: 7.084e+04
Df Model: 25
Covariance Type: nonrobust
=========================================================================================================
coef std err t P>|t| [0.025 0.975]
---------------------------------------------------------------------------------------------------------
other_goods_services -1.218e+04 238.683 -51.033 0.000 -1.26e+04 -1.17e+04
communication -1.779e+04 278.757 -63.818 0.000 -1.83e+04 -1.72e+04
world_Production_per_cultivation_area 2.74e+04 830.404 32.995 0.000 2.58e+04 2.9e+04
total_Production_per_cultivation_area 7505.6447 372.738 20.137 0.000 6774.866 8236.423
Liquor_tobacco -4648.7420 204.199 -22.766 0.000 -5049.088 -4248.396
export -3561.9152 182.178 -19.552 0.000 -3919.088 -3204.743
food_non_alcoholic_beverages -1.416e+04 647.357 -21.868 0.000 -1.54e+04 -1.29e+04
traffic -2.377e+04 559.617 -42.482 0.000 -2.49e+04 -2.27e+04
price_US 5108.4111 268.067 19.056 0.000 4582.846 5633.976
housing_water_electricity_fuel 4630.9450 306.903 15.089 0.000 4029.240 5232.650
household_goods_housekeeping_services -1.771e+04 369.753 -47.905 0.000 -1.84e+04 -1.7e+04
education 1.042e+04 588.096 17.717 0.000 9266.143 1.16e+04
sanitation 3495.2998 211.211 16.549 0.000 3081.207 3909.393
WTI 4486.4072 409.928 10.944 0.000 3682.715 5290.099
consumption -2.48e+04 4278.034 -5.797 0.000 -3.32e+04 -1.64e+04
consumer_price_growth_rate 4.309e+04 1448.430 29.747 0.000 4.02e+04 4.59e+04
food_accommodation 9367.3226 353.485 26.500 0.000 8674.291 1.01e+04
entertainment_culture -8428.7045 306.233 -27.524 0.000 -9029.095 -7828.314
clothes_shoes 6883.6193 327.895 20.993 0.000 6240.759 7526.480
total_insolation -141.8990 180.565 -0.786 0.432 -495.908 212.110
domestic -1489.5448 445.602 -3.343 0.001 -2363.179 -615.911
CPI -1.223e+04 4691.535 -2.607 0.009 -2.14e+04 -3033.312
relative_humidity -728.2903 198.794 -3.664 0.000 -1118.039 -338.542
temperature 632.9604 206.270 3.069 0.002 228.555 1037.366
import -193.3851 132.041 -1.465 0.143 -452.260 65.490
const 5.112e+04 4521.211 11.308 0.000 4.23e+04 6e+04
==============================================================================
Omnibus: 9.327 Durbin-Watson: 1.964
Prob(Omnibus): 0.009 Jarque-Bera (JB): 9.860
Skew: 0.084 Prob(JB): 0.00723
Kurtosis: 3.178 Cond. No. 715.
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
import itertools
def processSubset(X,y,feature_set):
model = sm.OLS(y,X[list(feature_set)]) # Modeling
regr = model.fit() # model fitting
AIC = regr.aic # model's AIC
return {"model" : regr, "AIC" : AIC}
def backward(X,y,predictors):
tic = time.time()
results = []
# 데이터 변수들이 미리 정의된 predictors 조합 확인
for combo in itertools.combinations(predictors, len(predictors) - 1):
results.append(processSubset(X=X,y=y,feature_set=list(combo)))
models = pd.DataFrame(results)
# 가장 낮은 AIC를 가진 모델을 선택
best_model = models.loc[models['AIC'].argmin()]
toc = time.time()
print("Processed ",models.shape[0], "models on", len(predictors) - 1, "predictors in",(toc-tic))
print("Selected predictors:",best_model['model'].model.exog_names,' AIC:',best_model[1])
return best_model
def backward_model(X,y) :
X['const'] = np.ones([len(train_x.iloc[:,0]), 1])
Bmodels = pd.DataFrame(columns=["AIC","model"], index = range(1,len(X.columns)))
tic = time.time()
predictors = X.columns#.difference(['const'])
Bmodel_before = processSubset(X,y,predictors)['AIC']
while (len(predictors) > 1):
Backward_result = backward(X, y, predictors=predictors)
if Backward_result['AIC'] > Bmodel_before :
break
Bmodels.loc[len(predictors) -1] = Backward_result
predictors = Bmodels.loc[len(predictors) - 1]['model'].model.exog_names
Bmodel_before = Backward_result["AIC"]
predictors = [k for k in predictors if k != 'const']
toc = time.time()
print("Total elapsed time:",(toc-tic),"seconds.")
return (Bmodels["model"].dropna().iloc[0])
Backward_best_model = backward_model(X=train_X_scaled, y=train_y.astype(float))
print(Backward_best_model.summary())
Processed 30 models on 29 predictors in 0.2884030342102051
Selected predictors: ['const', 'consumption', 'CPI', 'consumer_price_growth_rate', 'food_non_alcoholic_beverages', 'Liquor_tobacco', 'clothes_shoes', 'housing_water_electricity_fuel', 'household_goods_housekeeping_services', 'sanitation', 'traffic', 'communication', 'entertainment_culture', 'education', 'food_accommodation', 'other_goods_services', 'total_Production_per_cultivation_area', 'world_Production_per_cultivation_area', 'WTI', 'domestic', 'export', 'import', 'price_Thailand', 'price_US', 'temperature', 'precipitation', 'relative_humidity', 'total_insolation', 'maximum_snow_depth'] AIC: 70678.42124479609
Processed 28 models on 27 predictors in 0.2950406074523926
Selected predictors: ['consumption', 'CPI', 'consumer_price_growth_rate', 'food_non_alcoholic_beverages', 'Liquor_tobacco', 'clothes_shoes', 'housing_water_electricity_fuel', 'household_goods_housekeeping_services', 'sanitation', 'traffic', 'communication', 'entertainment_culture', 'education', 'food_accommodation', 'other_goods_services', 'total_Production_per_cultivation_area', 'world_Production_per_cultivation_area', 'WTI', 'domestic', 'export', 'import', 'price_Thailand', 'price_US', 'temperature', 'precipitation', 'relative_humidity', 'maximum_snow_depth'] AIC: 70792.20559916232
Total elapsed time: 0.6070828437805176 seconds.
OLS Regression Results
==============================================================================
Dep. Variable: avg_price R-squared: 0.883
Model: OLS Adj. R-squared: 0.882
Method: Least Squares F-statistic: 1057.
Date: Fri, 17 Dec 2021 Prob (F-statistic): 0.00
Time: 18:29:31 Log-Likelihood: -35310.
No. Observations: 3948 AIC: 7.068e+04
Df Residuals: 3919 BIC: 7.086e+04
Df Model: 28
Covariance Type: nonrobust
=========================================================================================================
coef std err t P>|t| [0.025 0.975]
---------------------------------------------------------------------------------------------------------
const 5.349e+04 4910.180 10.894 0.000 4.39e+04 6.31e+04
consumption -2.681e+04 4577.402 -5.857 0.000 -3.58e+04 -1.78e+04
CPI -1.425e+04 4972.618 -2.866 0.004 -2.4e+04 -4503.744
consumer_price_growth_rate 4.394e+04 1585.133 27.723 0.000 4.08e+04 4.71e+04
food_non_alcoholic_beverages -1.456e+04 716.131 -20.336 0.000 -1.6e+04 -1.32e+04
Liquor_tobacco -4668.2988 205.403 -22.728 0.000 -5071.006 -4265.592
clothes_shoes 6915.5141 328.439 21.056 0.000 6271.587 7559.442
housing_water_electricity_fuel 4474.5162 328.146 13.636 0.000 3831.163 5117.870
household_goods_housekeeping_services -1.79e+04 395.842 -45.212 0.000 -1.87e+04 -1.71e+04
sanitation 3370.4022 230.279 14.636 0.000 2918.924 3821.880
traffic -2.404e+04 598.392 -40.174 0.000 -2.52e+04 -2.29e+04
communication -1.789e+04 291.636 -61.331 0.000 -1.85e+04 -1.73e+04
entertainment_culture -8652.0249 350.070 -24.715 0.000 -9338.362 -7965.688
education 1.007e+04 647.997 15.539 0.000 8798.632 1.13e+04
food_accommodation 9509.9228 370.136 25.693 0.000 8784.245 1.02e+04
other_goods_services -1.218e+04 238.830 -51.006 0.000 -1.27e+04 -1.17e+04
total_Production_per_cultivation_area 7498.3075 372.794 20.114 0.000 6767.420 8229.195
world_Production_per_cultivation_area 2.729e+04 834.667 32.691 0.000 2.56e+04 2.89e+04
WTI 4422.9561 412.255 10.729 0.000 3614.702 5231.210
domestic -1575.8005 451.991 -3.486 0.000 -2461.961 -689.640
export -3569.9260 182.754 -19.534 0.000 -3928.227 -3211.625
import -180.7444 132.801 -1.361 0.174 -441.111 79.622
price_Thailand 472.4395 371.932 1.270 0.204 -256.759 1201.638
price_US 4984.6402 287.213 17.355 0.000 4421.540 5547.741
temperature 536.2858 231.602 2.316 0.021 82.214 990.357
precipitation 106.0556 113.581 0.934 0.350 -116.628 328.739
relative_humidity -747.6154 205.759 -3.633 0.000 -1151.021 -344.210
total_insolation -95.2002 188.796 -0.504 0.614 -465.348 274.947
maximum_snow_depth -104.5966 115.659 -0.904 0.366 -331.355 122.162
==============================================================================
Omnibus: 11.499 Durbin-Watson: 1.961
Prob(Omnibus): 0.003 Jarque-Bera (JB): 12.347
Skew: 0.093 Prob(JB): 0.00208
Kurtosis: 3.202 Cond. No. 792.
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
def Stepwise_model(X,y):
X['const'] = np.ones([len(train_x.iloc[:,0]), 1])
Stepmodels = pd.DataFrame(columns = ["AIC","model"])
tic = time.time()
predictors = []
Smodel_before = processSubset(X,y,predictors + ['const'])['AIC']
# 변수 1~10개 0-9 -> 1-10
for i in range(1,len(X.columns.difference(['const']))+1) :
Forward_result = forward(X=X,y=y,predictors = predictors) # constant added
print('forward')
Stepmodels.loc[i] = Forward_result
predictors = Stepmodels.loc[i]['model'].model.exog_names
Backward_result = backward(X=X,y=y,predictors = predictors)
if Backward_result["AIC"] < Forward_result["AIC"]:
Stepmodels.loc[i] = Backward_result
predictors = Stepmodels.loc[i]["model"].model.exog_names
Smodel_before = Stepmodels.loc[i]["AIC"]
predictors = [k for k in predictors if k != "const"]
print('backward')
if Stepmodels.loc[i]["AIC"] > Smodel_before:
break
else :
Smodel_before = Stepmodels.loc[i]["AIC"]
toc = time.time()
print("Total elapsed time:",(toc-tic),"seconds.")
return (Stepmodels["model"][len(Stepmodels["model"])])
Stepwise_best_model = Stepwise_model(X=train_X_scaled, y=train_y.astype(float))
print(Stepwise_best_model.summary())
Processed 29 models on 1 predictors in 0.07729840278625488
Selected predictors: ['other_goods_services', 'const'] AIC: 78105.37952774977
forward
Processed 2 models on 1 predictors in 0.009069442749023438
Selected predictors: ['const'] AIC: 79094.61931304072
Processed 28 models on 3 predictors in 0.04531979560852051
Selected predictors: ['other_goods_services', 'const', 'communication', 'const'] AIC: 77579.77371601379
forward
Processed 4 models on 3 predictors in 0.008055448532104492
Selected predictors: ['other_goods_services', 'const', 'communication'] AIC: 77576.98363868354
backward
Processed 27 models on 3 predictors in 0.0346071720123291
Selected predictors: ['other_goods_services', 'communication', 'world_Production_per_cultivation_area', 'const'] AIC: 75864.84417359797
forward
Processed 4 models on 3 predictors in 0.007726430892944336
Selected predictors: ['communication', 'world_Production_per_cultivation_area', 'const'] AIC: 76096.07069711725
Processed 26 models on 5 predictors in 0.042700767517089844
Selected predictors: ['other_goods_services', 'communication', 'world_Production_per_cultivation_area', 'const', 'export', 'const'] AIC: 75515.1633562185
forward
Processed 6 models on 5 predictors in 0.011532783508300781
Selected predictors: ['other_goods_services', 'communication', 'world_Production_per_cultivation_area', 'const', 'export'] AIC: 75497.85461844032
backward
Processed 25 models on 5 predictors in 0.0642092227935791
Selected predictors: ['other_goods_services', 'communication', 'world_Production_per_cultivation_area', 'export', 'total_Production_per_cultivation_area', 'const'] AIC: 75062.97568502938
forward
Processed 6 models on 5 predictors in 0.018305540084838867
Selected predictors: ['communication', 'world_Production_per_cultivation_area', 'export', 'total_Production_per_cultivation_area', 'const'] AIC: 75259.1335563535
Processed 24 models on 7 predictors in 0.05551290512084961
Selected predictors: ['other_goods_services', 'communication', 'world_Production_per_cultivation_area', 'export', 'total_Production_per_cultivation_area', 'const', 'Liquor_tobacco', 'const'] AIC: 74516.38066822974
forward
Processed 8 models on 7 predictors in 0.014393806457519531
Selected predictors: ['other_goods_services', 'communication', 'world_Production_per_cultivation_area', 'export', 'total_Production_per_cultivation_area', 'const', 'Liquor_tobacco'] AIC: 74514.38628771537
backward
Processed 23 models on 7 predictors in 0.03779864311218262
Selected predictors: ['other_goods_services', 'communication', 'world_Production_per_cultivation_area', 'export', 'total_Production_per_cultivation_area', 'Liquor_tobacco', 'food_non_alcoholic_beverages', 'const'] AIC: 74119.18186452621
forward
Processed 8 models on 7 predictors in 0.01401972770690918
Selected predictors: ['other_goods_services', 'communication', 'world_Production_per_cultivation_area', 'export', 'total_Production_per_cultivation_area', 'Liquor_tobacco', 'const'] AIC: 74514.38628771537
Processed 22 models on 9 predictors in 0.06651806831359863
Selected predictors: ['other_goods_services', 'communication', 'world_Production_per_cultivation_area', 'export', 'total_Production_per_cultivation_area', 'Liquor_tobacco', 'food_non_alcoholic_beverages', 'const', 'traffic', 'const'] AIC: 73582.80712896092
forward
Processed 10 models on 9 predictors in 0.059084415435791016
Selected predictors: ['other_goods_services', 'communication', 'world_Production_per_cultivation_area', 'export', 'total_Production_per_cultivation_area', 'Liquor_tobacco', 'food_non_alcoholic_beverages', 'const', 'traffic'] AIC: 73582.80712896092
Processed 21 models on 11 predictors in 0.13430166244506836
Selected predictors: ['other_goods_services', 'communication', 'world_Production_per_cultivation_area', 'export', 'total_Production_per_cultivation_area', 'Liquor_tobacco', 'food_non_alcoholic_beverages', 'const', 'traffic', 'const', 'price_US', 'const'] AIC: 73329.91309175434
forward
Processed 12 models on 11 predictors in 0.07312250137329102
Selected predictors: ['other_goods_services', 'communication', 'world_Production_per_cultivation_area', 'export', 'total_Production_per_cultivation_area', 'Liquor_tobacco', 'food_non_alcoholic_beverages', 'const', 'traffic', 'const', 'price_US'] AIC: 73305.16193314802
backward
Processed 20 models on 10 predictors in 0.11478185653686523
Selected predictors: ['other_goods_services', 'communication', 'world_Production_per_cultivation_area', 'export', 'total_Production_per_cultivation_area', 'Liquor_tobacco', 'food_non_alcoholic_beverages', 'traffic', 'price_US', 'housing_water_electricity_fuel', 'const'] AIC: 73172.45689465525
forward
Processed 11 models on 10 predictors in 0.05933666229248047
Selected predictors: ['other_goods_services', 'communication', 'world_Production_per_cultivation_area', 'export', 'total_Production_per_cultivation_area', 'Liquor_tobacco', 'food_non_alcoholic_beverages', 'traffic', 'price_US', 'const'] AIC: 73300.80603019353
Processed 19 models on 12 predictors in 0.12869644165039062
Selected predictors: ['other_goods_services', 'communication', 'world_Production_per_cultivation_area', 'export', 'total_Production_per_cultivation_area', 'Liquor_tobacco', 'food_non_alcoholic_beverages', 'traffic', 'price_US', 'housing_water_electricity_fuel', 'const', 'household_goods_housekeeping_services', 'const'] AIC: 73007.98393391632
forward
Processed 13 models on 12 predictors in 0.04625058174133301
Selected predictors: ['other_goods_services', 'communication', 'world_Production_per_cultivation_area', 'export', 'total_Production_per_cultivation_area', 'Liquor_tobacco', 'food_non_alcoholic_beverages', 'traffic', 'price_US', 'housing_water_electricity_fuel', 'const', 'household_goods_housekeeping_services'] AIC: 73007.98393391632
Processed 18 models on 14 predictors in 0.07900333404541016
Selected predictors: ['other_goods_services', 'communication', 'world_Production_per_cultivation_area', 'export', 'total_Production_per_cultivation_area', 'Liquor_tobacco', 'food_non_alcoholic_beverages', 'traffic', 'price_US', 'housing_water_electricity_fuel', 'const', 'household_goods_housekeeping_services', 'const', 'education', 'const'] AIC: 72605.31090800592
forward
Processed 15 models on 14 predictors in 0.0642538070678711
Selected predictors: ['other_goods_services', 'communication', 'world_Production_per_cultivation_area', 'export', 'total_Production_per_cultivation_area', 'Liquor_tobacco', 'food_non_alcoholic_beverages', 'traffic', 'price_US', 'housing_water_electricity_fuel', 'const', 'household_goods_housekeeping_services', 'const', 'education'] AIC: 72600.22749551709
backward
Processed 17 models on 13 predictors in 0.08031606674194336
Selected predictors: ['other_goods_services', 'communication', 'world_Production_per_cultivation_area', 'export', 'total_Production_per_cultivation_area', 'Liquor_tobacco', 'food_non_alcoholic_beverages', 'traffic', 'price_US', 'housing_water_electricity_fuel', 'household_goods_housekeeping_services', 'education', 'sanitation', 'const'] AIC: 72213.47795000618
forward
Processed 14 models on 13 predictors in 0.046999216079711914
Selected predictors: ['other_goods_services', 'communication', 'world_Production_per_cultivation_area', 'export', 'total_Production_per_cultivation_area', 'Liquor_tobacco', 'food_non_alcoholic_beverages', 'traffic', 'housing_water_electricity_fuel', 'household_goods_housekeeping_services', 'education', 'sanitation', 'const'] AIC: 72364.01074269338
Processed 16 models on 15 predictors in 0.06592655181884766
Selected predictors: ['other_goods_services', 'communication', 'world_Production_per_cultivation_area', 'export', 'total_Production_per_cultivation_area', 'Liquor_tobacco', 'food_non_alcoholic_beverages', 'traffic', 'price_US', 'housing_water_electricity_fuel', 'household_goods_housekeeping_services', 'education', 'sanitation', 'const', 'consumption', 'const'] AIC: 72019.7168131776
forward
Processed 16 models on 15 predictors in 0.06621241569519043
Selected predictors: ['other_goods_services', 'communication', 'world_Production_per_cultivation_area', 'export', 'total_Production_per_cultivation_area', 'Liquor_tobacco', 'food_non_alcoholic_beverages', 'traffic', 'price_US', 'housing_water_electricity_fuel', 'household_goods_housekeeping_services', 'education', 'sanitation', 'const', 'consumption'] AIC: 72017.86210961561
backward
Processed 15 models on 15 predictors in 0.12184715270996094
Selected predictors: ['other_goods_services', 'communication', 'world_Production_per_cultivation_area', 'export', 'total_Production_per_cultivation_area', 'Liquor_tobacco', 'food_non_alcoholic_beverages', 'traffic', 'price_US', 'housing_water_electricity_fuel', 'household_goods_housekeeping_services', 'education', 'sanitation', 'consumption', 'consumer_price_growth_rate', 'const'] AIC: 71763.06656374168
forward
Processed 16 models on 15 predictors in 0.06098508834838867
Selected predictors: ['other_goods_services', 'communication', 'world_Production_per_cultivation_area', 'export', 'total_Production_per_cultivation_area', 'Liquor_tobacco', 'traffic', 'price_US', 'housing_water_electricity_fuel', 'household_goods_housekeeping_services', 'education', 'sanitation', 'consumption', 'consumer_price_growth_rate', 'const'] AIC: 71820.33985690365
Processed 14 models on 17 predictors in 0.07139873504638672
Selected predictors: ['other_goods_services', 'communication', 'world_Production_per_cultivation_area', 'export', 'total_Production_per_cultivation_area', 'Liquor_tobacco', 'food_non_alcoholic_beverages', 'traffic', 'price_US', 'housing_water_electricity_fuel', 'household_goods_housekeeping_services', 'education', 'sanitation', 'consumption', 'consumer_price_growth_rate', 'const', 'WTI', 'const'] AIC: 71545.01099081835
forward
Processed 18 models on 17 predictors in 0.0866250991821289
Selected predictors: ['other_goods_services', 'communication', 'world_Production_per_cultivation_area', 'export', 'total_Production_per_cultivation_area', 'Liquor_tobacco', 'food_non_alcoholic_beverages', 'traffic', 'price_US', 'housing_water_electricity_fuel', 'household_goods_housekeeping_services', 'education', 'sanitation', 'consumption', 'consumer_price_growth_rate', 'const', 'WTI'] AIC: 71545.01099081835
Processed 13 models on 19 predictors in 0.11402368545532227
Selected predictors: ['other_goods_services', 'communication', 'world_Production_per_cultivation_area', 'export', 'total_Production_per_cultivation_area', 'Liquor_tobacco', 'food_non_alcoholic_beverages', 'traffic', 'price_US', 'housing_water_electricity_fuel', 'household_goods_housekeeping_services', 'education', 'sanitation', 'consumption', 'consumer_price_growth_rate', 'const', 'WTI', 'const', 'food_accommodation', 'const'] AIC: 71424.94560131917
forward
Processed 20 models on 19 predictors in 0.15393614768981934
Selected predictors: ['other_goods_services', 'communication', 'world_Production_per_cultivation_area', 'export', 'total_Production_per_cultivation_area', 'Liquor_tobacco', 'food_non_alcoholic_beverages', 'traffic', 'price_US', 'housing_water_electricity_fuel', 'household_goods_housekeeping_services', 'education', 'sanitation', 'consumption', 'consumer_price_growth_rate', 'const', 'WTI', 'food_accommodation', 'const'] AIC: 71434.07991903853
Processed 12 models on 21 predictors in 0.09138703346252441
Selected predictors: ['other_goods_services', 'communication', 'world_Production_per_cultivation_area', 'export', 'total_Production_per_cultivation_area', 'Liquor_tobacco', 'food_non_alcoholic_beverages', 'traffic', 'price_US', 'housing_water_electricity_fuel', 'household_goods_housekeeping_services', 'education', 'sanitation', 'consumption', 'consumer_price_growth_rate', 'const', 'WTI', 'const', 'food_accommodation', 'const', 'entertainment_culture', 'const'] AIC: 71227.49684651385
forward
Processed 22 models on 21 predictors in 0.18970465660095215
Selected predictors: ['other_goods_services', 'communication', 'world_Production_per_cultivation_area', 'export', 'total_Production_per_cultivation_area', 'Liquor_tobacco', 'food_non_alcoholic_beverages', 'traffic', 'price_US', 'housing_water_electricity_fuel', 'household_goods_housekeeping_services', 'education', 'sanitation', 'consumption', 'consumer_price_growth_rate', 'const', 'WTI', 'const', 'food_accommodation', 'const', 'entertainment_culture'] AIC: 71227.49684651385
Processed 11 models on 23 predictors in 0.11341047286987305
Selected predictors: ['other_goods_services', 'communication', 'world_Production_per_cultivation_area', 'export', 'total_Production_per_cultivation_area', 'Liquor_tobacco', 'food_non_alcoholic_beverages', 'traffic', 'price_US', 'housing_water_electricity_fuel', 'household_goods_housekeeping_services', 'education', 'sanitation', 'consumption', 'consumer_price_growth_rate', 'const', 'WTI', 'const', 'food_accommodation', 'const', 'entertainment_culture', 'const', 'clothes_shoes', 'const'] AIC: 70750.19869653613
forward
Processed 24 models on 23 predictors in 0.16450047492980957
Selected predictors: ['other_goods_services', 'communication', 'world_Production_per_cultivation_area', 'export', 'total_Production_per_cultivation_area', 'Liquor_tobacco', 'food_non_alcoholic_beverages', 'traffic', 'price_US', 'housing_water_electricity_fuel', 'household_goods_housekeeping_services', 'education', 'sanitation', 'consumption', 'consumer_price_growth_rate', 'const', 'WTI', 'const', 'food_accommodation', 'const', 'entertainment_culture', 'clothes_shoes', 'const'] AIC: 70711.24559005174
backward
Processed 10 models on 20 predictors in 0.0630350112915039
Selected predictors: ['other_goods_services', 'communication', 'world_Production_per_cultivation_area', 'export', 'total_Production_per_cultivation_area', 'Liquor_tobacco', 'food_non_alcoholic_beverages', 'traffic', 'price_US', 'housing_water_electricity_fuel', 'household_goods_housekeeping_services', 'education', 'sanitation', 'consumption', 'consumer_price_growth_rate', 'WTI', 'food_accommodation', 'entertainment_culture', 'clothes_shoes', 'total_insolation', 'const'] AIC: 70698.91080835368
forward
Processed 21 models on 20 predictors in 0.17449283599853516
Selected predictors: ['other_goods_services', 'communication', 'world_Production_per_cultivation_area', 'export', 'total_Production_per_cultivation_area', 'Liquor_tobacco', 'food_non_alcoholic_beverages', 'traffic', 'price_US', 'housing_water_electricity_fuel', 'household_goods_housekeeping_services', 'education', 'sanitation', 'consumption', 'consumer_price_growth_rate', 'WTI', 'food_accommodation', 'entertainment_culture', 'clothes_shoes', 'const'] AIC: 70710.53444193077
Processed 9 models on 22 predictors in 0.09258270263671875
Selected predictors: ['other_goods_services', 'communication', 'world_Production_per_cultivation_area', 'export', 'total_Production_per_cultivation_area', 'Liquor_tobacco', 'food_non_alcoholic_beverages', 'traffic', 'price_US', 'housing_water_electricity_fuel', 'household_goods_housekeeping_services', 'education', 'sanitation', 'consumption', 'consumer_price_growth_rate', 'WTI', 'food_accommodation', 'entertainment_culture', 'clothes_shoes', 'total_insolation', 'const', 'temperature', 'const'] AIC: 70706.36635754925
forward
Processed 23 models on 22 predictors in 0.14938044548034668
Selected predictors: ['other_goods_services', 'communication', 'world_Production_per_cultivation_area', 'export', 'total_Production_per_cultivation_area', 'Liquor_tobacco', 'food_non_alcoholic_beverages', 'traffic', 'price_US', 'housing_water_electricity_fuel', 'household_goods_housekeeping_services', 'education', 'sanitation', 'consumption', 'consumer_price_growth_rate', 'WTI', 'food_accommodation', 'entertainment_culture', 'clothes_shoes', 'total_insolation', 'const', 'const'] AIC: 70698.9108083537
backward
Processed 9 models on 21 predictors in 0.08931684494018555
Selected predictors: ['other_goods_services', 'communication', 'world_Production_per_cultivation_area', 'export', 'total_Production_per_cultivation_area', 'Liquor_tobacco', 'food_non_alcoholic_beverages', 'traffic', 'price_US', 'housing_water_electricity_fuel', 'household_goods_housekeeping_services', 'education', 'sanitation', 'consumption', 'consumer_price_growth_rate', 'WTI', 'food_accommodation', 'entertainment_culture', 'clothes_shoes', 'total_insolation', 'domestic', 'const'] AIC: 70690.41174407185
forward
Processed 22 models on 21 predictors in 0.17949986457824707
Selected predictors: ['other_goods_services', 'communication', 'world_Production_per_cultivation_area', 'export', 'total_Production_per_cultivation_area', 'Liquor_tobacco', 'food_non_alcoholic_beverages', 'traffic', 'price_US', 'housing_water_electricity_fuel', 'household_goods_housekeeping_services', 'education', 'sanitation', 'consumption', 'consumer_price_growth_rate', 'WTI', 'food_accommodation', 'entertainment_culture', 'clothes_shoes', 'total_insolation', 'const'] AIC: 70698.91080835368
Processed 8 models on 23 predictors in 0.06899857521057129
Selected predictors: ['other_goods_services', 'communication', 'world_Production_per_cultivation_area', 'export', 'total_Production_per_cultivation_area', 'Liquor_tobacco', 'food_non_alcoholic_beverages', 'traffic', 'price_US', 'housing_water_electricity_fuel', 'household_goods_housekeeping_services', 'education', 'sanitation', 'consumption', 'consumer_price_growth_rate', 'WTI', 'food_accommodation', 'entertainment_culture', 'clothes_shoes', 'total_insolation', 'domestic', 'const', 'maximum_snow_depth', 'const'] AIC: 70691.2281311667
forward
Processed 24 models on 23 predictors in 0.23407220840454102
Selected predictors: ['other_goods_services', 'communication', 'world_Production_per_cultivation_area', 'export', 'total_Production_per_cultivation_area', 'Liquor_tobacco', 'food_non_alcoholic_beverages', 'traffic', 'price_US', 'housing_water_electricity_fuel', 'household_goods_housekeeping_services', 'education', 'sanitation', 'consumption', 'consumer_price_growth_rate', 'WTI', 'food_accommodation', 'entertainment_culture', 'clothes_shoes', 'total_insolation', 'domestic', 'const', 'const'] AIC: 70690.41174407185
backward
Processed 8 models on 22 predictors in 0.06008267402648926
Selected predictors: ['other_goods_services', 'communication', 'world_Production_per_cultivation_area', 'export', 'total_Production_per_cultivation_area', 'Liquor_tobacco', 'food_non_alcoholic_beverages', 'traffic', 'price_US', 'housing_water_electricity_fuel', 'household_goods_housekeeping_services', 'education', 'sanitation', 'consumption', 'consumer_price_growth_rate', 'WTI', 'food_accommodation', 'entertainment_culture', 'clothes_shoes', 'total_insolation', 'domestic', 'CPI', 'const'] AIC: 70685.29270576051
forward
Processed 23 models on 22 predictors in 0.19397878646850586
Selected predictors: ['other_goods_services', 'communication', 'world_Production_per_cultivation_area', 'export', 'total_Production_per_cultivation_area', 'Liquor_tobacco', 'food_non_alcoholic_beverages', 'traffic', 'price_US', 'housing_water_electricity_fuel', 'household_goods_housekeeping_services', 'education', 'sanitation', 'consumption', 'consumer_price_growth_rate', 'WTI', 'food_accommodation', 'entertainment_culture', 'clothes_shoes', 'total_insolation', 'domestic', 'const'] AIC: 70690.41174407185
Processed 7 models on 24 predictors in 0.08258676528930664
Selected predictors: ['other_goods_services', 'communication', 'world_Production_per_cultivation_area', 'export', 'total_Production_per_cultivation_area', 'Liquor_tobacco', 'food_non_alcoholic_beverages', 'traffic', 'price_US', 'housing_water_electricity_fuel', 'household_goods_housekeeping_services', 'education', 'sanitation', 'consumption', 'consumer_price_growth_rate', 'WTI', 'food_accommodation', 'entertainment_culture', 'clothes_shoes', 'total_insolation', 'domestic', 'CPI', 'const', 'relative_humidity', 'const'] AIC: 70684.7916953338
forward
Processed 25 models on 24 predictors in 0.16710734367370605
Selected predictors: ['other_goods_services', 'communication', 'world_Production_per_cultivation_area', 'export', 'total_Production_per_cultivation_area', 'Liquor_tobacco', 'food_non_alcoholic_beverages', 'traffic', 'price_US', 'housing_water_electricity_fuel', 'household_goods_housekeeping_services', 'education', 'sanitation', 'consumption', 'consumer_price_growth_rate', 'WTI', 'food_accommodation', 'entertainment_culture', 'clothes_shoes', 'total_insolation', 'domestic', 'CPI', 'const', 'relative_humidity'] AIC: 70684.79169533378
backward
Processed 6 models on 24 predictors in 0.07037067413330078
Selected predictors: ['other_goods_services', 'communication', 'world_Production_per_cultivation_area', 'export', 'total_Production_per_cultivation_area', 'Liquor_tobacco', 'food_non_alcoholic_beverages', 'traffic', 'price_US', 'housing_water_electricity_fuel', 'household_goods_housekeeping_services', 'education', 'sanitation', 'consumption', 'consumer_price_growth_rate', 'WTI', 'food_accommodation', 'entertainment_culture', 'clothes_shoes', 'total_insolation', 'domestic', 'CPI', 'relative_humidity', 'temperature', 'const'] AIC: 70675.7123405445
forward
Processed 25 models on 24 predictors in 0.21991562843322754
Selected predictors: ['other_goods_services', 'communication', 'world_Production_per_cultivation_area', 'export', 'total_Production_per_cultivation_area', 'Liquor_tobacco', 'food_non_alcoholic_beverages', 'traffic', 'price_US', 'housing_water_electricity_fuel', 'household_goods_housekeeping_services', 'education', 'sanitation', 'consumption', 'consumer_price_growth_rate', 'WTI', 'food_accommodation', 'entertainment_culture', 'clothes_shoes', 'domestic', 'CPI', 'relative_humidity', 'temperature', 'const'] AIC: 70674.31050172621
backward
Processed 6 models on 24 predictors in 0.038034677505493164
Selected predictors: ['other_goods_services', 'communication', 'world_Production_per_cultivation_area', 'export', 'total_Production_per_cultivation_area', 'Liquor_tobacco', 'food_non_alcoholic_beverages', 'traffic', 'price_US', 'housing_water_electricity_fuel', 'household_goods_housekeeping_services', 'education', 'sanitation', 'consumption', 'consumer_price_growth_rate', 'WTI', 'food_accommodation', 'entertainment_culture', 'clothes_shoes', 'domestic', 'CPI', 'relative_humidity', 'temperature', 'import', 'const'] AIC: 70674.1753192839
forward
Processed 25 models on 24 predictors in 0.14841771125793457
Selected predictors: ['other_goods_services', 'communication', 'world_Production_per_cultivation_area', 'export', 'total_Production_per_cultivation_area', 'Liquor_tobacco', 'food_non_alcoholic_beverages', 'traffic', 'price_US', 'housing_water_electricity_fuel', 'household_goods_housekeeping_services', 'education', 'sanitation', 'consumption', 'consumer_price_growth_rate', 'WTI', 'food_accommodation', 'entertainment_culture', 'clothes_shoes', 'domestic', 'CPI', 'relative_humidity', 'temperature', 'const'] AIC: 70674.31050172621
Processed 5 models on 26 predictors in 0.03907918930053711
Selected predictors: ['other_goods_services', 'communication', 'world_Production_per_cultivation_area', 'export', 'total_Production_per_cultivation_area', 'Liquor_tobacco', 'food_non_alcoholic_beverages', 'traffic', 'price_US', 'housing_water_electricity_fuel', 'household_goods_housekeeping_services', 'education', 'sanitation', 'consumption', 'consumer_price_growth_rate', 'WTI', 'food_accommodation', 'entertainment_culture', 'clothes_shoes', 'domestic', 'CPI', 'relative_humidity', 'temperature', 'import', 'const', 'price_Thailand', 'const'] AIC: 70675.18003619988
forward
Processed 27 models on 26 predictors in 0.20505881309509277
Selected predictors: ['other_goods_services', 'communication', 'world_Production_per_cultivation_area', 'export', 'total_Production_per_cultivation_area', 'Liquor_tobacco', 'food_non_alcoholic_beverages', 'traffic', 'price_US', 'housing_water_electricity_fuel', 'household_goods_housekeeping_services', 'education', 'sanitation', 'consumption', 'consumer_price_growth_rate', 'WTI', 'food_accommodation', 'entertainment_culture', 'clothes_shoes', 'domestic', 'CPI', 'relative_humidity', 'temperature', 'import', 'const', 'const'] AIC: 70674.1753192839
backward
Processed 5 models on 25 predictors in 0.04565286636352539
Selected predictors: ['other_goods_services', 'communication', 'world_Production_per_cultivation_area', 'export', 'total_Production_per_cultivation_area', 'Liquor_tobacco', 'food_non_alcoholic_beverages', 'traffic', 'price_US', 'housing_water_electricity_fuel', 'household_goods_housekeeping_services', 'education', 'sanitation', 'consumption', 'consumer_price_growth_rate', 'WTI', 'food_accommodation', 'entertainment_culture', 'clothes_shoes', 'domestic', 'CPI', 'relative_humidity', 'temperature', 'import', 'price_Thailand', 'const'] AIC: 70674.5389454217
forward
Processed 26 models on 25 predictors in 0.1743621826171875
Selected predictors: ['other_goods_services', 'communication', 'world_Production_per_cultivation_area', 'export', 'total_Production_per_cultivation_area', 'Liquor_tobacco', 'food_non_alcoholic_beverages', 'traffic', 'price_US', 'housing_water_electricity_fuel', 'household_goods_housekeeping_services', 'education', 'sanitation', 'consumption', 'consumer_price_growth_rate', 'WTI', 'food_accommodation', 'entertainment_culture', 'clothes_shoes', 'domestic', 'CPI', 'relative_humidity', 'temperature', 'import', 'const'] AIC: 70674.1753192839
backward
Total elapsed time: 5.425881624221802 seconds.
OLS Regression Results
==============================================================================
Dep. Variable: avg_price R-squared: 0.883
Model: OLS Adj. R-squared: 0.882
Method: Least Squares F-statistic: 1233.
Date: Fri, 17 Dec 2021 Prob (F-statistic): 0.00
Time: 18:29:43 Log-Likelihood: -35312.
No. Observations: 3948 AIC: 7.067e+04
Df Residuals: 3923 BIC: 7.083e+04
Df Model: 24
Covariance Type: nonrobust
=========================================================================================================
coef std err t P>|t| [0.025 0.975]
---------------------------------------------------------------------------------------------------------
other_goods_services -1.218e+04 238.654 -51.049 0.000 -1.27e+04 -1.17e+04
communication -1.777e+04 277.125 -64.109 0.000 -1.83e+04 -1.72e+04
world_Production_per_cultivation_area 2.738e+04 829.975 32.988 0.000 2.58e+04 2.9e+04
export -3565.4222 182.114 -19.578 0.000 -3922.470 -3208.374
total_Production_per_cultivation_area 7500.6925 372.667 20.127 0.000 6770.054 8231.331
Liquor_tobacco -4647.5543 204.183 -22.762 0.000 -5047.870 -4247.239
food_non_alcoholic_beverages -1.414e+04 646.907 -21.855 0.000 -1.54e+04 -1.29e+04
traffic -2.376e+04 559.111 -42.488 0.000 -2.49e+04 -2.27e+04
price_US 5109.2013 268.052 19.060 0.000 4583.666 5634.737
housing_water_electricity_fuel 4635.7578 306.827 15.109 0.000 4034.202 5237.313
household_goods_housekeeping_services -1.77e+04 369.089 -47.945 0.000 -1.84e+04 -1.7e+04
education 1.042e+04 588.059 17.714 0.000 9263.835 1.16e+04
sanitation 3491.3050 211.139 16.536 0.000 3077.352 3905.258
consumption -2.481e+04 4277.795 -5.800 0.000 -3.32e+04 -1.64e+04
consumer_price_growth_rate 4.304e+04 1447.174 29.741 0.000 4.02e+04 4.59e+04
WTI 4474.2457 409.616 10.923 0.000 3671.166 5277.326
food_accommodation 9369.7159 353.455 26.509 0.000 8676.744 1.01e+04
entertainment_culture -8427.9612 306.217 -27.523 0.000 -9028.320 -7827.603
clothes_shoes 6885.4360 327.871 21.000 0.000 6242.623 7528.249
domestic -1480.8882 445.444 -3.325 0.001 -2354.212 -607.564
CPI -1.227e+04 4691.092 -2.615 0.009 -2.15e+04 -3069.355
relative_humidity -607.5714 126.176 -4.815 0.000 -854.947 -360.195
temperature 507.5266 130.649 3.885 0.000 251.380 763.673
import -192.3362 132.027 -1.457 0.145 -451.185 66.513
const 5.108e+04 4520.603 11.299 0.000 4.22e+04 5.99e+04
==============================================================================
Omnibus: 10.190 Durbin-Watson: 1.964
Prob(Omnibus): 0.006 Jarque-Bera (JB): 10.772
Skew: 0.090 Prob(JB): 0.00458
Kurtosis: 3.182 Cond. No. 703.
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
# 모델에 의해 예측된/추정된 값 = test_y
pred_y_full = fitted_full_model.predict(test_X_scaled)
pred_y_forward = Forward_best_model.predict(test_X_scaled[Forward_best_model.model.exog_names])
pred_y_backward = Backward_best_model.predict(test_X_scaled[Backward_best_model.model.exog_names])
pred_y_stepwise = Stepwise_best_model.predict(test_X_scaled[Stepwise_best_model.model.exog_names])
# MSE, RMSE, MAE, MAPE 4가지 지표를 통해 예측성능을 확인할 예정
perf_mat = pd.DataFrame(columns=["ALL", "FORWARD", "BACKWARD", "STEPWISE"],index =['MSE', 'RMSE','MAE', 'MAPE', 'nParams', 'AIC', 'R2 for Training','R2 for Test'])
# MAPE의 함수
def mean_absolute_percentage_error(y_true, y_pred):
y_true, y_pred = np.array(y_true), np.array(y_pred)
return np.mean(np.abs((y_true - y_pred) / y_true)) * 100
# 성능지표
perf_mat.loc['MSE']['ALL'] = metrics.mean_squared_error(test_y,pred_y_full)
perf_mat.loc['MSE']['FORWARD'] = metrics.mean_squared_error(test_y,pred_y_forward)
perf_mat.loc['MSE']['BACKWARD'] = metrics.mean_squared_error(test_y,pred_y_backward)
perf_mat.loc['MSE']['STEPWISE'] = metrics.mean_squared_error(test_y,pred_y_stepwise)
perf_mat.loc['RMSE']['ALL'] = np.sqrt(metrics.mean_squared_error(test_y, pred_y_full))
perf_mat.loc['RMSE']['FORWARD'] = np.sqrt(metrics.mean_squared_error(test_y, pred_y_forward))
perf_mat.loc['RMSE']['BACKWARD'] = np.sqrt(metrics.mean_squared_error(test_y, pred_y_backward))
perf_mat.loc['RMSE']['STEPWISE'] = np.sqrt(metrics.mean_squared_error(test_y, pred_y_stepwise))
perf_mat.loc['MAE']['ALL'] = metrics.mean_absolute_error(test_y, pred_y_full)
perf_mat.loc['MAE']['FORWARD'] = metrics.mean_absolute_error(test_y, pred_y_forward)
perf_mat.loc['MAE']['BACKWARD'] = metrics.mean_absolute_error(test_y, pred_y_backward)
perf_mat.loc['MAE']['STEPWISE'] = metrics.mean_absolute_error(test_y, pred_y_stepwise)
perf_mat.loc['MAPE']['ALL'] = mean_absolute_percentage_error(test_y, pred_y_full)
perf_mat.loc['MAPE']['FORWARD'] = mean_absolute_percentage_error(test_y, pred_y_forward)
perf_mat.loc['MAPE']['BACKWARD'] = mean_absolute_percentage_error(test_y, pred_y_backward)
perf_mat.loc['MAPE']['STEPWISE'] = mean_absolute_percentage_error(test_y, pred_y_stepwise)
perf_mat.loc['nParams']['ALL'] = fitted_full_model.params.shape
perf_mat.loc['nParams']['FORWARD'] = Forward_best_model.params.shape
perf_mat.loc['nParams']['BACKWARD'] = Backward_best_model.params.shape
perf_mat.loc['nParams']['STEPWISE'] = Stepwise_best_model.params.shape
perf_mat.loc['AIC']['ALL'] = fitted_full_model.aic
perf_mat.loc['AIC']['FORWARD'] = Forward_best_model.aic
perf_mat.loc['AIC']['BACKWARD'] = Backward_best_model.aic
perf_mat.loc['AIC']['STEPWISE'] = Stepwise_best_model.aic
perf_mat.loc['R2 for Training']['ALL'] = fitted_full_model.rsquared_adj
perf_mat.loc['R2 for Training']['FORWARD'] = Forward_best_model.rsquared_adj
perf_mat.loc['R2 for Training']['BACKWARD'] = Backward_best_model.rsquared_adj
perf_mat.loc['R2 for Training']['STEPWISE'] = Stepwise_best_model.rsquared_adj
perf_mat.loc['R2 for Test']['ALL'] = r2_score(test_y, pred_y_full)
perf_mat.loc['R2 for Test']['FORWARD'] = r2_score(test_y, pred_y_forward)
perf_mat.loc['R2 for Test']['BACKWARD'] = r2_score(test_y, pred_y_backward)
perf_mat.loc['R2 for Test']['STEPWISE'] = r2_score(test_y, pred_y_stepwise)
perf_mat
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
</style>
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
ALL | FORWARD | BACKWARD | STEPWISE | |
---|---|---|---|---|
MSE | 3739019.455251 | 2609601481.047268 | 2856746960.099178 | 2604864530.586144 |
RMSE | 1933.65443 | 51084.258642 | 53448.544976 | 51037.873492 |
MAE | 1560.6847 | 51048.437127 | 53414.346131 | 51002.031696 |
MAPE | 3.886255 | 125.225567 | 131.039202 | 125.111652 |
nParams | (30,) | (26,) | (29,) | (25,) |
AIC | 70796.141107 | 70675.553694 | 70678.421245 | 70674.175319 |
R2 for Training | 0.997976 | 0.882204 | 0.882207 | 0.882215 |
R2 for Test | 0.868563 | -90.734963 | -99.422834 | -90.568445 |
plt.figure(figsize=(6,6))
plt.scatter(test_y, fitted_full_model.predict(test_X_scaled), alpha=0.4)
plt.xlabel("Actual")
plt.ylabel("Predicted")
plt.title("Regression:: Compare Actual vs. Model Prediction")
plt.axline([30000,30000],[60000,60000], color='black', linewidth=1,linestyle='--',alpha=1)
plt.xlim([30000, 60000])
plt.ylim([30000, 60000])
plt.show()
# # x축은 실제 가격, y축은 예측가격이다. 만약 정답을 맞춘다면 정확히 선으로 일치되어 나올 거다.
plt.figure(figsize=(12,6))
plt.title("Regression:: Price Prediction")
plt.plot(y,'b' ,label = 'actual price')
# plt.plot(y,'b',label = 'actual price',lw=8,alpha=0.15)
plt.plot(fitted_full_model.predict(pd.DataFrame(scaler.transform(X),columns=X.columns,index=X.index)),'r' ,label = 'predicted price by the model')
plt.legend()
plt.grid(alpha=0.3)
plt.show()
OLS_residual2 = np.array(np.array(y) - np.array(fitted_full_model.predict(pd.DataFrame(scaler.transform(X),columns=X.columns,index=X.index))))
# 검증데이터에 대한 예측
# 예측데이터 잔차
plt.figure(figsize=(12,6))
plt.title("Regression:: Prediction error 예측오차(aka. Residual)")
plt.plot(OLS_residual2[3948:],'r')
plt.ylim([-6000, 6000])
plt.axhline(y=0, color='black', linewidth=1,linestyle='--',alpha=0.8)
# plt.grid(alpha=0.3)
plt.show()
# rf_model1
# RandomForestRegressor 무작위 분할
X = df2.iloc[:,1:]
y = df2.avg_price
def rfmodel(seed):
train_x, test_x, train_y, test_y = train_test_split(X,y, train_size = 0.7, test_size = 0.3, random_state=seed)
#학습
scaler = MinMaxScaler()
scaler.fit(train_x)
train_X_scaled = pd.DataFrame(scaler.transform(train_x),columns=train_x.columns,index=train_x.index)
test_X_scaled = pd.DataFrame(scaler.transform(test_x),columns=test_x.columns,index=test_x.index)
rf_model1 = RandomForestRegressor(n_estimators = 200,max_depth=12)
rf_model1.fit(train_X_scaled, train_y)
y_pred = rf_model1.predict(test_X_scaled)
return r2_score(np.array(test_y),np.array(y_pred.astype(int)))
test2 = []
for i in range(10):
test2.append(rfmodel(i))
test2
[0.9979956503342118,
0.9978098231150424,
0.9979941096306769,
0.9979722796702802,
0.9975907629417546,
0.9981565860117012,
0.9984285949865582,
0.9982498195104308,
0.9980938308139637,
0.997809918264121]
# rf_model1
# RandomForestRegressor 무작위 분할
X = df2.iloc[:,1:]
y = df2.avg_price
train_x, test_x, train_y, test_y = train_test_split(X,y, train_size = 0.7, test_size = 0.3)
#학습
scaler = MinMaxScaler()
scaler.fit(train_x)
train_X_scaled = pd.DataFrame(scaler.transform(train_x),columns=train_x.columns,index=train_x.index)
test_X_scaled = pd.DataFrame(scaler.transform(test_x),columns=test_x.columns,index=test_x.index)
rf_model1 = RandomForestRegressor(n_estimators = 200,max_depth=12)
rf_model1.fit(train_X_scaled, train_y)
y_pred = rf_model1.predict(test_X_scaled)
result = pd.DataFrame({'pred' : y_pred, 'real' : test_y})
test_X_scaled1 = test_X_scaled
test_y1 = test_y
print(f'Max Depth: {rf_model1.estimators_[0].tree_.max_depth}')
print(f'R2: {r2_score(np.array(test_y),np.array(y_pred.astype(int)))}')
print(f'RMSE: {np.sqrt(metrics.mean_squared_error(test_y, y_pred))}')
plt.figure(figsize=(6,6))
plt.scatter(np.array(test_y), np.array(y_pred),alpha=0.4)
plt.xlabel("Actual")
plt.ylabel("Predicted")
plt.axline([30000,30000],[60000,60000], color='black', linewidth=1,linestyle='--',alpha=1)
plt.xlim([30000, 60000])
plt.ylim([30000, 60000])
plt.title("RandomForestRegressor:: Compare Actual vs. Model Prediction")
plt.show()
Max Depth: 12
R2: 0.9979968414168353
RMSE: 243.65379367422867
plt.figure(figsize=(12,6))
plt.title("RandomForestRegressor:: Price Prediction", fontsize=15)
plt.plot(y,'b',label = 'actual price',alpha=1,lw=3)
# plt.plot(y,'b',label = 'actual price',lw=8,alpha=0.1)
plt.plot(rf_model1.predict(pd.DataFrame(scaler.transform(X),columns=X.columns,index=X.index)),'r',label = 'predicted price by the model',lw=3)
plt.legend()
plt.grid(alpha=0.3)
plt.show()
RFR_residual = np.array(np.array(y) - np.array(rf_model1.predict(pd.DataFrame(scaler.transform(X),columns=X.columns,index=X.index))))
plt.figure(figsize=(12,6))
plt.title("RandomForestRegressor:: Prediction error 예측오차(aka. Residual)")
plt.plot(RFR_residual,'r')
plt.ylim([-6000, 6000])
plt.axhline(y=0, color='black', linewidth=1,linestyle='--',alpha=1)
plt.show()
plt.figure(figsize=(20,20))
_ = tree.plot_tree(rf_model1.estimators_[0], feature_names=X.columns, filled=True)
# 모델에 의해 예측된/추정된 값
y_pred1 = rf_model1.predict(test_X_scaled1)
perf_mat = pd.DataFrame(columns=["RF_Regressor"],index =['MSE', 'RMSE', 'MAE', 'R2'])
# 성능지표
perf_mat.loc['MSE']['RF_Regressor'] = metrics.mean_squared_error(test_y1,y_pred1)
perf_mat.loc['RMSE']['RF_Regressor'] = np.sqrt(metrics.mean_squared_error(test_y1, y_pred1))
perf_mat.loc['MAE']['RF_Regressor'] = metrics.mean_absolute_error(test_y1, y_pred1)
perf_mat.loc['R2']['RF_Regressor'] = r2_score(np.array(test_y1),np.array(y_pred1.astype(int)))
perf_mat
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
</style>
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
RF_Regressor | |
---|---|
MSE | 61109.041195 |
RMSE | 247.20243 |
MAE | 110.905475 |
R2 | 0.997868 |
# Feature Importance
def plot_feature_importance(model):
n_features = train_X_scaled.shape[1]
plt.figure(figsize=(12,6))
plt.barh(np.arange(n_features), model.feature_importances_, align="center")
plt.yticks(np.arange(n_features), train_X_scaled.columns)
plt.title("RandomForestClassifier:: Feature Importance")
plt.xlabel("Random Forest Feature Importance")
plt.ylabel("Feature")
plt.ylim(-1, n_features)
plot_feature_importance(rf_model1)
perm = PermutationImportance(rf_model1,random_state=1).fit(test_X_scaled1,test_y1)
eli5.show_weights(perm, top = 29,feature_names = test_X_scaled1.columns.tolist())
Weight Feature
0.2999 ± 0.0277 housing_water_electricity_fuel
0.1934 ± 0.0087 consumption
0.1807 ± 0.0065 world_Production_per_cultivation_area
0.1203 ± 0.0109 traffic
0.0771 ± 0.0060 CPI
0.0719 ± 0.0063 price_US
0.0499 ± 0.0042 household_goods_housekeeping_services
0.0445 ± 0.0042 price_Thailand
0.0305 ± 0.0026 other_goods_services
0.0205 ± 0.0026 domestic
0.0102 ± 0.0011 export
0.0100 ± 0.0016 temperature
0.0096 ± 0.0014 WTI
0.0076 ± 0.0007 import
0.0046 ± 0.0005 education
0.0027 ± 0.0002 consumer_price_growth_rate
0.0018 ± 0.0002 total_Production_per_cultivation_area
0.0018 ± 0.0001 communication
0.0015 ± 0.0002 entertainment_culture
0.0012 ± 0.0002 Liquor_tobacco
0.0004 ± 0.0001 sanitation
0.0004 ± 0.0001 food_non_alcoholic_beverages
0.0004 ± 0.0001 total_insolation
0.0001 ± 0.0000 food_accommodation
0.0000 ± 0.0000 clothes_shoes
0.0000 ± 0.0000 precipitation
0.0000 ± 0.0000 maximum_snow_depth
0.0000 ± 0.0000 relative_humidity
0.0000 ± 0.0001 wind_speed
# AIC 기준을 사용하여 데이터의 stationarity를 찾기 위해 ADF(Advanced Dickey-Fuller test)와 같은 통계적 테스트를 수행
import statsmodels.api as sm
from statsmodels.tsa.api import VAR
from statsmodels.tsa.stattools import adfuller
var_df=df3#pd.read_csv("data/rice_df2.csv")
var_df['date']=pd.to_datetime(var_df['date'])
var_df.set_index('date',inplace=True)
# var_df = var_df[['avg_price','consumption', 'CPI','world_Production_per_cultivation_area']]
var_list = list(var_df.columns)
<ipython-input-12-84322e100b01>:7: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
var_df['date']=pd.to_datetime(var_df['date'])
var_df
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
</style>
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
avg_price | consumption | CPI | consumer_price_growth_rate | housing_water_electricity_fuel | world_Production_per_cultivation_area | traffic | price_US | price_Thailand | temperature | |
---|---|---|---|---|---|---|---|---|---|---|
date | ||||||||||
2000-01-03 | 41900 | 93.600 | 66.6 | 2.3 | 3.7 | 2.626257 | 4.5 | 0.499000 | 247.500000 | 0.692857 |
2000-01-04 | 41900 | 93.600 | 66.6 | 2.3 | 3.7 | 2.626257 | 4.5 | 0.499000 | 247.500000 | 1.428571 |
2000-01-05 | 41800 | 93.600 | 66.6 | 2.3 | 3.7 | 2.626257 | 4.5 | 0.499000 | 247.500000 | 5.642857 |
2000-01-06 | 41800 | 93.600 | 66.6 | 2.3 | 3.7 | 2.626257 | 4.5 | 0.499000 | 247.500000 | 5.171429 |
2000-01-07 | 41900 | 93.600 | 66.6 | 2.3 | 3.7 | 2.626257 | 4.5 | 0.499000 | 247.500000 | -3.750000 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
2021-11-05 | 54020 | 52.609 | 105.4 | 0.5 | 0.5 | 3.118175 | -1.8 | 0.745517 | 496.120606 | 12.220000 |
2021-11-08 | 53880 | 52.609 | 105.4 | 0.5 | 0.5 | 3.118175 | -1.8 | 0.745517 | 496.120606 | 9.786667 |
2021-11-09 | 53880 | 52.609 | 105.4 | 0.5 | 0.5 | 3.118175 | -1.8 | 0.745517 | 496.120606 | 7.020000 |
2021-11-10 | 53880 | 52.609 | 105.4 | 0.5 | 0.5 | 3.118175 | -1.8 | 0.745517 | 496.120606 | 6.453333 |
2021-11-11 | 53880 | 52.609 | 105.4 | 0.5 | 0.5 | 3.118175 | -1.8 | 0.745517 | 496.120606 | 6.480000 |
5641 rows × 10 columns
stationary = list()
non_stationary = list()
for i in var_list:
adfuller_test = adfuller(var_df[f'{i}'], autolag= "AIC")
print(f"# {i}")
print("ADF test statistic: {}".format(adfuller_test[0]))
print("p-value: {}".format(adfuller_test[1]))
if adfuller_test[1] < 0.05:
stationary.append(f'{i}')
elif adfuller_test[1] > 0.05:
non_stationary.append(f'{i}')
print("====================================")
# avg_price
ADF test statistic: -1.0122937952380708
p-value: 0.7487540543218493
====================================
# consumption
ADF test statistic: -0.5944797324880658
p-value: 0.8722245086279843
====================================
# CPI
ADF test statistic: -1.2840683224126532
p-value: 0.6364291925878801
====================================
# consumer_price_growth_rate
ADF test statistic: -1.6787084040856628
p-value: 0.4421145810217301
====================================
# housing_water_electricity_fuel
ADF test statistic: -2.215437061452082
p-value: 0.20069047620327513
====================================
# world_Production_per_cultivation_area
ADF test statistic: 0.00747441304398735
p-value: 0.9591458670977372
====================================
# traffic
ADF test statistic: -2.683959425055037
p-value: 0.07685535708464575
====================================
# price_US
ADF test statistic: -1.668614380273141
p-value: 0.4473514901261649
====================================
# price_Thailand
ADF test statistic: -2.704065307581357
p-value: 0.07333743133111935
====================================
# temperature
ADF test statistic: -10.486400222076213
p-value: 1.177318954026645e-18
====================================
sta_df = var_df[stationary]
nonsta_df = var_df[non_stationary]
mydata_diff = nonsta_df.diff().dropna()
mydata_diff
stationary2 = list()
non_stationary2 = list()
for i in non_stationary:
adfuller_test = adfuller(mydata_diff[f'{i}'], autolag= "AIC")
print(f"{i}")
print("ADF test statistic: {}".format(adfuller_test[0]))
print("p-value: {}".format(adfuller_test[1]))
if adfuller_test[1] < 0.05:
stationary2.append(f'{i}')
elif adfuller_test[1] > 0.05:
non_stationary2.append(f'{i}')
print("====================================")
non_stationary2
var_df_diff = pd.merge(mydata_diff,sta_df,left_index=True, right_index=True,how='left')
var_df_diff # 모두 stationary 상태
var_df_diff.index = pd.DatetimeIndex(var_df_diff.index).to_period('D');var_df_diff
avg_price
ADF test statistic: -11.556754808492185
p-value: 3.34457428212917e-21
====================================
consumption
ADF test statistic: -75.28422572299185
p-value: 0.0
====================================
CPI
ADF test statistic: -75.29911534243007
p-value: 0.0
====================================
consumer_price_growth_rate
ADF test statistic: -75.0818428819782
p-value: 0.0
====================================
housing_water_electricity_fuel
ADF test statistic: -75.08208113800826
p-value: 0.0
====================================
world_Production_per_cultivation_area
ADF test statistic: -75.16372856699427
p-value: 0.0
====================================
traffic
ADF test statistic: -75.08119618421198
p-value: 0.0
====================================
price_US
ADF test statistic: -13.671209800172662
p-value: 1.4585328433675597e-25
====================================
price_Thailand
ADF test statistic: -10.61922118726383
p-value: 5.571249597479361e-19
====================================
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
</style>
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
avg_price | consumption | CPI | consumer_price_growth_rate | housing_water_electricity_fuel | world_Production_per_cultivation_area | traffic | price_US | price_Thailand | temperature | |
---|---|---|---|---|---|---|---|---|---|---|
date | ||||||||||
2000-01-04 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.428571 |
2000-01-05 | -100.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 5.642857 |
2000-01-06 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 5.171429 |
2000-01-07 | 100.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -3.750000 |
2000-01-08 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -1.292857 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
2021-11-05 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 12.220000 |
2021-11-08 | -140.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 9.786667 |
2021-11-09 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 7.020000 |
2021-11-10 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 6.453333 |
2021-11-11 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 6.480000 |
5640 rows × 10 columns
train = var_df_diff.iloc[:-200,:]
test = var_df_diff.iloc[-200:,:]
test
forecasting_model = VAR(train)
results_aic = []
for p in range(1,10):
results = forecasting_model.fit(p)
results_aic.append(results.aic)
sns.set()
plt.plot(list(np.arange(1,10,1)), results_aic)
plt.xlabel("Order")
plt.ylabel("AIC")
plt.show()
results = forecasting_model.fit(1)
results.summary()
Summary of Regression Results
==================================
Model: VAR
Method: OLS
Date: Tue, 07, Dec, 2021
Time: 15:49:56
--------------------------------------------------------------------
No. of Equations: 10.0000 BIC: -31.2450
Nobs: 5439.00 HQIC: -31.3319
Log likelihood: 8267.84 FPE: 2.35765e-14
AIC: -31.3785 Det(Omega_mle): 2.31049e-14
--------------------------------------------------------------------
Results for equation avg_price
===========================================================================================================
coefficient std. error t-stat prob
-----------------------------------------------------------------------------------------------------------
const 5.296173 2.543815 2.082 0.037
L1.avg_price 0.099984 0.013536 7.386 0.000
L1.consumption -0.068098 15.713675 -0.004 0.997
L1.CPI 2.156312 19.401015 0.111 0.912
L1.consumer_price_growth_rate -12.375216 36.548856 -0.339 0.735
L1.housing_water_electricity_fuel 5.725233 21.721118 0.264 0.792
L1.world_Production_per_cultivation_area 31.836032 708.319986 0.045 0.964
L1.traffic -1.598371 8.366155 -0.191 0.848
L1.price_US 74.618489 408.483630 0.183 0.855
L1.price_Thailand -0.095832 0.207399 -0.462 0.644
L1.temperature -0.216247 0.155280 -1.393 0.164
===========================================================================================================
Results for equation consumption
===========================================================================================================
coefficient std. error t-stat prob
-----------------------------------------------------------------------------------------------------------
const -0.023859 0.003267 -7.302 0.000
L1.avg_price 0.000004 0.000017 0.247 0.805
L1.consumption -0.005550 0.020184 -0.275 0.783
L1.CPI 0.005663 0.024920 0.227 0.820
L1.consumer_price_growth_rate -0.005164 0.046946 -0.110 0.912
L1.housing_water_electricity_fuel -0.001133 0.027900 -0.041 0.968
L1.world_Production_per_cultivation_area 0.055809 0.909809 0.061 0.951
L1.traffic 0.000823 0.010746 0.077 0.939
L1.price_US -0.032361 0.524681 -0.062 0.951
L1.price_Thailand 0.000037 0.000266 0.140 0.889
L1.temperature 0.001230 0.000199 6.169 0.000
===========================================================================================================
Results for equation CPI
===========================================================================================================
coefficient std. error t-stat prob
-----------------------------------------------------------------------------------------------------------
const 0.022475 0.002985 7.528 0.000
L1.avg_price -0.000004 0.000016 -0.254 0.799
L1.consumption 0.005227 0.018441 0.283 0.777
L1.CPI -0.005335 0.022769 -0.234 0.815
L1.consumer_price_growth_rate 0.004862 0.042894 0.113 0.910
L1.housing_water_electricity_fuel 0.001069 0.025492 0.042 0.967
L1.world_Production_per_cultivation_area -0.052564 0.831280 -0.063 0.950
L1.traffic -0.000776 0.009818 -0.079 0.937
L1.price_US 0.030265 0.479394 0.063 0.950
L1.price_Thailand -0.000035 0.000243 -0.144 0.885
L1.temperature -0.001157 0.000182 -6.346 0.000
===========================================================================================================
Results for equation consumer_price_growth_rate
===========================================================================================================
coefficient std. error t-stat prob
-----------------------------------------------------------------------------------------------------------
const -0.001162 0.001498 -0.776 0.438
L1.avg_price 0.000000 0.000008 0.026 0.979
L1.consumption -0.000271 0.009251 -0.029 0.977
L1.CPI 0.000275 0.011422 0.024 0.981
L1.consumer_price_growth_rate -0.000254 0.021518 -0.012 0.991
L1.housing_water_electricity_fuel -0.000054 0.012788 -0.004 0.997
L1.world_Production_per_cultivation_area 0.002727 0.417014 0.007 0.995
L1.traffic 0.000040 0.004925 0.008 0.994
L1.price_US -0.001807 0.240490 -0.008 0.994
L1.price_Thailand 0.000002 0.000122 0.015 0.988
L1.temperature 0.000063 0.000091 0.685 0.493
===========================================================================================================
Results for equation housing_water_electricity_fuel
===========================================================================================================
coefficient std. error t-stat prob
-----------------------------------------------------------------------------------------------------------
const -0.001470 0.002509 -0.586 0.558
L1.avg_price 0.000000 0.000013 0.020 0.984
L1.consumption -0.000338 0.015497 -0.022 0.983
L1.CPI 0.000350 0.019134 0.018 0.985
L1.consumer_price_growth_rate -0.000309 0.036045 -0.009 0.993
L1.housing_water_electricity_fuel -0.000075 0.021422 -0.004 0.997
L1.world_Production_per_cultivation_area 0.003405 0.698556 0.005 0.996
L1.traffic 0.000052 0.008251 0.006 0.995
L1.price_US -0.001193 0.402853 -0.003 0.998
L1.price_Thailand 0.000002 0.000205 0.010 0.992
L1.temperature 0.000066 0.000153 0.434 0.665
===========================================================================================================
Results for equation world_Production_per_cultivation_area
===========================================================================================================
coefficient std. error t-stat prob
-----------------------------------------------------------------------------------------------------------
const 0.000290 0.000061 4.721 0.000
L1.avg_price -0.000000 0.000000 -0.159 0.873
L1.consumption 0.000067 0.000379 0.178 0.859
L1.CPI -0.000069 0.000468 -0.147 0.883
L1.consumer_price_growth_rate 0.000063 0.000881 0.071 0.943
L1.housing_water_electricity_fuel 0.000014 0.000524 0.026 0.979
L1.world_Production_per_cultivation_area -0.000678 0.017077 -0.040 0.968
L1.traffic -0.000010 0.000202 -0.049 0.961
L1.price_US 0.000399 0.009848 0.041 0.968
L1.price_Thailand -0.000000 0.000005 -0.091 0.928
L1.temperature -0.000015 0.000004 -4.009 0.000
===========================================================================================================
Results for equation traffic
===========================================================================================================
coefficient std. error t-stat prob
-----------------------------------------------------------------------------------------------------------
const -0.003256 0.006467 -0.503 0.615
L1.avg_price 0.000001 0.000034 0.017 0.986
L1.consumption -0.000753 0.039948 -0.019 0.985
L1.CPI 0.000774 0.049322 0.016 0.987
L1.consumer_price_growth_rate -0.000695 0.092915 -0.007 0.994
L1.housing_water_electricity_fuel -0.000160 0.055220 -0.003 0.998
L1.world_Production_per_cultivation_area 0.007580 1.800706 0.004 0.997
L1.traffic 0.000114 0.021269 0.005 0.996
L1.price_US -0.003578 1.038455 -0.003 0.997
L1.price_Thailand 0.000005 0.000527 0.009 0.993
L1.temperature 0.000158 0.000395 0.400 0.689
===========================================================================================================
Results for equation price_US
===========================================================================================================
coefficient std. error t-stat prob
-----------------------------------------------------------------------------------------------------------
const 0.000003 0.000085 0.039 0.969
L1.avg_price 0.000001 0.000000 1.590 0.112
L1.consumption -0.000001 0.000527 -0.002 0.998
L1.CPI -0.000001 0.000650 -0.002 0.998
L1.consumer_price_growth_rate -0.000006 0.001225 -0.005 0.996
L1.housing_water_electricity_fuel 0.000005 0.000728 0.007 0.994
L1.world_Production_per_cultivation_area 0.000162 0.023733 0.007 0.995
L1.traffic 0.000000 0.000280 0.000 1.000
L1.price_US -0.001713 0.013687 -0.125 0.900
L1.price_Thailand 0.000000 0.000007 0.001 0.999
L1.temperature 0.000004 0.000005 0.715 0.475
===========================================================================================================
Results for equation price_Thailand
===========================================================================================================
coefficient std. error t-stat prob
-----------------------------------------------------------------------------------------------------------
const 0.314606 0.167354 1.880 0.060
L1.avg_price 0.000338 0.000891 0.380 0.704
L1.consumption 0.074504 1.033779 0.072 0.943
L1.CPI -0.074418 1.276363 -0.058 0.954
L1.consumer_price_growth_rate 0.070213 2.404494 0.029 0.977
L1.housing_water_electricity_fuel 0.014292 1.428999 0.010 0.992
L1.world_Production_per_cultivation_area -0.665197 46.599294 -0.014 0.989
L1.traffic -0.009705 0.550397 -0.018 0.986
L1.price_US 0.002093 26.873516 0.000 1.000
L1.price_Thailand -0.000610 0.013644 -0.045 0.964
L1.temperature -0.020063 0.010216 -1.964 0.050
===========================================================================================================
Results for equation temperature
===========================================================================================================
coefficient std. error t-stat prob
-----------------------------------------------------------------------------------------------------------
const 0.346130 0.049972 6.926 0.000
L1.avg_price -0.000097 0.000266 -0.365 0.715
L1.consumption 0.512356 0.308689 1.660 0.097
L1.CPI 0.048918 0.381126 0.128 0.898
L1.consumer_price_growth_rate 1.715269 0.717988 2.389 0.017
L1.housing_water_electricity_fuel -0.473688 0.426703 -1.110 0.267
L1.world_Production_per_cultivation_area -0.877840 13.914675 -0.063 0.950
L1.traffic -0.441019 0.164350 -2.683 0.007
L1.price_US -2.278680 8.024504 -0.284 0.776
L1.price_Thailand -0.002813 0.004074 -0.690 0.490
L1.temperature 0.974041 0.003050 319.315 0.000
===========================================================================================================
Correlation matrix of residuals
avg_price consumption CPI consumer_price_growth_rate housing_water_electricity_fuel world_Production_per_cultivation_area traffic price_US price_Thailand temperature
avg_price 1.000000 0.002759 -0.002847 0.000288 0.000231 -0.001781 0.000194 0.060907 0.002915 -0.014308
consumption 0.002759 1.000000 -0.699718 -0.003055 0.040840 -0.327365 0.159166 -0.016480 -0.061077 -0.009147
CPI -0.002847 -0.699718 1.000000 0.136598 0.113906 0.522821 0.074280 0.044447 0.049449 0.026096
consumer_price_growth_rate 0.000288 -0.003055 0.136598 1.000000 0.715158 -0.196918 0.681497 0.024739 -0.018145 0.021154
housing_water_electricity_fuel 0.000231 0.040840 0.113906 0.715158 1.000000 -0.101544 0.696399 0.066081 -0.061292 0.031045
world_Production_per_cultivation_area -0.001781 -0.327365 0.522821 -0.196918 -0.101544 1.000000 -0.180101 0.070655 0.030330 0.018708
traffic 0.000194 0.159166 0.074280 0.681497 0.696399 -0.180101 1.000000 0.055463 -0.056359 0.041191
price_US 0.060907 -0.016480 0.044447 0.024739 0.066081 0.070655 0.055463 1.000000 -0.035601 0.019834
price_Thailand 0.002915 -0.061077 0.049449 -0.018145 -0.061292 0.030330 -0.056359 -0.035601 1.000000 -0.003345
temperature -0.014308 -0.009147 0.026096 0.021154 0.031045 0.018708 0.041191 0.019834 -0.003345 1.000000
laaged_values = train.values[-200:]
forecast = pd.DataFrame(results.forecast(y= laaged_values, steps=200), index = test.index, columns= ['avg_price_d', 'consumption_d','CPI_d', 'consumer_price_growth_rate_d', 'housing_water_electricity_fuel_d', 'world_Production_per_cultivation_area_d', 'traffic_d', 'price_US_d', 'price_Thailand_d', 'temperature_d'])
forecast
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
</style>
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
avg_price_d | consumption_d | CPI_d | consumer_price_growth_rate_d | housing_water_electricity_fuel_d | world_Production_per_cultivation_area_d | traffic_d | price_US_d | price_Thailand_d | temperature_d | |
---|---|---|---|---|---|---|---|---|---|---|
date | ||||||||||
2021-01-21 | 24.975781 | -0.021196 | 0.019971 | -0.001028 | -0.001318 | 0.000257 | -0.002905 | 0.000153 | 0.352815 | 1.755285 |
2021-01-22 | 7.453891 | -0.021334 | 0.020102 | -0.001034 | -0.001331 | 0.000259 | -0.002928 | 0.000028 | 0.284322 | 2.042114 |
2021-01-25 | 5.637550 | -0.021053 | 0.019838 | -0.001019 | -0.001316 | 0.000255 | -0.002893 | 0.000016 | 0.272662 | 2.323619 |
2021-01-26 | 5.394489 | -0.020718 | 0.019523 | -0.001002 | -0.001298 | 0.000251 | -0.002850 | 0.000016 | 0.266451 | 2.598188 |
2021-01-27 | 5.310380 | -0.020385 | 0.019210 | -0.000985 | -0.001280 | 0.000247 | -0.002807 | 0.000017 | 0.260916 | 2.865833 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
2021-11-05 | 2.759144 | -0.007633 | 0.007224 | -0.000336 | -0.000594 | 0.000092 | -0.001171 | 0.000054 | 0.051418 | 13.105962 |
2021-11-08 | 2.758656 | -0.007630 | 0.007222 | -0.000336 | -0.000593 | 0.000092 | -0.001170 | 0.000054 | 0.051378 | 13.107922 |
2021-11-09 | 2.758181 | -0.007628 | 0.007220 | -0.000336 | -0.000593 | 0.000092 | -0.001170 | 0.000054 | 0.051338 | 13.109831 |
2021-11-10 | 2.757717 | -0.007626 | 0.007217 | -0.000335 | -0.000593 | 0.000092 | -0.001170 | 0.000054 | 0.051300 | 13.111693 |
2021-11-11 | 2.757265 | -0.007623 | 0.007215 | -0.000335 | -0.000593 | 0.000092 | -0.001169 | 0.000054 | 0.051263 | 13.113507 |
200 rows × 10 columns
forecast["avg_price_forecasted"] = var_df["avg_price"].iloc[-10-1] + forecast['avg_price_d'].cumsum()
forecast["consumption_forecasted"] = var_df["consumption"].iloc[-10-1] + forecast['consumption_d'].cumsum()
forecast["CPI_forecasted"] = var_df["CPI"].iloc[-10-1] + forecast['CPI_d'].cumsum()
forecast["world_Production_per_cultivation_area_forecasted"] = var_df["world_Production_per_cultivation_area"].iloc[-10-1] + forecast['world_Production_per_cultivation_area_d'].cumsum()
forecast
#왼쪽 4열(_d)은 차분에 대한 예측값, 오른쪽 4열(_forcasted)은 원래 시리즈에 대한 예측값
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
</style>
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
avg_price_d | consumption_d | CPI_d | consumer_price_growth_rate_d | housing_water_electricity_fuel_d | world_Production_per_cultivation_area_d | traffic_d | price_US_d | price_Thailand_d | temperature_d | avg_price_forecasted | consumption_forecasted | CPI_forecasted | world_Production_per_cultivation_area_forecasted | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
date | ||||||||||||||
2021-01-21 | 24.975781 | -0.021196 | 0.019971 | -0.001028 | -0.001318 | 0.000257 | -0.002905 | 0.000153 | 0.352815 | 1.755285 | 54644.975781 | 52.587804 | 105.419971 | 3.118432 |
2021-01-22 | 7.453891 | -0.021334 | 0.020102 | -0.001034 | -0.001331 | 0.000259 | -0.002928 | 0.000028 | 0.284322 | 2.042114 | 54652.429672 | 52.566470 | 105.440072 | 3.118691 |
2021-01-25 | 5.637550 | -0.021053 | 0.019838 | -0.001019 | -0.001316 | 0.000255 | -0.002893 | 0.000016 | 0.272662 | 2.323619 | 54658.067222 | 52.545416 | 105.459910 | 3.118946 |
2021-01-26 | 5.394489 | -0.020718 | 0.019523 | -0.001002 | -0.001298 | 0.000251 | -0.002850 | 0.000016 | 0.266451 | 2.598188 | 54663.461711 | 52.524698 | 105.479433 | 3.119197 |
2021-01-27 | 5.310380 | -0.020385 | 0.019210 | -0.000985 | -0.001280 | 0.000247 | -0.002807 | 0.000017 | 0.260916 | 2.865833 | 54668.772091 | 52.504313 | 105.498643 | 3.119445 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
2021-11-05 | 2.759144 | -0.007633 | 0.007224 | -0.000336 | -0.000594 | 0.000092 | -0.001171 | 0.000054 | 0.051418 | 13.105962 | 55290.357825 | 50.573293 | 107.323175 | 3.142715 |
2021-11-08 | 2.758656 | -0.007630 | 0.007222 | -0.000336 | -0.000593 | 0.000092 | -0.001170 | 0.000054 | 0.051378 | 13.107922 | 55293.116481 | 50.565663 | 107.330396 | 3.142807 |
2021-11-09 | 2.758181 | -0.007628 | 0.007220 | -0.000336 | -0.000593 | 0.000092 | -0.001170 | 0.000054 | 0.051338 | 13.109831 | 55295.874662 | 50.558035 | 107.337616 | 3.142898 |
2021-11-10 | 2.757717 | -0.007626 | 0.007217 | -0.000335 | -0.000593 | 0.000092 | -0.001170 | 0.000054 | 0.051300 | 13.111693 | 55298.632379 | 50.550410 | 107.344833 | 3.142990 |
2021-11-11 | 2.757265 | -0.007623 | 0.007215 | -0.000335 | -0.000593 | 0.000092 | -0.001169 | 0.000054 | 0.051263 | 13.113507 | 55301.389644 | 50.542786 | 107.352048 | 3.143081 |
200 rows × 14 columns
test =var_df.iloc[-200:,:];test
pd.set_option('mode.chained_assignment', None)
forecast[['avg_price_d']].plot()
# test[['avg_price']].plot()
<AxesSubplot:xlabel='date'>
plt.figure(figsize=(18,10))
plt.title("Compare Actual vs. Model Prediction", fontsize=15)
plt.plot(np.array(forecast['avg_price_forecasted']), label = 'predicted price by the model')
plt.plot(np.array(test['avg_price']), label = 'actual price')
plt.legend()
plt.show()
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from matplotlib.pylab import rcParams
# rcParams['figure.figsize']=20,10
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM,Dropout,Dense
from sklearn.preprocessing import MinMaxScaler
# 테스트셋 비교
df=pd.read_csv("data/rice_df2.csv")
# df = df[selected_vars]
df["date"]=pd.to_datetime(df['date'])
df.index=df['date']
data=df.sort_index(ascending=True,axis=0)
new_dataset=pd.DataFrame(index=range(0,len(df)),columns=['date','avg_price'])
for i in range(0,len(data)):
new_dataset["date"][i]=data['date'][i]
new_dataset["avg_price"][i]=data["avg_price"][i]
scaler=MinMaxScaler()
new_dataset.index=new_dataset.date
new_dataset.drop("date",axis=1,inplace=True)
final_dataset=new_dataset.values
train_data=final_dataset[0:3948,:]
valid_data=final_dataset[3948:,:]
valid_data.shape
scaler=MinMaxScaler(feature_range=(0,1))
scaled_data=scaler.fit_transform(final_dataset)
x_train_data,y_train_data=[],[]
for i in range(60,len(train_data)):
x_train_data.append(scaled_data[i-60:i,0])
y_train_data.append(scaled_data[i,0])
x_train_data,y_train_data=np.array(x_train_data),np.array(y_train_data)
x_train_data=np.reshape(x_train_data,(x_train_data.shape[0],x_train_data.shape[1],1))
lstm_model=Sequential()
lstm_model.add(LSTM(units=50,return_sequences=True,input_shape=(x_train_data.shape[1],1)))
lstm_model.add(LSTM(units=50))
lstm_model.add(Dense(1))
inputs_data=new_dataset[len(new_dataset)-len(valid_data)-60:].values
inputs_data=inputs_data.reshape(-1,1)
inputs_data=scaler.transform(inputs_data)
lstm_model.compile(loss='mean_squared_error',optimizer='adam')
lstm_model.fit(x_train_data,y_train_data,epochs=1,batch_size=1,verbose=2)
X_test=[]
for i in range(60,inputs_data.shape[0]):
X_test.append(inputs_data[i-60:i,0])
X_test=np.array(X_test)
X_test=np.reshape(X_test,(X_test.shape[0],X_test.shape[1],1))
predicted_closing_price=lstm_model.predict(X_test)
predicted_closing_price=scaler.inverse_transform(predicted_closing_price)
lstm_model.save("saved_model.h5")
train_data=new_dataset[:3948]
valid_data=new_dataset[3948:]
valid_data['Predictions']=predicted_closing_price
3888/3888 - 136s - loss: 2.9229e-04
<ipython-input-27-54f4bed43d96>:60: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
valid_data['Predictions']=predicted_closing_price
plt.figure(figsize=(6,6))
plt.scatter(valid_data['avg_price'], valid_data['Predictions'], alpha=0.4)
plt.xlabel("Actual")
plt.ylabel("Predicted")
plt.title("LSTM:: Compare Actual vs. Model Prediction")
plt.axline([30000,30000],[60000,60000], color='black', linewidth=1,linestyle='--',alpha=1)
plt.xlim([30000, 60000])
plt.ylim([30000, 60000])
plt.show()
# x축은 실제 가격, y축은 예측가격이다. 만약 정답을 맞춘다면 정확히 선으로 일치되어 나올 거다.
# 테스트셋만 비교
plt.figure(figsize=(12,6))
plt.title("LSTM:: Price Prediction")
plt.plot(train_data["avg_price"], 'b', label = 'actual price',lw=3)
plt.plot(valid_data['avg_price'], 'b',lw=3)
plt.plot(valid_data['Predictions'],'r', label = 'test Predictions',lw=3)
plt.legend()
plt.grid(alpha=0.3)
plt.show()
LSTM_Residual = np.array(valid_data['avg_price'] - valid_data['Predictions'])
# 검증데이터에 대한 예측
# pred_y2 = fitted_full_model.predict(pd.DataFrame(scaler.transform(X),columns=X.columns,index=X.index))
# 예측데이터 잔차
plt.figure(figsize=(12,6))
plt.title("LSTM:: Prediction error 예측오차(aka. Residual)")
plt.plot(LSTM_Residual,'r')
plt.ylim([-6000, 6000])
plt.legend()
plt.axhline(y=0, color='black', linewidth=1,linestyle='--',alpha=0.8)
plt.show()
No handles with labels found to put in legend.
MAE = mean_absolute_error(valid_data['avg_price'], valid_data['Predictions'])
RMSE = np.sqrt(mean_squared_error(valid_data['avg_price'], valid_data['Predictions']))
# MSLE = mean_squared_log_error(test_label,pred_test)
# RMSLE = np.sqrt(mean_squared_log_error(test_label,pred_test))
R2 = r2_score(valid_data['avg_price'], valid_data['Predictions'])
pd.options.display.float_format = '{:.5f}'.format
Result = pd.DataFrame(data = [MAE,RMSE,R2],
index=['MAE','RMSE','R2'],
columns=['Results']);Result
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
</style>
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
Results | |
---|---|
MAE | 266.67171 |
RMSE | 430.40663 |
R2 | 0.99727 |
Regression | RandomForestRegressor | LSTM | 농촌경제연구원 | |
---|---|---|---|---|
RMSE | 1884.973479 | 215.413278 | 296.15996 | 841.664897 |
R2 | 0.875065 | 0.998984 | 0.99871 | 0.9870561 |
rd={
'OLS':list(OLS_residual2[3948:]),
'RFR':list(RFR_residual[3948:]),
'LSTM':list(LSTM_Residual)
}
residual = pd.DataFrame(rd,index=df.index[3948:])
## # 예측데이터 잔차
plt.figure(figsize=(12,6))
plt.title("Prediction error")
plt.plot(residual.OLS,'C0',label = 'Regression', linewidth=1.5)#
plt.plot(residual.LSTM,'C1',label = 'LSTM', linewidth=1.5)
plt.plot(residual.RFR,'C2' ,label = 'RandomForest', linewidth=1.5)
plt.ylim([-6000, 6000])
plt.legend()
plt.axhline(y=0, color='black', linewidth=1,linestyle='--',alpha=1)
plt.show()
com = pd.read_csv('data/일반계가격.csv');com
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
</style>
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
date | actual | pred | |
---|---|---|---|
0 | 2017. 07 | 31891 | 31300 |
1 | 2017. 08 | 32556 | 31300 |
2 | 2017. 09 | 33168 | 31300 |
3 | 2018. 07 | 44126 | 44000 |
4 | 2018. 08 | 44435 | 44000 |
5 | 2018. 09 | 44618 | 44000 |
6 | 2019. 07 | 47446 | 46750 |
7 | 2019. 08 | 47157 | 46750 |
8 | 2019. 09 | 46834 | 46750 |
9 | 2020. 07 | 47846 | 47750 |
10 | 2020. 08 | 47869 | 47750 |
11 | 2020. 09 | 48143 | 47750 |
12 | 2021. 07 | 55850 | 56100 |
13 | 2021. 08 | 55630 | 56100 |
14 | 2021. 09 | 54228 | 56100 |
r2_score(com['actual'],com['pred'])
0.9870561181225767
np.sqrt(metrics.mean_squared_error(com['actual'], com['pred']))
841.6648976879101