Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24,145 changes: 24,145 additions & 0 deletions 파머완/4장/고객만족.ipynb

Large diffs are not rendered by default.

435 changes: 435 additions & 0 deletions 파머완/4장/신용카드.ipynb

Large diffs are not rendered by default.

321 changes: 321 additions & 0 deletions 파머완/5장/Bike Sharing Demand.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,321 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 5.9 Regression 실습 - Bike Sharing Demand\n",
"### 데이터 클렌징 및 가공"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Imports and environment setup for the Bike Sharing Demand regression exercise.\n",
"import numpy as np\n",
"import pandas as pd\n",
"import seaborn as sns\n",
"import matplotlib.pyplot as plt\n",
"%matplotlib inline\n",
"\n",
"import warnings\n",
"# NOTE(review): RuntimeWarnings are silenced globally (presumably log1p on\n",
"# negative predictions later in the notebook) -- confirm this is intentional.\n",
"warnings.filterwarnings(\"ignore\", category=RuntimeWarning)\n",
"\n",
"# Load the bike sharing training data from the notebook's directory.\n",
"bike_df = pd.read_csv('./bike_train.csv')\n",
"print(bike_df.shape)\n",
"bike_df.head(3)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Inspect dtypes and non-null counts ('datetime' is converted from string in the next cell).\n",
"bike_df.info()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Convert the string column to datetime64 with one vectorized call\n",
"# (much faster than element-wise bike_df.datetime.apply(pd.to_datetime)).\n",
"bike_df['datetime'] = pd.to_datetime(bike_df['datetime'])\n",
"\n",
"# Extract year, month, day and hour via the vectorized .dt accessor\n",
"# instead of per-row lambdas; results are identical.\n",
"bike_df['year'] = bike_df['datetime'].dt.year\n",
"bike_df['month'] = bike_df['datetime'].dt.month\n",
"bike_df['day'] = bike_df['datetime'].dt.day\n",
"bike_df['hour'] = bike_df['datetime'].dt.hour\n",
"bike_df.head(3)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Drop the raw datetime plus 'casual' and 'registered'.\n",
"# NOTE(review): presumably dropped because they directly compose the target -- confirm.\n",
"# Reassignment instead of inplace=True: same result, safer on re-run and chainable.\n",
"drop_columns = ['datetime','casual','registered']\n",
"bike_df = bike_df.drop(drop_columns, axis=1)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# One bar chart per categorical feature: 'count' aggregated per category value\n",
"# (seaborn barplot's default estimator is the mean, drawn with a confidence band).\n",
"fig, axs = plt.subplots(figsize=(16, 8), ncols=4, nrows=2)\n",
"cat_features = ['year', 'month','season','weather','day', 'hour', 'holiday','workingday']\n",
"for i, feature in enumerate(cat_features):\n",
"    # Map the flat feature index onto the 2x4 axes grid.\n",
"    row, col = divmod(i, 4)\n",
"    sns.barplot(x=feature, y='count', data=bike_df, ax=axs[row][col])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.metrics import mean_squared_error, mean_absolute_error\n",
"\n",
"# RMSLE uses log1p() rather than log() so zero targets do not produce -inf/NaN.\n",
"def rmsle(y, pred):\n",
"    \"\"\"Root Mean Squared Log Error between actuals y and predictions pred.\"\"\"\n",
"    log_diff = np.log1p(y) - np.log1p(pred)\n",
"    return np.sqrt(np.mean(log_diff ** 2))\n",
"\n",
"def rmse(y,pred):\n",
"    \"\"\"Root Mean Squared Error via scikit-learn's mean_squared_error().\"\"\"\n",
"    return np.sqrt(mean_squared_error(y,pred))\n",
"\n",
"def evaluate_regr(y,pred):\n",
"    \"\"\"Print RMSLE, RMSE and MAE for the given actuals and predictions.\"\"\"\n",
"    rmsle_val = rmsle(y,pred)\n",
"    rmse_val = rmse(y,pred)\n",
"    mae_val = mean_absolute_error(y,pred)\n",
"    print('RMSLE: {0:.3f}, RMSE: {1:.3F}, MAE: {2:.3F}'.format(rmsle_val, rmse_val, mae_val))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 로그 변환, 피처 인코딩, 모델 학습/예측/평가 "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.model_selection import train_test_split , GridSearchCV\n",
"from sklearn.linear_model import LinearRegression , Ridge , Lasso\n",
"\n",
"# Separate the target ('count') from the feature columns.\n",
"y_target = bike_df['count']\n",
"X_features = bike_df.drop(['count'],axis=1,inplace=False)\n",
"\n",
"# Hold out 30% of the rows for evaluation; fixed seed for reproducibility.\n",
"X_train, X_test, y_train, y_test = train_test_split(X_features, y_target, test_size=0.3, random_state=0)\n",
"\n",
"# Baseline: plain linear regression on the raw (un-logged) target.\n",
"lr_reg = LinearRegression()\n",
"lr_reg.fit(X_train, y_train)\n",
"pred = lr_reg.predict(X_test)\n",
"\n",
"evaluate_regr(y_test ,pred)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def get_top_error_data(y_test, pred, n_tops = 5):\n",
"    \"\"\"Print the n_tops test rows with the largest absolute prediction error.\"\"\"\n",
"    comparison = pd.DataFrame({'real_count': y_test.values})\n",
"    comparison['predicted_count'] = np.round(pred)\n",
"    comparison['diff'] = (comparison['real_count'] - comparison['predicted_count']).abs()\n",
"    # Largest errors first.\n",
"    print(comparison.sort_values('diff', ascending=False).head(n_tops))\n",
"\n",
"get_top_error_data(y_test,pred,n_tops=5)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Check the raw target distribution (skew here motivates the log transform below).\n",
"y_target.hist()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Distribution of the log1p-transformed target, for comparison with the raw one.\n",
"y_log_transform = np.log1p(y_target)\n",
"y_log_transform.hist()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Log-transform the target column 'count' with log1p.\n",
"y_target_log = np.log1p(y_target)\n",
"\n",
"# Re-split train/test using the log-transformed target.\n",
"X_train, X_test, y_train, y_test = train_test_split(X_features, y_target_log, test_size=0.3, random_state=0)\n",
"lr_reg = LinearRegression()\n",
"lr_reg.fit(X_train, y_train)\n",
"pred = lr_reg.predict(X_test)\n",
"\n",
"# The test targets were log-transformed, so invert with expm1 back to the original scale.\n",
"y_test_exp = np.expm1(y_test)\n",
"\n",
"# Predictions were made on the log scale as well, so invert with expm1 too.\n",
"pred_exp = np.expm1(pred)\n",
"\n",
"evaluate_regr(y_test_exp ,pred_exp)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"# Linear-regression coefficients sorted descending, one bar per feature.\n",
"coef = pd.Series(lr_reg.coef_, index=X_features.columns)\n",
"coef_sort = coef.sort_values(ascending=False)\n",
"sns.barplot(x=coef_sort.values, y=coef_sort.index)\n",
"# NOTE(review): writes a figure artifact into the working directory -- confirm this export is still wanted.\n",
"plt.savefig('log_transform.tif', format='tif', dpi=300, bbox_inches='tight')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# One-hot encode the categorical features ('year', 'month', 'day', 'hour', etc.)\n",
"# so linear models do not treat their integer codes as ordinal magnitudes.\n",
"X_features_ohe = pd.get_dummies(X_features, columns=['year', 'month','day', 'hour', 'holiday',\n",
"                                                     'workingday','season','weather'])\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"# Train/test split on the one-hot encoded feature set.\n",
"X_train, X_test, y_train, y_test = train_test_split(X_features_ohe, y_target_log,\n",
"                                                    test_size=0.3, random_state=0)\n",
"\n",
"def get_model_predict(model, X_train, X_test, y_train, y_test, is_expm1=False):\n",
"    \"\"\"Fit the model, predict on the test split and print RMSLE/RMSE/MAE.\n",
"\n",
"    When is_expm1 is True the targets and predictions are assumed to be on\n",
"    the log1p scale and are mapped back with expm1 before scoring.\n",
"    \"\"\"\n",
"    model.fit(X_train, y_train)\n",
"    preds = model.predict(X_test)\n",
"    if is_expm1:\n",
"        y_test = np.expm1(y_test)\n",
"        preds = np.expm1(preds)\n",
"    print('###',model.__class__.__name__,'###')\n",
"    evaluate_regr(y_test, preds)\n",
"\n",
"# Evaluate each linear model on the same split and settings.\n",
"lr_reg = LinearRegression()\n",
"ridge_reg = Ridge(alpha=10)\n",
"lasso_reg = Lasso(alpha=0.01)\n",
"\n",
"for model in [lr_reg, ridge_reg, lasso_reg]:\n",
"    get_model_predict(model,X_train, X_test, y_train, y_test,is_expm1=True)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"# Top 20 linear-regression coefficients on the one-hot encoded features.\n",
"# NOTE(review): relies on lr_reg having been refit inside the previous cell's\n",
"# get_model_predict loop -- confirm cell execution order before interpreting.\n",
"coef = pd.Series(lr_reg.coef_ , index=X_features_ohe.columns)\n",
"coef_sort = coef.sort_values(ascending=False)[:20]\n",
"sns.barplot(x=coef_sort.values , y=coef_sort.index)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor\n",
"from xgboost import XGBRegressor\n",
"from lightgbm import LGBMRegressor\n",
"\n",
"# Evaluate tree ensembles: Random Forest, GBM, XGBoost and LightGBM.\n",
"rf_reg = RandomForestRegressor(n_estimators=500)\n",
"gbm_reg = GradientBoostingRegressor(n_estimators=500)\n",
"xgb_reg = XGBRegressor(n_estimators=500)\n",
"lgbm_reg = LGBMRegressor(n_estimators=500)\n",
"\n",
"for model in [rf_reg, gbm_reg, xgb_reg, lgbm_reg]:\n",
"    # Some XGBoost versions error on DataFrame input, so pass plain ndarrays.\n",
"    get_model_predict(model,X_train.values, X_test.values, y_train.values, y_test.values,is_expm1=True)"
]
},
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.7"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Loading