Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24,145 changes: 24,145 additions & 0 deletions 파머완/4장/고객만족.ipynb

Large diffs are not rendered by default.

435 changes: 435 additions & 0 deletions 파머완/4장/신용카드.ipynb

Large diffs are not rendered by default.

321 changes: 321 additions & 0 deletions 파머완/5장/Bike Sharing Demand.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,321 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 5.9 Regression 실습 - Bike Sharing Demand\n",
"### 데이터 클렌징 및 가공"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Imports and environment setup for the Bike Sharing Demand regression exercise.\n",
"import numpy as np\n",
"import pandas as pd\n",
"import seaborn as sns\n",
"import matplotlib.pyplot as plt\n",
"%matplotlib inline\n",
"\n",
"import warnings\n",
"# NOTE(review): RuntimeWarnings are silenced globally (presumably log1p on\n",
"# negative predictions later in the notebook) -- confirm this is intentional.\n",
"warnings.filterwarnings(\"ignore\", category=RuntimeWarning)\n",
"\n",
"# Load the bike sharing training data from the notebook's directory.\n",
"bike_df = pd.read_csv('./bike_train.csv')\n",
"print(bike_df.shape)\n",
"bike_df.head(3)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Inspect dtypes and non-null counts ('datetime' is converted from string in the next cell).\n",
"bike_df.info()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Convert the string column to datetime64 with one vectorized call\n",
"# (much faster than element-wise bike_df.datetime.apply(pd.to_datetime)).\n",
"bike_df['datetime'] = pd.to_datetime(bike_df['datetime'])\n",
"\n",
"# Extract year, month, day and hour via the vectorized .dt accessor\n",
"# instead of per-row lambdas; results are identical.\n",
"bike_df['year'] = bike_df['datetime'].dt.year\n",
"bike_df['month'] = bike_df['datetime'].dt.month\n",
"bike_df['day'] = bike_df['datetime'].dt.day\n",
"bike_df['hour'] = bike_df['datetime'].dt.hour\n",
"bike_df.head(3)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Drop the raw datetime plus 'casual' and 'registered'.\n",
"# NOTE(review): presumably dropped because they directly compose the target -- confirm.\n",
"# Reassignment instead of inplace=True: same result, safer on re-run and chainable.\n",
"drop_columns = ['datetime','casual','registered']\n",
"bike_df = bike_df.drop(drop_columns, axis=1)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# One bar chart per categorical feature: 'count' aggregated per category value\n",
"# (seaborn barplot's default estimator is the mean, drawn with a confidence band).\n",
"fig, axs = plt.subplots(figsize=(16, 8), ncols=4, nrows=2)\n",
"cat_features = ['year', 'month','season','weather','day', 'hour', 'holiday','workingday']\n",
"for i, feature in enumerate(cat_features):\n",
"    # Map the flat feature index onto the 2x4 axes grid.\n",
"    row, col = divmod(i, 4)\n",
"    sns.barplot(x=feature, y='count', data=bike_df, ax=axs[row][col])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.metrics import mean_squared_error, mean_absolute_error\n",
"\n",
"# RMSLE uses log1p() rather than log() so zero targets do not produce -inf/NaN.\n",
"def rmsle(y, pred):\n",
"    \"\"\"Root Mean Squared Log Error between actuals y and predictions pred.\"\"\"\n",
"    log_diff = np.log1p(y) - np.log1p(pred)\n",
"    return np.sqrt(np.mean(log_diff ** 2))\n",
"\n",
"def rmse(y,pred):\n",
"    \"\"\"Root Mean Squared Error via scikit-learn's mean_squared_error().\"\"\"\n",
"    return np.sqrt(mean_squared_error(y,pred))\n",
"\n",
"def evaluate_regr(y,pred):\n",
"    \"\"\"Print RMSLE, RMSE and MAE for the given actuals and predictions.\"\"\"\n",
"    rmsle_val = rmsle(y,pred)\n",
"    rmse_val = rmse(y,pred)\n",
"    mae_val = mean_absolute_error(y,pred)\n",
"    print('RMSLE: {0:.3f}, RMSE: {1:.3F}, MAE: {2:.3F}'.format(rmsle_val, rmse_val, mae_val))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 로그 변환, 피처 인코딩, 모델 학습/예측/평가 "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.model_selection import train_test_split , GridSearchCV\n",
"from sklearn.linear_model import LinearRegression , Ridge , Lasso\n",
"\n",
"# Separate the target ('count') from the feature columns.\n",
"y_target = bike_df['count']\n",
"X_features = bike_df.drop(['count'],axis=1,inplace=False)\n",
"\n",
"# Hold out 30% of the rows for evaluation; fixed seed for reproducibility.\n",
"X_train, X_test, y_train, y_test = train_test_split(X_features, y_target, test_size=0.3, random_state=0)\n",
"\n",
"# Baseline: plain linear regression on the raw (un-logged) target.\n",
"lr_reg = LinearRegression()\n",
"lr_reg.fit(X_train, y_train)\n",
"pred = lr_reg.predict(X_test)\n",
"\n",
"evaluate_regr(y_test ,pred)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def get_top_error_data(y_test, pred, n_tops = 5):\n",
"    \"\"\"Print the n_tops test rows with the largest absolute prediction error.\"\"\"\n",
"    comparison = pd.DataFrame({'real_count': y_test.values})\n",
"    comparison['predicted_count'] = np.round(pred)\n",
"    comparison['diff'] = (comparison['real_count'] - comparison['predicted_count']).abs()\n",
"    # Largest errors first.\n",
"    print(comparison.sort_values('diff', ascending=False).head(n_tops))\n",
"\n",
"get_top_error_data(y_test,pred,n_tops=5)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Check the raw target distribution (skew here motivates the log transform below).\n",
"y_target.hist()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Distribution of the log1p-transformed target, for comparison with the raw one.\n",
"y_log_transform = np.log1p(y_target)\n",
"y_log_transform.hist()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Log-transform the target column 'count' with log1p.\n",
"y_target_log = np.log1p(y_target)\n",
"\n",
"# Re-split train/test using the log-transformed target.\n",
"X_train, X_test, y_train, y_test = train_test_split(X_features, y_target_log, test_size=0.3, random_state=0)\n",
"lr_reg = LinearRegression()\n",
"lr_reg.fit(X_train, y_train)\n",
"pred = lr_reg.predict(X_test)\n",
"\n",
"# The test targets were log-transformed, so invert with expm1 back to the original scale.\n",
"y_test_exp = np.expm1(y_test)\n",
"\n",
"# Predictions were made on the log scale as well, so invert with expm1 too.\n",
"pred_exp = np.expm1(pred)\n",
"\n",
"evaluate_regr(y_test_exp ,pred_exp)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"# Linear-regression coefficients sorted descending, one bar per feature.\n",
"coef = pd.Series(lr_reg.coef_, index=X_features.columns)\n",
"coef_sort = coef.sort_values(ascending=False)\n",
"sns.barplot(x=coef_sort.values, y=coef_sort.index)\n",
"# NOTE(review): writes a figure artifact into the working directory -- confirm this export is still wanted.\n",
"plt.savefig('log_transform.tif', format='tif', dpi=300, bbox_inches='tight')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# One-hot encode the categorical features ('year', 'month', 'day', 'hour', etc.)\n",
"# so linear models do not treat their integer codes as ordinal magnitudes.\n",
"X_features_ohe = pd.get_dummies(X_features, columns=['year', 'month','day', 'hour', 'holiday',\n",
"                                                     'workingday','season','weather'])\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"# Train/test split on the one-hot encoded feature set.\n",
"X_train, X_test, y_train, y_test = train_test_split(X_features_ohe, y_target_log,\n",
"                                                    test_size=0.3, random_state=0)\n",
"\n",
"def get_model_predict(model, X_train, X_test, y_train, y_test, is_expm1=False):\n",
"    \"\"\"Fit the model, predict on the test split and print RMSLE/RMSE/MAE.\n",
"\n",
"    When is_expm1 is True the targets and predictions are assumed to be on\n",
"    the log1p scale and are mapped back with expm1 before scoring.\n",
"    \"\"\"\n",
"    model.fit(X_train, y_train)\n",
"    preds = model.predict(X_test)\n",
"    if is_expm1:\n",
"        y_test = np.expm1(y_test)\n",
"        preds = np.expm1(preds)\n",
"    print('###',model.__class__.__name__,'###')\n",
"    evaluate_regr(y_test, preds)\n",
"\n",
"# Evaluate each linear model on the same split and settings.\n",
"lr_reg = LinearRegression()\n",
"ridge_reg = Ridge(alpha=10)\n",
"lasso_reg = Lasso(alpha=0.01)\n",
"\n",
"for model in [lr_reg, ridge_reg, lasso_reg]:\n",
"    get_model_predict(model,X_train, X_test, y_train, y_test,is_expm1=True)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"# Top 20 linear-regression coefficients on the one-hot encoded features.\n",
"# NOTE(review): relies on lr_reg having been refit inside the previous cell's\n",
"# get_model_predict loop -- confirm cell execution order before interpreting.\n",
"coef = pd.Series(lr_reg.coef_ , index=X_features_ohe.columns)\n",
"coef_sort = coef.sort_values(ascending=False)[:20]\n",
"sns.barplot(x=coef_sort.values , y=coef_sort.index)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor\n",
"from xgboost import XGBRegressor\n",
"from lightgbm import LGBMRegressor\n",
"\n",
"# Evaluate tree ensembles: Random Forest, GBM, XGBoost and LightGBM.\n",
"rf_reg = RandomForestRegressor(n_estimators=500)\n",
"gbm_reg = GradientBoostingRegressor(n_estimators=500)\n",
"xgb_reg = XGBRegressor(n_estimators=500)\n",
"lgbm_reg = LGBMRegressor(n_estimators=500)\n",
"\n",
"for model in [rf_reg, gbm_reg, xgb_reg, lgbm_reg]:\n",
"    # Some XGBoost versions error on DataFrame input, so pass plain ndarrays.\n",
"    get_model_predict(model,X_train.values, X_test.values, y_train.values, y_test.values,is_expm1=True)"
]
},
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.7"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Loading