diff --git a/feature_engineering/feature_engineering.ipynb b/feature_engineering/feature_engineering.ipynb
index 77b5929c..17c5e370 100644
--- a/feature_engineering/feature_engineering.ipynb
+++ b/feature_engineering/feature_engineering.ipynb
@@ -51,7 +51,7 @@
},
{
"cell_type": "code",
- "execution_count": 2,
+ "execution_count": 57,
"metadata": {},
"outputs": [
{
@@ -155,7 +155,7 @@
"5 3650.0 Male "
]
},
- "execution_count": 2,
+ "execution_count": 57,
"metadata": {},
"output_type": "execute_result"
}
@@ -179,7 +179,7 @@
},
{
"cell_type": "code",
- "execution_count": 3,
+ "execution_count": 58,
"metadata": {},
"outputs": [],
"source": [
@@ -219,7 +219,7 @@
},
{
"cell_type": "code",
- "execution_count": 4,
+ "execution_count": 59,
"metadata": {},
"outputs": [
{
@@ -230,7 +230,7 @@
" [1.47749591e-03]])"
]
},
- "execution_count": 4,
+ "execution_count": 59,
"metadata": {},
"output_type": "execute_result"
}
@@ -252,7 +252,7 @@
},
{
"cell_type": "code",
- "execution_count": 5,
+ "execution_count": 60,
"metadata": {},
"outputs": [
{
@@ -313,7 +313,7 @@
"4 18.263268"
]
},
- "execution_count": 5,
+ "execution_count": 60,
"metadata": {},
"output_type": "execute_result"
}
@@ -381,7 +381,7 @@
},
{
"cell_type": "code",
- "execution_count": 17,
+ "execution_count": 61,
"metadata": {},
"outputs": [],
"source": [
@@ -405,19 +405,19 @@
},
{
"cell_type": "code",
- "execution_count": 8,
+ "execution_count": 62,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
- "
LinearRegression()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org. "
+ "LinearRegression()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org. "
],
"text/plain": [
"LinearRegression()"
]
},
- "execution_count": 8,
+ "execution_count": 62,
"metadata": {},
"output_type": "execute_result"
}
@@ -448,7 +448,7 @@
},
{
"cell_type": "code",
- "execution_count": 9,
+ "execution_count": 63,
"metadata": {},
"outputs": [
{
@@ -457,7 +457,7 @@
"7.297305899612306"
]
},
- "execution_count": 9,
+ "execution_count": 63,
"metadata": {},
"output_type": "execute_result"
}
@@ -469,7 +469,7 @@
},
{
"cell_type": "code",
- "execution_count": 10,
+ "execution_count": 64,
"metadata": {},
"outputs": [
{
@@ -478,7 +478,7 @@
"array([0.05812622])"
]
},
- "execution_count": 10,
+ "execution_count": 64,
"metadata": {},
"output_type": "execute_result"
}
@@ -501,9 +501,17 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 65,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "The RMSE of the model is 1.1549363099239012\n"
+ ]
+ }
+ ],
"source": [
"y_hat_one_feature = my_model.predict(penguins[[\"flipper_length_mm\"]])\n",
"\n",
@@ -523,7 +531,7 @@
},
{
"cell_type": "code",
- "execution_count": 18,
+ "execution_count": 66,
"metadata": {},
"outputs": [
{
@@ -559,9 +567,78 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 67,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Y_hat from OLS | \n",
+ " Y_hat from sklearn | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 18.322561 | \n",
+ " 18.322561 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 18.445578 | \n",
+ " 18.445578 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 17.721412 | \n",
+ " 17.721412 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 17.997254 | \n",
+ " 17.997254 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 18.263268 | \n",
+ " 18.263268 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Y_hat from OLS Y_hat from sklearn\n",
+ "0 18.322561 18.322561\n",
+ "1 18.445578 18.445578\n",
+ "2 17.721412 17.721412\n",
+ "3 17.997254 17.997254\n",
+ "4 18.263268 18.263268"
+ ]
+ },
+ "execution_count": 67,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"#| code-fold: true\n",
"pd.DataFrame({\"Y_hat from OLS\":np.squeeze(y_hat), \"Y_hat from sklearn\":y_hat_two_features}).head()"
@@ -616,7 +693,7 @@
},
{
"cell_type": "code",
- "execution_count": 12,
+ "execution_count": 83,
"metadata": {},
"outputs": [
{
@@ -651,69 +728,69 @@
" \n",
" \n",
" \n",
- " 54 | \n",
- " 25.56 | \n",
- " 4.34 | \n",
- " Male | \n",
+ " 0 | \n",
+ " 16.99 | \n",
+ " 1.01 | \n",
+ " Female | \n",
" No | \n",
" Sun | \n",
" Dinner | \n",
- " 4 | \n",
+ " 2 | \n",
"
\n",
" \n",
- " 46 | \n",
- " 22.23 | \n",
- " 5.00 | \n",
+ " 1 | \n",
+ " 10.34 | \n",
+ " 1.66 | \n",
" Male | \n",
" No | \n",
" Sun | \n",
" Dinner | \n",
- " 2 | \n",
+ " 3 | \n",
"
\n",
" \n",
- " 86 | \n",
- " 13.03 | \n",
- " 2.00 | \n",
+ " 2 | \n",
+ " 21.01 | \n",
+ " 3.50 | \n",
" Male | \n",
" No | \n",
- " Thur | \n",
- " Lunch | \n",
- " 2 | \n",
+ " Sun | \n",
+ " Dinner | \n",
+ " 3 | \n",
"
\n",
" \n",
- " 199 | \n",
- " 13.51 | \n",
- " 2.00 | \n",
+ " 3 | \n",
+ " 23.68 | \n",
+ " 3.31 | \n",
" Male | \n",
- " Yes | \n",
- " Thur | \n",
- " Lunch | \n",
+ " No | \n",
+ " Sun | \n",
+ " Dinner | \n",
" 2 | \n",
"
\n",
" \n",
- " 106 | \n",
- " 20.49 | \n",
- " 4.06 | \n",
- " Male | \n",
- " Yes | \n",
- " Sat | \n",
+ " 4 | \n",
+ " 24.59 | \n",
+ " 3.61 | \n",
+ " Female | \n",
+ " No | \n",
+ " Sun | \n",
" Dinner | \n",
- " 2 | \n",
+ " 4 | \n",
"
\n",
" \n",
"\n",
""
],
"text/plain": [
- " total_bill tip sex smoker day time size\n",
- "54 25.56 4.34 Male No Sun Dinner 4\n",
- "46 22.23 5.00 Male No Sun Dinner 2\n",
- "86 13.03 2.00 Male No Thur Lunch 2\n",
- "199 13.51 2.00 Male Yes Thur Lunch 2\n",
- "106 20.49 4.06 Male Yes Sat Dinner 2"
+ " total_bill tip sex smoker day time size\n",
+ "0 16.99 1.01 Female No Sun Dinner 2\n",
+ "1 10.34 1.66 Male No Sun Dinner 3\n",
+ "2 21.01 3.50 Male No Sun Dinner 3\n",
+ "3 23.68 3.31 Male No Sun Dinner 2\n",
+ "4 24.59 3.61 Female No Sun Dinner 4"
]
},
- "execution_count": 12,
+ "execution_count": 83,
"metadata": {},
"output_type": "execute_result"
}
@@ -721,9 +798,8 @@
"source": [
"#| code-fold: true\n",
"import numpy as np\n",
- "np.random.seed(1337)\n",
- "tips = sns.load_dataset(\"tips\").sample(100)\n",
- "tips.head(5)"
+ "tips = sns.load_dataset(\"tips\")\n",
+ "tips.head()"
]
},
{
@@ -736,22 +812,14 @@
"\n",
"\n",
"\n",
- "The one-hot encoded features can then be used in the design matrix to train a model:\n",
- "\n",
- "\n",
- "\n",
- "$$\\hat{y} = \\theta_1 (\\text{total}\\textunderscore\\text{bill}) + \\theta_2 (\\text{size}) + \\theta_3 (\\text{day}\\textunderscore\\text{Fri}) + \\theta_4 (\\text{day}\\textunderscore\\text{Sat}) + \\theta_5 (\\text{day}\\textunderscore\\text{Sun}) + \\theta_6 (\\text{day}\\textunderscore\\text{Thur})$$\n",
+ "
\n",
"\n",
- "Or in shorthand:\n",
- "\n",
- "$$\\hat{y} = \\theta_1\\phi_1 + \\theta_2\\phi_2 + \\theta_3\\phi_3 + \\theta_4\\phi_4 + \\theta_5\\phi_5 + \\theta_6\\phi_6$$\n",
- "\n",
- "The `OneHotEncoder` class of `sklearn` ([documentation](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html#sklearn.preprocessing.OneHotEncoder.get_feature_names_out)) offers a quick way to perform one-hot encoding. You will explore its use in detail in the lab. For now, recognize that we follow a very similar workflow to when we were working with the `LinearRegression` class: we initialize a `OneHotEncoder` object, fit it to our data, then use `.transform` to apply the fitted encoder."
+ "The `OneHotEncoder` class of `sklearn` ([documentation](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html#sklearn.preprocessing.OneHotEncoder.get_feature_names_out)) offers a quick way to perform this one-hot encoding. You will explore its use in detail in the lab. For now, recognize that we follow a very similar workflow to when we were working with the `LinearRegression` class: we initialize a `OneHotEncoder` object, fit it to our data, then use `.transform` to apply the fitted encoder."
]
},
{
"cell_type": "code",
- "execution_count": 13,
+ "execution_count": 84,
"metadata": {},
"outputs": [
{
@@ -800,21 +868,21 @@
" 2 | \n",
" 0.0 | \n",
" 0.0 | \n",
- " 0.0 | \n",
" 1.0 | \n",
+ " 0.0 | \n",
" \n",
" \n",
" 3 | \n",
" 0.0 | \n",
" 0.0 | \n",
- " 0.0 | \n",
" 1.0 | \n",
+ " 0.0 | \n",
"
\n",
" \n",
" 4 | \n",
" 0.0 | \n",
- " 1.0 | \n",
" 0.0 | \n",
+ " 1.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
@@ -825,12 +893,12 @@
" day_Fri day_Sat day_Sun day_Thur\n",
"0 0.0 0.0 1.0 0.0\n",
"1 0.0 0.0 1.0 0.0\n",
- "2 0.0 0.0 0.0 1.0\n",
- "3 0.0 0.0 0.0 1.0\n",
- "4 0.0 1.0 0.0 0.0"
+ "2 0.0 0.0 1.0 0.0\n",
+ "3 0.0 0.0 1.0 0.0\n",
+ "4 0.0 0.0 1.0 0.0"
]
},
- "execution_count": 13,
+ "execution_count": 84,
"metadata": {},
"output_type": "execute_result"
}
@@ -853,11 +921,137 @@
]
},
{
+ "attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
+ "The one-hot encoded features can then be used in the design matrix to train a model:\n",
+ "\n",
+ "\n",
+ "\n",
+ "$$\\hat{y} = \\theta_1 (\\text{total}\\textunderscore\\text{bill}) + \\theta_2 (\\text{size}) + \\theta_3 (\\text{day}\\textunderscore\\text{Fri}) + \\theta_4 (\\text{day}\\textunderscore\\text{Sat}) + \\theta_5 (\\text{day}\\textunderscore\\text{Sun}) + \\theta_6 (\\text{day}\\textunderscore\\text{Thur})$$\n",
+ "\n",
+ "Or in shorthand:\n",
+ "\n",
+ "$$\\hat{y} = \\theta_1\\phi_1 + \\theta_2\\phi_2 + \\theta_3\\phi_3 + \\theta_4\\phi_4 + \\theta_5\\phi_5 + \\theta_6\\phi_6$$\n",
+ "\n",
"Now, the `\"day\"` feature (or rather, the four new boolean features that represent day) can be used to fit a model.\n",
"\n",
+ "Using `sklearn` to fit the new model, we can determine the model coefficients, allowing us to understand how each feature impacts the predicted tip."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 85,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Feature | \n",
+ " Model Coefficient | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " total_bill | \n",
+ " 0.092994 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " size | \n",
+ " 0.187132 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " day_Fri | \n",
+ " 0.745787 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " day_Sat | \n",
+ " 0.621129 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " day_Sun | \n",
+ " 0.732289 | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " day_Thur | \n",
+ " 0.668294 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Feature Model Coefficient\n",
+ "0 total_bill 0.092994\n",
+ "1 size 0.187132\n",
+ "2 day_Fri 0.745787\n",
+ "3 day_Sat 0.621129\n",
+ "4 day_Sun 0.732289\n",
+ "5 day_Thur 0.668294"
+ ]
+ },
+ "execution_count": 85,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from sklearn.linear_model import LinearRegression\n",
+ "data_w_ohe = tips[[\"total_bill\", \"size\", \"day\"]].join(encoded_day_df).drop(columns = \"day\")\n",
+ "ohe_model = lm.LinearRegression(fit_intercept=False) #Tell sklearn to not add an additional bias column. Why?\n",
+ "ohe_model.fit(data_w_ohe, tips[\"tip\"])\n",
+ "\n",
+ "pd.DataFrame({\"Feature\":data_w_ohe.columns, \"Model Coefficient\":ohe_model.coef_})"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "For example, when looking at the coefficient for \"day_Fri\", we can understand how much the fact that it is Friday impacts the predicted tip. \n",
+ "\n",
+ "When one-hot encoding, keep in mind that any set of one-hot encoded columns will always sum to a column of all ones, representing the bias column. More formally, the bias column is a linear combination of the OHE columns.\n",
+ "\n",
+ "\n",
+ "\n",
+ "We must be careful not to include this bias column in our design matrix. Otherwise, there will be linear dependence in the model, meaning $\\mathbb{X}^T\\mathbb{X}$ would no longer be invertible, and our OLS estimate $\\hat{\\theta} = (\\mathbb{X}^T\\mathbb{X})^{-1}\\mathbb{X}^T\\mathbb{Y}$ fails.\n",
+ "\n",
+ "To resolve this issue, we simply omit one of the one-hot encoded columns *or* do not include an intercept term. \n",
+ "\n",
+ "\n",
+ "\n",
+ "Either approach works — we still retain the same information as the omitted column being a linear combination of the remaining columns."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
"## Polynomial Features\n",
"\n",
"We have encountered a few cases now where models with linear features have performed poorly on datasets that show clear non-linear curvature. \n",
diff --git a/feature_engineering/images/bias.png b/feature_engineering/images/bias.png
new file mode 100644
index 00000000..e6455ca2
Binary files /dev/null and b/feature_engineering/images/bias.png differ
diff --git a/feature_engineering/images/remove.png b/feature_engineering/images/remove.png
new file mode 100644
index 00000000..bd09ddcf
Binary files /dev/null and b/feature_engineering/images/remove.png differ